author:    dim <dim@FreeBSD.org>  2015-01-18 16:17:27 +0000
committer: dim <dim@FreeBSD.org>  2015-01-18 16:17:27 +0000
commit:    081af4da16b9046c019ca40f64b1fb7ee8c6dca1
tree:      4abb9cbeecc7901726dd0b4a37369596c852e9ef
parent:    3c7e7a1538a873b0d3b012ef8811969ac4552c2a
Vendor import of llvm RELEASE_360/rc1 tag r226102 (effectively, 3.6.0 RC1):
https://llvm.org/svn/llvm-project/llvm/tags/RELEASE_360/rc1@226102
Diffstat (limited to 'lib/Target/AArch64')
-rw-r--r--  lib/Target/AArch64/AArch64.h | 11
-rw-r--r--  lib/Target/AArch64/AArch64A53Fix835769.cpp | 240
-rw-r--r--  lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp | 706
-rw-r--r--  lib/Target/AArch64/AArch64AddressTypePromotion.cpp | 6
-rw-r--r--  lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp | 12
-rw-r--r--  lib/Target/AArch64/AArch64AsmPrinter.cpp | 33
-rw-r--r--  lib/Target/AArch64/AArch64BranchRelaxation.cpp | 11
-rw-r--r--  lib/Target/AArch64/AArch64CallingConvention.h | 141
-rw-r--r--  lib/Target/AArch64/AArch64CallingConvention.td | 35
-rw-r--r--  lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp | 4
-rw-r--r--  lib/Target/AArch64/AArch64CollectLOH.cpp | 52
-rw-r--r--  lib/Target/AArch64/AArch64ConditionOptimizer.cpp | 422
-rw-r--r--  lib/Target/AArch64/AArch64ConditionalCompares.cpp | 12
-rw-r--r--  lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 9
-rw-r--r--  lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 3
-rw-r--r--  lib/Target/AArch64/AArch64FastISel.cpp | 4510
-rw-r--r--  lib/Target/AArch64/AArch64FrameLowering.cpp | 57
-rw-r--r--  lib/Target/AArch64/AArch64FrameLowering.h | 4
-rw-r--r--  lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 272
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.cpp | 1319
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.h | 42
-rw-r--r--  lib/Target/AArch64/AArch64InstrAtomics.td | 9
-rw-r--r--  lib/Target/AArch64/AArch64InstrFormats.td | 44
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.cpp | 997
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.h | 42
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.td | 416
-rw-r--r--  lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 16
-rw-r--r--  lib/Target/AArch64/AArch64MCInstLower.cpp | 3
-rw-r--r--  lib/Target/AArch64/AArch64MCInstLower.h | 6
-rw-r--r--  lib/Target/AArch64/AArch64MachineCombinerPattern.h | 42
-rw-r--r--  lib/Target/AArch64/AArch64MachineFunctionInfo.h | 6
-rw-r--r--  lib/Target/AArch64/AArch64PBQPRegAlloc.cpp | 383
-rw-r--r--  lib/Target/AArch64/AArch64PBQPRegAlloc.h | 38
-rw-r--r--  lib/Target/AArch64/AArch64PerfectShuffle.h | 5
-rw-r--r--  lib/Target/AArch64/AArch64PromoteConstant.cpp | 8
-rw-r--r--  lib/Target/AArch64/AArch64RegisterInfo.cpp | 12
-rw-r--r--  lib/Target/AArch64/AArch64RegisterInfo.h | 6
-rw-r--r--  lib/Target/AArch64/AArch64RegisterInfo.td | 5
-rw-r--r--  lib/Target/AArch64/AArch64SchedA57.td | 371
-rw-r--r--  lib/Target/AArch64/AArch64SchedA57WriteRes.td | 52
-rw-r--r--  lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 3
-rw-r--r--  lib/Target/AArch64/AArch64SelectionDAGInfo.h | 4
-rw-r--r--  lib/Target/AArch64/AArch64StorePairSuppress.cpp | 14
-rw-r--r--  lib/Target/AArch64/AArch64Subtarget.cpp | 34
-rw-r--r--  lib/Target/AArch64/AArch64Subtarget.h | 40
-rw-r--r--  lib/Target/AArch64/AArch64TargetMachine.cpp | 133
-rw-r--r--  lib/Target/AArch64/AArch64TargetMachine.h | 34
-rw-r--r--  lib/Target/AArch64/AArch64TargetObjectFile.h | 4
-rw-r--r--  lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 78
-rw-r--r--  lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 149
-rw-r--r--  lib/Target/AArch64/CMakeLists.txt | 6
-rw-r--r--  lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp | 67
-rw-r--r--  lib/Target/AArch64/Disassembler/AArch64Disassembler.h | 11
-rw-r--r--  lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h | 4
-rw-r--r--  lib/Target/AArch64/Disassembler/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp | 18
-rw-r--r--  lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h | 9
-rw-r--r--  lib/Target/AArch64/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h | 104
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 53
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp | 2
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 74
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h | 8
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h | 4
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 7
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h | 4
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp | 5
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 5
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h | 7
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 47
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h | 15
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp | 85
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp | 2
-rw-r--r--  lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp | 13
-rw-r--r--  lib/Target/AArch64/Utils/AArch64BaseInfo.cpp | 36
-rw-r--r--  lib/Target/AArch64/Utils/AArch64BaseInfo.h | 13
76 files changed, 9622 insertions, 1836 deletions
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 1c022aa..e96d18b 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -12,13 +12,13 @@
//
//===----------------------------------------------------------------------===//
-#ifndef TARGET_AArch64_H
-#define TARGET_AArch64_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64_H
-#include "Utils/AArch64BaseInfo.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "llvm/Target/TargetMachine.h"
+#include "Utils/AArch64BaseInfo.h"
#include "llvm/Support/DataTypes.h"
+#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -36,7 +36,10 @@ FunctionPass *createAArch64StorePairSuppressPass();
FunctionPass *createAArch64ExpandPseudoPass();
FunctionPass *createAArch64LoadStoreOptimizationPass();
ModulePass *createAArch64PromoteConstantPass();
+FunctionPass *createAArch64ConditionOptimizerPass();
FunctionPass *createAArch64AddressTypePromotionPass();
+FunctionPass *createAArch64A57FPLoadBalancing();
+FunctionPass *createAArch64A53Fix835769();
/// \brief Creates an ARM-specific Target Transformation Info pass.
ImmutablePass *
createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM);
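For orientation: the factory functions declared above are consumed by the target's pass configuration when the codegen pipeline is built. A minimal sketch of that wiring, assuming the usual TargetPassConfig hooks of this era; the hook placement and the EnableA53Fix835769 option guard are assumptions, not verbatim 3.6 code:

    // Sketch: inside AArch64TargetMachine.cpp (illustrative, hedged).
    bool AArch64PassConfig::addPostRegAlloc() {
      // The A57 balancer rewrites physical registers, so it must run
      // after register allocation.
      if (TM->getOptLevel() != CodeGenOpt::None)
        addPass(createAArch64A57FPLoadBalancing());
      return true;
    }

    bool AArch64PassConfig::addPreEmitPass() {
      if (EnableA53Fix835769) // assumed cl::opt guard
        addPass(createAArch64A53Fix835769());
      return true;
    }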
diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp
new file mode 100644
index 0000000..852a635
--- /dev/null
+++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp
@@ -0,0 +1,240 @@
+//===-- AArch64A53Fix835769.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass works around Cortex-A53 erratum 835769 by inserting a nop
+// instruction into code sequences that, in some circumstances, may trigger
+// the erratum.
+// Specifically, it inserts a nop between consecutive instructions of the
+// following two classes:
+// instr 1: a memory instruction (load, store or prefetch).
+// instr 2: a non-SIMD integer multiply-accumulate writing 64-bit X registers.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-fix-cortex-a53-835769"
+
+STATISTIC(NumNopsAdded, "Number of Nops added to work around erratum 835769");
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+
+// Is the instruction a match for the instruction that comes first in the
+// sequence of instructions that can trigger the erratum?
+static bool isFirstInstructionInSequence(MachineInstr *MI) {
+ // Must return true if this instruction is a load, a store or a prefetch.
+ switch (MI->getOpcode()) {
+ case AArch64::PRFMl:
+ case AArch64::PRFMroW:
+ case AArch64::PRFMroX:
+ case AArch64::PRFMui:
+ case AArch64::PRFUMi:
+ return true;
+ default:
+ return (MI->mayLoad() || MI->mayStore());
+ }
+}
+
+// Is the instruction a match for the instruction that comes second in the
+// sequence that can trigger the erratum?
+static bool isSecondInstructionInSequence(MachineInstr *MI) {
+ // Must return true for non-SIMD integer multiply-accumulates, writing
+ // to a 64-bit register.
+ switch (MI->getOpcode()) {
+ // Erratum cannot be triggered when the destination register is 32 bits,
+ // therefore only include the following.
+ case AArch64::MSUBXrrr:
+ case AArch64::MADDXrrr:
+ case AArch64::SMADDLrrr:
+ case AArch64::SMSUBLrrr:
+ case AArch64::UMADDLrrr:
+ case AArch64::UMSUBLrrr:
+ // Erratum can only be triggered by multiply-adds, not by regular
+ // non-accumulating multiplies, i.e. when Ra=XZR='11111'
+ return MI->getOperand(3).getReg() != AArch64::XZR;
+ default:
+ return false;
+ }
+}
+
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AArch64A53Fix835769 : public MachineFunctionPass {
+ const AArch64InstrInfo *TII;
+
+public:
+ static char ID;
+ explicit AArch64A53Fix835769() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ const char *getPassName() const override {
+ return "Workaround A53 erratum 835769 pass";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ bool runOnBasicBlock(MachineBasicBlock &MBB);
+};
+char AArch64A53Fix835769::ID = 0;
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+bool
+AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) {
+ const TargetMachine &TM = F.getTarget();
+
+ bool Changed = false;
+ DEBUG(dbgs() << "***** AArch64A53Fix835769 *****\n");
+
+ TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ for (auto &MBB : F) {
+ Changed |= runOnBasicBlock(MBB);
+ }
+
+ return Changed;
+}
+
+// Return the block that was fallen through to get to MBB, if any,
+// otherwise nullptr.
+static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB,
+ const TargetInstrInfo *TII) {
+ // Get the previous machine basic block in the function.
+ MachineFunction::iterator MBBI = *MBB;
+
+ // Can't go off top of function.
+ if (MBBI == MBB->getParent()->begin())
+ return nullptr;
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 2> Cond;
+
+ MachineBasicBlock *PrevBB = std::prev(MBBI);
+ for (MachineBasicBlock *S : MBB->predecessors())
+ if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) &&
+ !TBB && !FBB)
+ return S;
+
+ return nullptr;
+}
+
+// Iterate through the fallen-through blocks, returning the last non-pseudo
+// instruction found in them, or nullptr if there is none. Only previous
+// blocks are searched, never MBB itself, since this helper exists solely
+// to inspect the code preceding MBB.
+static MachineInstr *getLastNonPseudo(MachineBasicBlock &MBB,
+ const TargetInstrInfo *TII) {
+ MachineBasicBlock *FMBB = &MBB;
+
+ // Walk back through the fallen-through predecessors (if any), scanning
+ // each in reverse until a non-pseudo instruction is found.
+ while ((FMBB = getBBFallenThrough(FMBB, TII))) {
+ for (auto I = FMBB->rbegin(), E = FMBB->rend(); I != E; ++I) {
+ if (!I->isPseudo())
+ return &*I;
+ }
+ }
+
+ // There was no previous non-pseudo in the fallen through blocks
+ return nullptr;
+}
+
+static void insertNopBeforeInstruction(MachineBasicBlock &MBB, MachineInstr* MI,
+ const TargetInstrInfo *TII) {
+ // If we are the first instruction of the block, put the NOP at the end of
+ // the previous fallthrough block
+ if (MI == &MBB.front()) {
+ MachineInstr *I = getLastNonPseudo(MBB, TII);
+ assert(I && "Expected instruction");
+ DebugLoc DL = I->getDebugLoc();
+ BuildMI(I->getParent(), DL, TII->get(AArch64::HINT)).addImm(0);
+ }
+ else {
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(MBB, MI, DL, TII->get(AArch64::HINT)).addImm(0);
+ }
+
+ ++NumNopsAdded;
+}
+
+bool
+AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+
+ // First, scan the basic block, looking for a sequence of 2 instructions
+ // that match the conditions under which the erratum may trigger.
+
+ // List of terminating instructions in matching sequences
+ std::vector<MachineInstr*> Sequences;
+ unsigned Idx = 0;
+ MachineInstr *PrevInstr = nullptr;
+
+ // Try to find the last non-pseudo instruction in any fallen-through blocks;
+ // if there isn't one, use nullptr to represent that.
+ PrevInstr = getLastNonPseudo(MBB, TII);
+
+ for (auto &MI : MBB) {
+ MachineInstr *CurrInstr = &MI;
+ DEBUG(dbgs() << " Examining: " << MI);
+ if (PrevInstr) {
+ DEBUG(dbgs() << " PrevInstr: " << *PrevInstr
+ << " CurrInstr: " << *CurrInstr
+ << " isFirstInstructionInSequence(PrevInstr): "
+ << isFirstInstructionInSequence(PrevInstr) << "\n"
+ << " isSecondInstructionInSequence(CurrInstr): "
+ << isSecondInstructionInSequence(CurrInstr) << "\n");
+ if (isFirstInstructionInSequence(PrevInstr) &&
+ isSecondInstructionInSequence(CurrInstr)) {
+ DEBUG(dbgs() << " ** pattern found at Idx " << Idx << "!\n");
+ Sequences.push_back(CurrInstr);
+ }
+ }
+ if (!CurrInstr->isPseudo())
+ PrevInstr = CurrInstr;
+ ++Idx;
+ }
+
+ DEBUG(dbgs() << "Scan complete, "<< Sequences.size()
+ << " occurences of pattern found.\n");
+
+ // Then update the basic block, inserting nops between the detected sequences.
+ for (auto &MI : Sequences) {
+ Changed = true;
+ insertNopBeforeInstruction(MBB, MI, TII);
+ }
+
+ return Changed;
+}
+
+// Factory function used by AArch64TargetMachine to add the pass to
+// the pass manager.
+FunctionPass *llvm::createAArch64A53Fix835769() {
+ return new AArch64A53Fix835769();
+}
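To make the erratum pattern concrete, a hand-written example (registers are arbitrary): the pass treats the store as instr 1 and the 64-bit multiply-accumulate as instr 2, and breaks the pair with a nop, emitted as HINT #0:

    // Before the pass:
    //   str  x5, [sp, #16]       // instr 1: a memory operation
    //   madd x0, x1, x2, x3      // instr 2: 64-bit multiply-accumulate
    //
    // After the pass:
    //   str  x5, [sp, #16]
    //   nop                      // HINT #0 from insertNopBeforeInstruction
    //   madd x0, x1, x2, x3
    //
    // A plain mul (Ra == xzr) would not match, per
    // isSecondInstructionInSequence.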
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
new file mode 100644
index 0000000..dd1a1ea
--- /dev/null
+++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -0,0 +1,706 @@
+//===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// For best-case performance on Cortex-A57, we should try to use a balanced
+// mix of odd and even D-registers when performing a critical sequence of
+// independent, non-quadword FP/ASIMD floating-point multiply or
+// multiply-accumulate operations.
+//
+// This pass attempts to detect situations where the register allocation may
+// adversely affect this load balancing and to change the registers used so as
+// to better utilize the CPU.
+//
+// Ideally we'd just take each multiply or multiply-accumulate in turn and
+// allocate it alternating even or odd registers. However, multiply-accumulates
+// are most efficiently performed in the same functional unit as their
+// accumulation operand. Therefore this pass tries to find maximal sequences
+// ("Chains") of multiply-accumulates linked via their accumulation operand,
+// and assign them all the same "color" (oddness/evenness).
+//
+// This optimization affects S-register and D-register floating point
+// multiplies and FMADD/FMAs, as well as vector (floating point only) muls and
+// FMADD/FMA. Q register instructions (and 128-bit vector instructions) are
+// not affected.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <list>
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-a57-fp-load-balancing"
+
+// Force the algorithm to use the scavenged register even when the original
+// destination register is already the correct color. Used for testing.
+static cl::opt<bool>
+TransformAll("aarch64-a57-fp-load-balancing-force-all",
+ cl::desc("Always modify dest registers regardless of color"),
+ cl::init(false), cl::Hidden);
+
+// Never use the balance information obtained from chains - always return a
+// specific color. Used for testing.
+static cl::opt<unsigned>
+OverrideBalance("aarch64-a57-fp-load-balancing-override",
+ cl::desc("Ignore balance information, always return "
+ "(1: Even, 2: Odd)."),
+ cl::init(0), cl::Hidden);
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+
+// Is the instruction a type of multiply on 64-bit (or 32-bit) FPRs?
+static bool isMul(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case AArch64::FMULSrr:
+ case AArch64::FNMULSrr:
+ case AArch64::FMULDrr:
+ case AArch64::FNMULDrr:
+ return true;
+ default:
+ return false;
+ }
+}
+
+// Is the instruction a type of FP multiply-accumulate on 64-bit (or 32-bit) FPRs?
+static bool isMla(MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ case AArch64::FMSUBSrrr:
+ case AArch64::FMADDSrrr:
+ case AArch64::FNMSUBSrrr:
+ case AArch64::FNMADDSrrr:
+ case AArch64::FMSUBDrrr:
+ case AArch64::FMADDDrrr:
+ case AArch64::FNMSUBDrrr:
+ case AArch64::FNMADDDrrr:
+ return true;
+ default:
+ return false;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// A "color", which is either even or odd. Yes, these aren't really colors
+/// but the algorithm is conceptually doing two-color graph coloring.
+enum class Color { Even, Odd };
+#ifndef NDEBUG
+static const char *ColorNames[2] = { "Even", "Odd" };
+#endif
+
+class Chain;
+
+class AArch64A57FPLoadBalancing : public MachineFunctionPass {
+ const AArch64InstrInfo *TII;
+ MachineRegisterInfo *MRI;
+ const TargetRegisterInfo *TRI;
+ RegisterClassInfo RCI;
+
+public:
+ static char ID;
+ explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ const char *getPassName() const override {
+ return "A57 FP Anti-dependency breaker";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ bool runOnBasicBlock(MachineBasicBlock &MBB);
+ bool colorChainSet(std::vector<Chain*> GV, MachineBasicBlock &MBB,
+ int &Balance);
+ bool colorChain(Chain *G, Color C, MachineBasicBlock &MBB);
+ int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB);
+ void scanInstruction(MachineInstr *MI, unsigned Idx,
+ std::map<unsigned, Chain*> &Active,
+ std::set<std::unique_ptr<Chain>> &AllChains);
+ void maybeKillChain(MachineOperand &MO, unsigned Idx,
+ std::map<unsigned, Chain*> &RegChains);
+ Color getColor(unsigned Register);
+ Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L);
+};
+char AArch64A57FPLoadBalancing::ID = 0;
+
+/// A Chain is a sequence of instructions that are linked together by
+/// an accumulation operand. For example:
+///
+/// fmul d0<def>, ?
+/// fmla d1<def>, ?, ?, d0<kill>
+/// fmla d2<def>, ?, ?, d1<kill>
+///
+/// There may be other instructions interleaved in the sequence that
+/// do not belong to the chain. These other instructions must not use
+/// the "chain" register at any point.
+///
+/// We currently only support chains where the "chain" operand is killed
+/// at each link in the chain for simplicity.
+/// A chain has three important instructions - Start, Last and Kill.
+/// * The start instruction is the first instruction in the chain.
+/// * Last is the final instruction in the chain.
+/// * Kill may or may not be defined. If defined, Kill is the instruction
+/// where the outgoing value of the Last instruction is killed.
+/// This information is important as if we know the outgoing value is
+/// killed with no intervening uses, we can safely change its register.
+///
+/// Without a kill instruction, we must assume the outgoing value escapes
+/// beyond our model and either must not change its register or must
+/// create a fixup FMOV to keep the old register value consistent.
+///
+class Chain {
+public:
+ /// The important (marker) instructions.
+ MachineInstr *StartInst, *LastInst, *KillInst;
+ /// The index, from the start of the basic block, that each marker
+ /// appears. These are stored so we can do quick interval tests.
+ unsigned StartInstIdx, LastInstIdx, KillInstIdx;
+ /// All instructions in the chain.
+ std::set<MachineInstr*> Insts;
+ /// True if KillInst cannot be modified. If this is true,
+ /// we cannot change LastInst's outgoing register.
+ /// This will be true for tied values and regmasks.
+ bool KillIsImmutable;
+ /// The "color" of LastInst. This will be the preferred chain color,
+ /// as changing intermediate nodes is easy but changing the last
+ /// instruction can be more tricky.
+ Color LastColor;
+
+ Chain(MachineInstr *MI, unsigned Idx, Color C)
+ : StartInst(MI), LastInst(MI), KillInst(nullptr),
+ StartInstIdx(Idx), LastInstIdx(Idx), KillInstIdx(0),
+ LastColor(C) {
+ Insts.insert(MI);
+ }
+
+ /// Add a new instruction into the chain. The instruction's dest operand
+ /// has the given color.
+ void add(MachineInstr *MI, unsigned Idx, Color C) {
+ LastInst = MI;
+ LastInstIdx = Idx;
+ LastColor = C;
+ assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
+ "Chain: broken invariant. A Chain can only be killed after its last "
+ "def");
+
+ Insts.insert(MI);
+ }
+
+ /// Return true if MI is a member of the chain.
+ bool contains(MachineInstr *MI) { return Insts.count(MI) > 0; }
+
+ /// Return the number of instructions in the chain.
+ unsigned size() const {
+ return Insts.size();
+ }
+
+ /// Inform the chain that its last active register (the dest register of
+ /// LastInst) is killed by MI with no intervening uses or defs.
+ void setKill(MachineInstr *MI, unsigned Idx, bool Immutable) {
+ KillInst = MI;
+ KillInstIdx = Idx;
+ KillIsImmutable = Immutable;
+ assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
+ "Chain: broken invariant. A Chain can only be killed after its last "
+ "def");
+ }
+
+ /// Return the first instruction in the chain.
+ MachineInstr *getStart() const { return StartInst; }
+ /// Return the last instruction in the chain.
+ MachineInstr *getLast() const { return LastInst; }
+ /// Return the "kill" instruction (as set with setKill()) or NULL.
+ MachineInstr *getKill() const { return KillInst; }
+ /// Return an instruction that can be used as an iterator for the end
+ /// of the chain. This is the maximum of KillInst (if set) and LastInst.
+ MachineBasicBlock::iterator getEnd() const {
+ return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst);
+ }
+
+ /// Can the Kill instruction (assuming one exists) be modified?
+ bool isKillImmutable() const { return KillIsImmutable; }
+
+ /// Return the preferred color of this chain.
+ Color getPreferredColor() {
+ if (OverrideBalance != 0)
+ return OverrideBalance == 1 ? Color::Even : Color::Odd;
+ return LastColor;
+ }
+
+ /// Return true if this chain (StartInst..KillInst) overlaps with Other.
+ bool rangeOverlapsWith(const Chain &Other) const {
+ unsigned End = KillInst ? KillInstIdx : LastInstIdx;
+ unsigned OtherEnd = Other.KillInst ?
+ Other.KillInstIdx : Other.LastInstIdx;
+
+ return StartInstIdx <= OtherEnd && Other.StartInstIdx <= End;
+ }
+
+ /// Return true if this chain starts before Other.
+ bool startsBefore(Chain *Other) {
+ return StartInstIdx < Other->StartInstIdx;
+ }
+
+ /// Return true if the group will require a fixup MOV at the end.
+ bool requiresFixup() const {
+ return (getKill() && isKillImmutable()) || !getKill();
+ }
+
+ /// Return a simple string representation of the chain.
+ std::string str() const {
+ std::string S;
+ raw_string_ostream OS(S);
+
+ OS << "{";
+ StartInst->print(OS, NULL, true);
+ OS << " -> ";
+ LastInst->print(OS, NULL, true);
+ if (KillInst) {
+ OS << " (kill @ ";
+ KillInst->print(OS, NULL, true);
+ OS << ")";
+ }
+ OS << "}";
+
+ return OS.str();
+ }
+
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
+ bool Changed = false;
+ DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n");
+
+ const TargetMachine &TM = F.getTarget();
+ MRI = &F.getRegInfo();
+ TRI = F.getRegInfo().getTargetRegisterInfo();
+ TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo();
+ RCI.runOnMachineFunction(F);
+
+ for (auto &MBB : F) {
+ Changed |= runOnBasicBlock(MBB);
+ }
+
+ return Changed;
+}
+
+bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+
+ // First, scan the basic block producing a set of chains.
+
+ // The currently "active" chains - chains that can be added to and haven't
+ // been killed yet. This is keyed by register - all chains can only have one
+ // "link" register between each inst in the chain.
+ std::map<unsigned, Chain*> ActiveChains;
+ std::set<std::unique_ptr<Chain>> AllChains;
+ unsigned Idx = 0;
+ for (auto &MI : MBB)
+ scanInstruction(&MI, Idx++, ActiveChains, AllChains);
+
+ DEBUG(dbgs() << "Scan complete, "<< AllChains.size() << " chains created.\n");
+
+ // Group the chains into disjoint sets based on their liveness range. This is
+ // a poor-man's version of graph coloring. Ideally we'd create an interference
+// graph and perform full-on graph coloring on that, but:
+ // (a) That's rather heavyweight for only two colors.
+ // (b) We expect multiple disjoint interference regions - in practice the live
+ // range of chains is quite small and they are clustered between loads
+ // and stores.
+ EquivalenceClasses<Chain*> EC;
+ for (auto &I : AllChains)
+ EC.insert(I.get());
+
+ for (auto &I : AllChains)
+ for (auto &J : AllChains)
+ if (I != J && I->rangeOverlapsWith(*J))
+ EC.unionSets(I.get(), J.get());
+ DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n");
+
+ // Now we assume that every member of an equivalence class interferes
+ // with every other member of that class, and with no members of other classes.
+
+ // Convert the EquivalenceClasses to a simpler set of sets.
+ std::vector<std::vector<Chain*> > V;
+ for (auto I = EC.begin(), E = EC.end(); I != E; ++I) {
+ std::vector<Chain*> Cs(EC.member_begin(I), EC.member_end());
+ if (Cs.empty()) continue;
+ V.push_back(std::move(Cs));
+ }
+
+ // Now that we have a set of sets, order them by start address so we can
+ // iterate over them sequentially.
+ std::sort(V.begin(), V.end(),
+ [](const std::vector<Chain*> &A,
+ const std::vector<Chain*> &B) {
+ return A.front()->startsBefore(B.front());
+ });
+
+ // As we only have two colors, we can track the global (BB-level) balance of
+ // odds versus evens. We aim to keep this near zero to keep both execution
+ // units fed.
+ // Positive means we're even-heavy, negative we're odd-heavy.
+ //
+ // FIXME: If chains have interdependencies, for example:
+ // mul r0, r1, r2
+ // mul r3, r0, r1
+ // We do not model this and may color each one differently, assuming we'll
+ // get ILP when we obviously can't. This hasn't been seen to be a problem
+ // in practice so far, so we simplify the algorithm by ignoring it.
+ int Parity = 0;
+
+ for (auto &I : V)
+ Changed |= colorChainSet(std::move(I), MBB, Parity);
+
+ return Changed;
+}
+
+Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor,
+ std::vector<Chain*> &L) {
+ if (L.empty())
+ return nullptr;
+
+ // We try to get the best candidate from L to color next, given that our
+ // preferred color is "PreferredColor". L is ordered from larger to smaller
+ // chains. It is beneficial to color the large chains before the small chains,
+ // but if we can't find a chain of the maximum length with the preferred color,
+ // we fuzz the size and look for slightly smaller chains before giving up and
+ // returning a chain that must be recolored.
+
+ // FIXME: Does this need to be configurable?
+ const unsigned SizeFuzz = 1;
+ unsigned MinSize = L.front()->size() - SizeFuzz;
+ for (auto I = L.begin(), E = L.end(); I != E; ++I) {
+ if ((*I)->size() <= MinSize) {
+ // We've gone past the size limit. Return the previous item.
+ Chain *Ch = *--I;
+ L.erase(I);
+ return Ch;
+ }
+
+ if ((*I)->getPreferredColor() == PreferredColor) {
+ Chain *Ch = *I;
+ L.erase(I);
+ return Ch;
+ }
+ }
+
+ // Bailout case - just return the first item.
+ Chain *Ch = L.front();
+ L.erase(L.begin());
+ return Ch;
+}
+
+bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
+ MachineBasicBlock &MBB,
+ int &Parity) {
+ bool Changed = false;
+ DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n");
+
+ // Sort by descending size order so that we allocate the most important
+ // sets first.
+ // Tie-break equivalent sizes by sorting chains requiring fixups before
+ // those without fixups. The logic here is that we should look at the
+ // chains that we cannot change before we look at those we can,
+ // so the parity counter is updated and we know what color we should
+ // change them to!
+ std::sort(GV.begin(), GV.end(), [](const Chain *G1, const Chain *G2) {
+ if (G1->size() != G2->size())
+ return G1->size() > G2->size();
+ return G1->requiresFixup() > G2->requiresFixup();
+ });
+
+ Color PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
+ while (Chain *G = getAndEraseNext(PreferredColor, GV)) {
+ // Start off by assuming we'll color to our own preferred color.
+ Color C = PreferredColor;
+ if (Parity == 0)
+ // But if we really don't care, use the chain's preferred color.
+ C = G->getPreferredColor();
+
+ DEBUG(dbgs() << " - Parity=" << Parity << ", Color="
+ << ColorNames[(int)C] << "\n");
+
+ // If we'll need a fixup FMOV, don't bother. Testing has shown that this
+ // happens infrequently and when it does it has at least a 50% chance of
+ // slowing code down instead of speeding it up.
+ if (G->requiresFixup() && C != G->getPreferredColor()) {
+ C = G->getPreferredColor();
+ DEBUG(dbgs() << " - " << G->str() << " - not worthwhile changing; "
+ "color remains " << ColorNames[(int)C] << "\n");
+ }
+
+ Changed |= colorChain(G, C, MBB);
+
+ Parity += (C == Color::Even) ? G->size() : -G->size();
+ PreferredColor = Parity < 0 ? Color::Even : Color::Odd;
+ }
+
+ return Changed;
+}
+
+int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
+ MachineBasicBlock &MBB) {
+ RegScavenger RS;
+ RS.enterBasicBlock(&MBB);
+ RS.forward(MachineBasicBlock::iterator(G->getStart()));
+
+ // Can we find an appropriate register that is available throughout the life
+ // of the chain?
+ unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
+ BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID));
+ for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd();
+ I != E; ++I) {
+ RS.forward(I);
+ AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID));
+
+ // Remove any registers clobbered by a regmask or any def register that is
+ // immediately dead.
+ for (auto J : I->operands()) {
+ if (J.isRegMask())
+ AvailableRegs.clearBitsNotInMask(J.getRegMask());
+
+ if (J.isReg() && J.isDef() && AvailableRegs[J.getReg()]) {
+ assert(J.isDead() && "Non-dead def should have been removed by now!");
+ AvailableRegs.reset(J.getReg());
+ }
+ }
+ }
+
+ // Make sure we allocate in-order, to get the cheapest registers first.
+ auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID));
+ for (auto Reg : Ord) {
+ if (!AvailableRegs[Reg])
+ continue;
+ if ((C == Color::Even && (Reg % 2) == 0) ||
+ (C == Color::Odd && (Reg % 2) == 1))
+ return Reg;
+ }
+
+ return -1;
+}
+
+bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
+ MachineBasicBlock &MBB) {
+ bool Changed = false;
+ DEBUG(dbgs() << " - colorChain(" << G->str() << ", "
+ << ColorNames[(int)C] << ")\n");
+
+ // Try to obtain a free register of the right class. Without a register
+ // to play with we cannot continue.
+ int Reg = scavengeRegister(G, C, MBB);
+ if (Reg == -1) {
+ DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n");
+ return false;
+ }
+ DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n");
+
+ std::map<unsigned, unsigned> Substs;
+ for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd();
+ I != E; ++I) {
+ if (!G->contains(I) &&
+ (&*I != G->getKill() || G->isKillImmutable()))
+ continue;
+
+ // I is a member of G, or I is a mutable instruction that kills G.
+
+ std::vector<unsigned> ToErase;
+ for (auto &U : I->operands()) {
+ if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) {
+ unsigned OrigReg = U.getReg();
+ U.setReg(Substs[OrigReg]);
+ if (U.isKill())
+ // Don't erase straight away, because there may be other operands
+ // that also reference this substitution!
+ ToErase.push_back(OrigReg);
+ } else if (U.isRegMask()) {
+ for (auto J : Substs) {
+ if (U.clobbersPhysReg(J.first))
+ ToErase.push_back(J.first);
+ }
+ }
+ }
+ // Now it's safe to remove the substs identified earlier.
+ for (auto J : ToErase)
+ Substs.erase(J);
+
+ // Only change the def if this isn't the last instruction.
+ if (&*I != G->getKill()) {
+ MachineOperand &MO = I->getOperand(0);
+
+ bool Change = TransformAll || getColor(MO.getReg()) != C;
+ if (G->requiresFixup() && &*I == G->getLast())
+ Change = false;
+
+ if (Change) {
+ Substs[MO.getReg()] = Reg;
+ MO.setReg(Reg);
+ MRI->setPhysRegUsed(Reg);
+
+ Changed = true;
+ }
+ }
+ }
+ assert(Substs.size() == 0 && "No substitutions should be left active!");
+
+ if (G->getKill()) {
+ DEBUG(dbgs() << " - Kill instruction seen.\n");
+ } else {
+ // We didn't have a kill instruction, but we didn't seem to need to change
+ // the destination register anyway.
+ DEBUG(dbgs() << " - Destination register not changed.\n");
+ }
+ return Changed;
+}
+
+void AArch64A57FPLoadBalancing::
+scanInstruction(MachineInstr *MI, unsigned Idx,
+ std::map<unsigned, Chain*> &ActiveChains,
+ std::set<std::unique_ptr<Chain>> &AllChains) {
+ // Inspect "MI", updating ActiveChains and AllChains.
+
+ if (isMul(MI)) {
+
+ for (auto &I : MI->uses())
+ maybeKillChain(I, Idx, ActiveChains);
+ for (auto &I : MI->defs())
+ maybeKillChain(I, Idx, ActiveChains);
+
+ // Create a new chain. Multiplies don't require forwarding so can go on any
+ // unit.
+ unsigned DestReg = MI->getOperand(0).getReg();
+
+ DEBUG(dbgs() << "New chain started for register "
+ << TRI->getName(DestReg) << " at " << *MI);
+
+ auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
+ ActiveChains[DestReg] = G.get();
+ AllChains.insert(std::move(G));
+
+ } else if (isMla(MI)) {
+
+ // It is beneficial to keep MLAs on the same functional unit as their
+ // accumulator operand.
+ unsigned DestReg = MI->getOperand(0).getReg();
+ unsigned AccumReg = MI->getOperand(3).getReg();
+
+ maybeKillChain(MI->getOperand(1), Idx, ActiveChains);
+ maybeKillChain(MI->getOperand(2), Idx, ActiveChains);
+ if (DestReg != AccumReg)
+ maybeKillChain(MI->getOperand(0), Idx, ActiveChains);
+
+ if (ActiveChains.find(AccumReg) != ActiveChains.end()) {
+ DEBUG(dbgs() << "Chain found for accumulator register "
+ << TRI->getName(AccumReg) << " in MI " << *MI);
+
+ // For simplicity we only chain together sequences of MULs/MLAs where the
+ // accumulator register is killed on each instruction. This means we don't
+ // need to track other uses of the registers we want to rewrite.
+ //
+ // FIXME: We could extend to handle the non-kill cases for more coverage.
+ if (MI->getOperand(3).isKill()) {
+ // Add to chain.
+ DEBUG(dbgs() << "Instruction was successfully added to chain.\n");
+ ActiveChains[AccumReg]->add(MI, Idx, getColor(DestReg));
+ // Handle cases where the destination is not the same as the accumulator.
+ if (DestReg != AccumReg) {
+ ActiveChains[DestReg] = ActiveChains[AccumReg];
+ ActiveChains.erase(AccumReg);
+ }
+ return;
+ }
+
+ DEBUG(dbgs() << "Cannot add to chain because accumulator operand wasn't "
+ << "marked <kill>!\n");
+ maybeKillChain(MI->getOperand(3), Idx, ActiveChains);
+ }
+
+ DEBUG(dbgs() << "Creating new chain for dest register "
+ << TRI->getName(DestReg) << "\n");
+ auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
+ ActiveChains[DestReg] = G.get();
+ AllChains.insert(std::move(G));
+
+ } else {
+
+ // Non-MUL or MLA instruction. Invalidate any chain in the uses or defs
+ // lists.
+ for (auto &I : MI->uses())
+ maybeKillChain(I, Idx, ActiveChains);
+ for (auto &I : MI->defs())
+ maybeKillChain(I, Idx, ActiveChains);
+
+ }
+}
+
+void AArch64A57FPLoadBalancing::
+maybeKillChain(MachineOperand &MO, unsigned Idx,
+ std::map<unsigned, Chain*> &ActiveChains) {
+ // Given an operand and the set of active chains (keyed by register),
+ // determine if a chain should be ended and remove from ActiveChains.
+ MachineInstr *MI = MO.getParent();
+
+ if (MO.isReg()) {
+
+ // If this is a KILL of a current chain, record it.
+ if (MO.isKill() && ActiveChains.find(MO.getReg()) != ActiveChains.end()) {
+ DEBUG(dbgs() << "Kill seen for chain " << TRI->getName(MO.getReg())
+ << "\n");
+ ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied());
+ }
+ ActiveChains.erase(MO.getReg());
+
+ } else if (MO.isRegMask()) {
+
+ for (auto I = ActiveChains.begin(), E = ActiveChains.end();
+ I != E;) {
+ if (MO.clobbersPhysReg(I->first)) {
+ DEBUG(dbgs() << "Kill (regmask) seen for chain "
+ << TRI->getName(I->first) << "\n");
+ I->second->setKill(MI, Idx, /*Immutable=*/true);
+ ActiveChains.erase(I++);
+ } else
+ ++I;
+ }
+
+ }
+}
+
+Color AArch64A57FPLoadBalancing::getColor(unsigned Reg) {
+ if ((TRI->getEncodingValue(Reg) % 2) == 0)
+ return Color::Even;
+ else
+ return Color::Odd;
+}
+
+// Factory function used by AArch64TargetMachine to add the pass to the pass manager.
+FunctionPass *llvm::createAArch64A57FPLoadBalancing() {
+ return new AArch64A57FPLoadBalancing();
+}
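As a worked illustration of the chain machinery above (hand-written; register choices are arbitrary): scanInstruction links multiply-accumulates through their killed accumulator operand, and colorChain then rewrites every def in the chain to a single scavenged register of the chosen color:

    // A chain built by scanInstruction (d0 has an even encoding):
    //   fmul d0, d1, d2            // starts the chain
    //   fmla d4, d5, d6, d0        // d0 killed here: joins the chain
    //   fmla d8, d9, d10, d4       // d4 killed here: chain size is now 3
    //
    // If the block is even-heavy (Parity > 0), colorChainSet prefers Odd,
    // and colorChain may scavenge an odd register such as d7, giving:
    //   fmul d7, d1, d2
    //   fmla d7, d5, d6, d7
    //   fmla d7, d9, d10, d7
    //
    // (This assumes the chain's final value has a recorded, mutable kill;
    // otherwise requiresFixup() keeps the last def unchanged.)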
diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
index ab2c4b7..287989f 100644
--- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
+++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -19,7 +19,7 @@
// a = add nsw i64 f, 3
// e = getelementptr ..., i64 a
//
-// This is legal to do so if the computations are markers with either nsw or nuw
+// This is legal to do if the computations are marked with either nsw or nuw
// markers.
// Moreover, the current heuristic is simple: it does not create new sext
// operations, i.e., it gives up when a sext would have forked (e.g., if
@@ -223,7 +223,7 @@ AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
}
// Input:
-// - SExtInsts contains all the sext instructions that are use direclty in
+// - SExtInsts contains all the sext instructions that are used directly in
// GetElementPtrInst, i.e., access to memory.
// Algorithm:
// - For each sext operation in SExtInsts:
@@ -353,7 +353,7 @@ AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
// If the use is already of the right type, connect its uses to its argument
// and delete it.
- // This can happen for an Instruction which all uses are sign extended.
+ // This can happen for an Instruction all uses of which are sign extended.
if (!ToRemove.count(SExt) &&
SExt->getType() == SExt->getOperand(0)->getType()) {
DEBUG(dbgs() << "Sign extension is useless, attach its use to "
diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 734fb21..5afe0f4 100644
--- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -36,9 +36,10 @@
#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -166,6 +167,12 @@ static int getTransformOpcode(unsigned Opc) {
return AArch64::ADDv1i64;
case AArch64::SUBXrr:
return AArch64::SUBv1i64;
+ case AArch64::ANDXrr:
+ return AArch64::ANDv8i8;
+ case AArch64::EORXrr:
+ return AArch64::EORv8i8;
+ case AArch64::ORRXrr:
+ return AArch64::ORRv8i8;
}
// No AdvSIMD equivalent, so just return the original opcode.
return Opc;
@@ -371,7 +378,8 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
const TargetMachine &TM = mf.getTarget();
MRI = &mf.getRegInfo();
- TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+ TII = static_cast<const AArch64InstrInfo *>(
+ TM.getSubtargetImpl()->getInstrInfo());
// Just check things on a one-block-at-a-time basis.
for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
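For intuition on the new getTransformOpcode entries above, a hand-written before/after (register choices illustrative): when the operands of a GPR logical op already live in FPRs, rewriting to the AdvSIMD form avoids cross-register-bank copies:

    // Before (values start and end in FPRs, so copies are needed):
    //   fmov x0, d0
    //   fmov x1, d1
    //   and  x0, x0, x1            // ANDXrr
    //   fmov d2, x0
    //
    // After the pass, using the new ANDXrr -> ANDv8i8 mapping:
    //   and  v2.8b, v0.8b, v1.8b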
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index cd94e24..08ee687 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -13,8 +13,8 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
-#include "AArch64MachineFunctionInfo.h"
#include "AArch64MCInstLower.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "InstPrinter/AArch64InstPrinter.h"
@@ -23,8 +23,8 @@
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
@@ -54,7 +54,7 @@ public:
AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
: AsmPrinter(TM, Streamer),
Subtarget(&TM.getSubtarget<AArch64Subtarget>()),
- MCInstLowering(OutContext, *Mang, *this), SM(*this), AArch64FI(nullptr),
+ MCInstLowering(OutContext, *this), SM(*this), AArch64FI(nullptr),
LOHLabelCounter(0) {}
const char *getPassName() const override {
@@ -145,7 +145,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
if (!Stubs.empty()) {
OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
- const DataLayout *TD = TM.getDataLayout();
+ const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
OutStreamer.EmitLabel(Stubs[i].first);
@@ -252,8 +252,8 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
const TargetRegisterClass *RC,
bool isVector, raw_ostream &O) {
assert(MO.isReg() && "Should only get here with a register!");
- const AArch64RegisterInfo *RI =
- static_cast<const AArch64RegisterInfo *>(TM.getRegisterInfo());
+ const AArch64RegisterInfo *RI = static_cast<const AArch64RegisterInfo *>(
+ TM.getSubtargetImpl()->getRegisterInfo());
unsigned Reg = MO.getReg();
unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
assert(RI->regsOverlap(RegToPrint, Reg));
@@ -381,8 +381,23 @@ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
unsigned NumNOPBytes = MI.getOperand(1).getImm();
SM.recordStackMap(MI);
- // Emit padding.
assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+
+ // Scan ahead to trim the shadow.
+ const MachineBasicBlock &MBB = *MI.getParent();
+ MachineBasicBlock::const_iterator MII(MI);
+ ++MII;
+ while (NumNOPBytes > 0) {
+ if (MII == MBB.end() || MII->isCall() ||
+ MII->getOpcode() == AArch64::DBG_VALUE ||
+ MII->getOpcode() == TargetOpcode::PATCHPOINT ||
+ MII->getOpcode() == TargetOpcode::STACKMAP)
+ break;
+ ++MII;
+ NumNOPBytes -= 4;
+ }
+
+ // Emit nops.
for (unsigned i = 0; i < NumNOPBytes; i += 4)
EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
}
@@ -518,7 +533,5 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
extern "C" void LLVMInitializeAArch64AsmPrinter() {
RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget);
RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget);
-
- RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64leTarget);
- RegisterAsmPrinter<AArch64AsmPrinter> W(TheARM64beTarget);
+ RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64Target);
}
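A short walk-through of the shadow trimming added to LowerSTACKMAP above (the instruction sequence is hand-written): real instructions already following the stackmap count toward the requested shadow, so fewer nops are emitted, and scanning stops at a call:

    //   STACKMAP 0, 16             // requests a 16-byte nop shadow
    //   add x0, x0, #1             // scanned: NumNOPBytes 16 -> 12
    //   ldr x1, [x2]               // scanned: NumNOPBytes 12 -> 8
    //   bl  callee                 // isCall(): stop trimming
    //
    // Only 8 bytes of padding remain, so two HINT #0 instructions are
    // emitted instead of four.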
diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
index 484e7e8..e2b6367 100644
--- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp
+++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
@@ -12,15 +12,16 @@
#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/CommandLine.h"
using namespace llvm;
#define DEBUG_TYPE "aarch64-branch-relax"
@@ -136,7 +137,7 @@ static bool BBHasFallthrough(MachineBasicBlock *MBB) {
if (NextBB == MBB->getParent()->end())
return false;
- for (MachineBasicBlock *S : MBB->successors())
+ for (MachineBasicBlock *S : MBB->successors())
if (S == NextBB)
return true;
@@ -475,7 +476,9 @@ bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n");
- TII = (const AArch64InstrInfo *)MF->getTarget().getInstrInfo();
+ TII = (const AArch64InstrInfo *)MF->getTarget()
+ .getSubtargetImpl()
+ ->getInstrInfo();
// Renumber all of the machine basic blocks in the function, guaranteeing that
// the numbers agree with the position of the block in the function.
diff --git a/lib/Target/AArch64/AArch64CallingConvention.h b/lib/Target/AArch64/AArch64CallingConvention.h
new file mode 100644
index 0000000..baf80bc
--- /dev/null
+++ b/lib/Target/AArch64/AArch64CallingConvention.h
@@ -0,0 +1,141 @@
+//=== AArch64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the AArch64 Calling Convention
+// that aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLINGCONVENTION_H
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace {
+using namespace llvm;
+
+static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2,
+ AArch64::X3, AArch64::X4, AArch64::X5,
+ AArch64::X6, AArch64::X7};
+static const uint16_t HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2,
+ AArch64::H3, AArch64::H4, AArch64::H5,
+ AArch64::H6, AArch64::H7};
+static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2,
+ AArch64::S3, AArch64::S4, AArch64::S5,
+ AArch64::S6, AArch64::S7};
+static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2,
+ AArch64::D3, AArch64::D4, AArch64::D5,
+ AArch64::D6, AArch64::D7};
+static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
+ AArch64::Q3, AArch64::Q4, AArch64::Q5,
+ AArch64::Q6, AArch64::Q7};
+
+static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
+ MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
+ CCState &State, unsigned SlotAlign) {
+ unsigned Size = LocVT.getSizeInBits() / 8;
+ unsigned StackAlign = State.getMachineFunction()
+ .getSubtarget()
+ .getDataLayout()
+ ->getStackAlignment();
+ unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign);
+
+ for (auto &It : PendingMembers) {
+ It.convertToMem(State.AllocateStack(Size, std::max(Align, SlotAlign)));
+ State.addLoc(It);
+ SlotAlign = 1;
+ }
+
+ // All pending members have now been allocated
+ PendingMembers.clear();
+ return true;
+}
+
+/// The Darwin variadic PCS places anonymous arguments in 8-byte stack slots. An
+/// [N x Ty] type must still be contiguous in memory though.
+static bool CC_AArch64_Custom_Stack_Block(
+ unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // Add the argument to the list to be allocated once we know the size of the
+ // block.
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+ if (!ArgFlags.isInConsecutiveRegsLast())
+ return true;
+
+ return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8);
+}
+
+/// Given an [N x Ty] block, it should be passed in a consecutive sequence of
+/// registers. If no such sequence is available, mark the rest of the registers
+/// of that type as used and place the argument on the stack.
+static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+ CCValAssign::LocInfo &LocInfo,
+ ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+ // Try to allocate a contiguous block of registers, each of the correct
+ // size to hold one member.
+ ArrayRef<uint16_t> RegList;
+ if (LocVT.SimpleTy == MVT::i64)
+ RegList = XRegList;
+ else if (LocVT.SimpleTy == MVT::f16)
+ RegList = HRegList;
+ else if (LocVT.SimpleTy == MVT::f32 || LocVT.is32BitVector())
+ RegList = SRegList;
+ else if (LocVT.SimpleTy == MVT::f64 || LocVT.is64BitVector())
+ RegList = DRegList;
+ else if (LocVT.SimpleTy == MVT::f128 || LocVT.is128BitVector())
+ RegList = QRegList;
+ else {
+ // Not an array we want to split up after all.
+ return false;
+ }
+
+ SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+
+ // Add the argument to the list to be allocated once we know the size of the
+ // block.
+ PendingMembers.push_back(
+ CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+ if (!ArgFlags.isInConsecutiveRegsLast())
+ return true;
+
+ unsigned RegResult = State.AllocateRegBlock(RegList, PendingMembers.size());
+ if (RegResult) {
+ for (auto &It : PendingMembers) {
+ It.convertToReg(RegResult);
+ State.addLoc(It);
+ ++RegResult;
+ }
+ PendingMembers.clear();
+ return true;
+ }
+
+ // Mark all regs in the class as unavailable
+ for (auto Reg : RegList)
+ State.AllocateReg(Reg);
+
+ const AArch64Subtarget &Subtarget = static_cast<const AArch64Subtarget &>(
+ State.getMachineFunction().getSubtarget());
+ unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8;
+
+ return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign);
+}
+
+}
+
+#endif
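A walk-through of CC_AArch64_Custom_Block on a homogeneous floating-point aggregate (the source-level struct and the register numbers are illustrative, not from the source):

    // struct HFA { double a, b, c; };  // arrives as three consecutive f64s
    //
    // Each member is flagged InConsecutiveRegs. The first two are queued as
    // pending locations; the last one triggers
    // State.AllocateRegBlock(DRegList, 3). If three consecutive registers
    // in D0-D7 are free, the members land in, say, D2, D3 and D4.
    // Otherwise every remaining D register is marked used and the whole
    // block goes contiguously to the stack via finishStackBlock (first
    // slot 8-byte aligned for AAPCS; natural alignment on Darwin, where
    // SlotAlign is 1).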
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 1fe5138..1a80402 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -16,7 +16,7 @@ class CCIfAlign<string Align, CCAction A> :
CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
/// CCIfBigEndian - Match only if we're in big endian mode.
class CCIfBigEndian<CCAction A> :
- CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>;
+ CCIf<"State.getMachineFunction().getSubtarget().getDataLayout()->isBigEndian()", A>;
//===----------------------------------------------------------------------===//
// ARM AAPCS64 Calling Convention
@@ -40,6 +40,8 @@ def CC_AArch64_AAPCS : CallingConv<[
// slot is 64-bit.
CCIfByVal<CCPassByVal<8, 8>>,
+ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
// up to eight each of GPR and FPR.
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -60,18 +62,18 @@ def CC_AArch64_AAPCS : CallingConv<[
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
// If more than will fit in registers, pass them on the stack instead.
CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>,
CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
- CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
CCAssignToStack<8, 8>>,
- CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
CCAssignToStack<16, 16>>
]>;
@@ -96,10 +98,10 @@ def RetCC_AArch64_AAPCS : CallingConv<[
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
]>;
@@ -119,6 +121,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[
// slot is 64-bit.
CCIfByVal<CCPassByVal<8, 8>>,
+ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
+
// Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
// up to eight each of GPR and FPR.
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -139,25 +143,28 @@ def CC_AArch64_DarwinPCS : CallingConv<[
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
// If more than will fit in registers, pass them on the stack instead.
CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
- CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
CCAssignToStack<8, 8>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCAssignToStack<16, 16>>
]>;
def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Stack_Block">>,
+
// Handle all scalar types as either i64 or f64.
CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
CCIfType<[f16, f32], CCPromoteToType<f64>>,
@@ -165,8 +172,10 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
// Everything is on the stack.
// i128 is split to two i64s, and its stack alignment is 16 bytes.
CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
- CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32], CCAssignToStack<8, 8>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+ CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCAssignToStack<8, 8>>,
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCAssignToStack<16, 16>>
]>;
// The WebKit_JS calling convention only passes the first argument (the callee)
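Taken together, the calling-convention hunks above extend every rule that already covered the 64-bit and 128-bit vector types to the new half-precision vectors: v4f16 follows the D-register and 8-byte stack rules, v8f16 the Q-register and 16-byte stack rules. As a minimal sketch of what the updated AAPCS table means for a run of v4f16 arguments (plain C++, not the LLVM CCState machinery; all names below are illustrative):

    #include <cstdio>

    struct Loc { bool InReg; unsigned RegOrOffset; };

    // First eight 64-bit vector arguments land in D0-D7 (shadowing Q0-Q7);
    // the rest go to 8-byte-aligned stack slots (CCAssignToStack<8, 8>).
    static Loc assignV4F16(unsigned &NextD, unsigned &StackOffset) {
      if (NextD < 8)
        return {true, NextD++};
      Loc L = {false, StackOffset};
      StackOffset += 8;
      return L;
    }

    int main() {
      unsigned NextD = 0, StackOffset = 0;
      for (int I = 0; I < 10; ++I) {
        Loc L = assignV4F16(NextD, StackOffset);
        if (L.InReg)
          std::printf("arg%d -> D%u\n", I, L.RegOrOffset);
        else
          std::printf("arg%d -> [sp, #%u]\n", I, L.RegOrOffset);
      }
      return 0;
    }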
diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index 4d23dc5..aab8e38 100644
--- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -94,7 +94,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
MachineFunction *MF = I->getParent()->getParent();
const AArch64TargetMachine *TM =
static_cast<const AArch64TargetMachine *>(&MF->getTarget());
- const AArch64InstrInfo *TII = TM->getInstrInfo();
+ const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
// Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
// code sequence assumes the address will be.
@@ -114,7 +114,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
MachineFunction *MF = I->getParent()->getParent();
const AArch64TargetMachine *TM =
static_cast<const AArch64TargetMachine *>(&MF->getTarget());
- const AArch64InstrInfo *TII = TM->getInstrInfo();
+ const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
// Create a virtual register for the TLS base address.
MachineRegisterInfo &RegInfo = MF->getRegInfo();
diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index 6b1f096..87b545b 100644
--- a/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -101,25 +101,26 @@
#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "aarch64-collect-loh"
@@ -194,12 +195,14 @@ typedef SetVector<const MachineInstr *> SetOfMachineInstr;
/// Map a basic block to a set of instructions per register.
/// This is used to represent the exposed uses of a basic block
/// per register.
-typedef MapVector<const MachineBasicBlock *, SetOfMachineInstr *>
+typedef MapVector<const MachineBasicBlock *,
+ std::unique_ptr<SetOfMachineInstr[]>>
BlockToSetOfInstrsPerColor;
/// Map a basic block to an instruction per register.
/// This is used to represent the live-out definitions of a basic block
/// per register.
-typedef MapVector<const MachineBasicBlock *, const MachineInstr **>
+typedef MapVector<const MachineBasicBlock *,
+ std::unique_ptr<const MachineInstr *[]>>
BlockToInstrPerColor;
/// Map an instruction to a set of instructions. Used to represent the
/// mapping def to reachable uses or use to definitions.
@@ -236,9 +239,9 @@ static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets,
SetOfMachineInstr *result;
BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB);
if (it != sets.end())
- result = it->second;
+ result = it->second.get();
else
- result = sets[&MBB] = new SetOfMachineInstr[nbRegs];
+ result = (sets[&MBB] = make_unique<SetOfMachineInstr[]>(nbRegs)).get();
return result[reg];
}
@@ -283,14 +286,14 @@ static void initReachingDef(MachineFunction &MF,
const MapRegToId &RegToId,
const MachineInstr *DummyOp, bool ADRPMode) {
const TargetMachine &TM = MF.getTarget();
- const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
unsigned NbReg = RegToId.size();
for (MachineBasicBlock &MBB : MF) {
- const MachineInstr **&BBGen = Gen[&MBB];
- BBGen = new const MachineInstr *[NbReg];
- memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg);
+ auto &BBGen = Gen[&MBB];
+ BBGen = make_unique<const MachineInstr *[]>(NbReg);
+ std::fill(BBGen.get(), BBGen.get() + NbReg, nullptr);
BitVector &BBKillSet = Kill[&MBB];
BBKillSet.resize(NbReg);
@@ -421,22 +424,6 @@ static void reachingDefAlgorithm(MachineFunction &MF,
} while (HasChanged);
}
-/// Release all memory dynamically allocated during the reaching
-/// definition algorithm.
-static void finitReachingDef(BlockToSetOfInstrsPerColor &In,
- BlockToSetOfInstrsPerColor &Out,
- BlockToInstrPerColor &Gen,
- BlockToSetOfInstrsPerColor &ReachableUses) {
- for (auto &IT : Out)
- delete[] IT.second;
- for (auto &IT : In)
- delete[] IT.second;
- for (auto &IT : ReachableUses)
- delete[] IT.second;
- for (auto &IT : Gen)
- delete[] IT.second;
-}
-
/// Reaching definition algorithm.
/// \param MF function on which the algorithm will operate.
/// \param[out] ColorOpToReachedUses will contain the result of the reaching
@@ -473,9 +460,6 @@ static void reachingDef(MachineFunction &MF,
if (!DummyOp)
reachingDefAlgorithm(MF, ColorOpToReachedUses, In, Out, Gen, Kill,
ReachableUses, RegToId.size());
-
- // finit.
- finitReachingDef(In, Out, Gen, ReachableUses);
}
#ifndef NDEBUG
@@ -1043,7 +1027,7 @@ static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId,
bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
const TargetMachine &TM = MF.getTarget();
- const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
MapRegToId RegToId;
@@ -1059,8 +1043,8 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
MachineInstr *DummyOp = nullptr;
if (BasicBlockScopeOnly) {
- const AArch64InstrInfo *TII =
- static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
+ const AArch64InstrInfo *TII = static_cast<const AArch64InstrInfo *>(
+ TM.getSubtargetImpl()->getInstrInfo());
// For local analysis, create a dummy operation to record uses that are not
// local.
DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc());
diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
new file mode 100644
index 0000000..0fbd3c6
--- /dev/null
+++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -0,0 +1,422 @@
+//=- AArch64ConditionOptimizer.cpp - Remove useless comparisons for AArch64 -=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to make consecutive compares of values use the same
+// operands so that the CSE pass can remove duplicated instructions. It
+// analyzes
+// branches and adjusts comparisons with immediate values by converting:
+// * GE -> GT
+// * GT -> GE
+// * LT -> LE
+// * LE -> LT
+// and adjusting immediate values appropriately. It basically corrects two
+// immediate values towards each other to make them equal.
+//
+// Consider the following example in C:
+//
+// if ((a < 5 && ...) || (a > 5 && ...)) {
+// ~~~~~ ~~~~~
+// ^ ^
+// x y
+//
+// Here both "x" and "y" expressions compare "a" with "5". When "x" evaluates
+// to "false", "y" can just check the flags set by the first comparison. As a
+// result of the canonicalization employed by
+// SelectionDAGBuilder::visitSwitchCase, DAGCombine, and other target-specific
+// code, the assembly ends up in a form that is not CSE-friendly:
+//
+// ...
+// cmp w8, #4
+// b.gt .LBB0_3
+// ...
+// .LBB0_3:
+// cmp w8, #6
+// b.lt .LBB0_6
+// ...
+//
+// Same assembly after the pass:
+//
+// ...
+// cmp w8, #5
+// b.ge .LBB0_3
+// ...
+// .LBB0_3:
+// cmp w8, #5 // <-- CSE pass removes this instruction
+// b.le .LBB0_6
+// ...
+//
+// Currently only SUBS and ADDS followed by b.?? are supported.
+//
+// TODO: maybe handle TBNZ/TBZ when they are used in place of CMP for "a < 0"
+// TODO: handle other conditional instructions (e.g. CSET)
+// TODO: allow second branching to be anything if it doesn't require adjusting
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cstdlib>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-condopt"
+
+STATISTIC(NumConditionsAdjusted, "Number of conditions adjusted");
+
+namespace {
+class AArch64ConditionOptimizer : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ MachineDominatorTree *DomTree;
+
+public:
+  // Stores the immediate, compare instruction opcode, and branch condition
+  // (in this order) of an adjusted comparison.
+ typedef std::tuple<int, int, AArch64CC::CondCode> CmpInfo;
+
+ static char ID;
+ AArch64ConditionOptimizer() : MachineFunctionPass(ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ MachineInstr *findSuitableCompare(MachineBasicBlock *MBB);
+ CmpInfo adjustCmp(MachineInstr *CmpMI, AArch64CC::CondCode Cmp);
+ void modifyCmp(MachineInstr *CmpMI, const CmpInfo &Info);
+ bool adjustTo(MachineInstr *CmpMI, AArch64CC::CondCode Cmp, MachineInstr *To,
+ int ToImm);
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ const char *getPassName() const override {
+ return "AArch64 Condition Optimizer";
+ }
+};
+} // end anonymous namespace
+
+char AArch64ConditionOptimizer::ID = 0;
+
+namespace llvm {
+void initializeAArch64ConditionOptimizerPass(PassRegistry &);
+}
+
+INITIALIZE_PASS_BEGIN(AArch64ConditionOptimizer, "aarch64-condopt",
+ "AArch64 CondOpt Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(AArch64ConditionOptimizer, "aarch64-condopt",
+ "AArch64 CondOpt Pass", false, false)
+
+FunctionPass *llvm::createAArch64ConditionOptimizerPass() {
+ return new AArch64ConditionOptimizer();
+}
+
+void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+// Finds the compare instruction that feeds the supported kinds of branching.
+// Returns the instruction, or nullptr if it fails or finds an unsupported
+// instruction.
+MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
+ MachineBasicBlock *MBB) {
+ MachineBasicBlock::iterator I = MBB->getFirstTerminator();
+ if (I == MBB->end())
+ return nullptr;
+
+ if (I->getOpcode() != AArch64::Bcc)
+ return nullptr;
+
+ // Now find the instruction controlling the terminator.
+ for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
+ --I;
+ assert(!I->isTerminator() && "Spurious terminator");
+ switch (I->getOpcode()) {
+ // cmp is an alias for subs with a dead destination register.
+ case AArch64::SUBSWri:
+ case AArch64::SUBSXri:
+ // cmn is an alias for adds with a dead destination register.
+ case AArch64::ADDSWri:
+ case AArch64::ADDSXri:
+ if (I->getOperand(0).isDead())
+ return I;
+
+ DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
+ return nullptr;
+
+    // Prevent a false-positive case like:
+ // cmp w19, #0
+ // cinc w0, w19, gt
+ // ...
+ // fcmp d8, #0.0
+ // b.gt .LBB0_5
+ case AArch64::FCMPDri:
+ case AArch64::FCMPSri:
+ case AArch64::FCMPESri:
+ case AArch64::FCMPEDri:
+
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSXrr:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSXrr:
+ case AArch64::FCMPSrr:
+ case AArch64::FCMPDrr:
+ case AArch64::FCMPESrr:
+ case AArch64::FCMPEDrr:
+ // Skip comparison instructions without immediate operands.
+ return nullptr;
+ }
+ }
+ DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n');
+ return nullptr;
+}
+
+// Changes the opcode adds <-> subs, preserving the register operand width.
+static int getComplementOpc(int Opc) {
+ switch (Opc) {
+ case AArch64::ADDSWri: return AArch64::SUBSWri;
+ case AArch64::ADDSXri: return AArch64::SUBSXri;
+ case AArch64::SUBSWri: return AArch64::ADDSWri;
+ case AArch64::SUBSXri: return AArch64::ADDSXri;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
+
+// Changes the form of the comparison: inclusive <-> exclusive.
+static AArch64CC::CondCode getAdjustedCmp(AArch64CC::CondCode Cmp) {
+ switch (Cmp) {
+ case AArch64CC::GT: return AArch64CC::GE;
+ case AArch64CC::GE: return AArch64CC::GT;
+ case AArch64CC::LT: return AArch64CC::LE;
+ case AArch64CC::LE: return AArch64CC::LT;
+ default:
+ llvm_unreachable("Unexpected condition code");
+ }
+}
+
+// Transforms GT -> GE, GE -> GT, LT -> LE, LE -> LT by updating comparison
+// operator and condition code.
+AArch64ConditionOptimizer::CmpInfo AArch64ConditionOptimizer::adjustCmp(
+ MachineInstr *CmpMI, AArch64CC::CondCode Cmp) {
+ int Opc = CmpMI->getOpcode();
+
+  // CMN (compare with negative immediate) is an alias for ADDS (as
+  // "operand - negative" == "operand + positive").
+ bool Negative = (Opc == AArch64::ADDSWri || Opc == AArch64::ADDSXri);
+
+ int Correction = (Cmp == AArch64CC::GT) ? 1 : -1;
+ // Negate Correction value for comparison with negative immediate (CMN).
+ if (Negative) {
+ Correction = -Correction;
+ }
+
+ const int OldImm = (int)CmpMI->getOperand(2).getImm();
+ const int NewImm = std::abs(OldImm + Correction);
+
+ // Handle +0 -> -1 and -0 -> +1 (CMN with 0 immediate) transitions by
+ // adjusting compare instruction opcode.
+ if (OldImm == 0 && ((Negative && Correction == 1) ||
+ (!Negative && Correction == -1))) {
+ Opc = getComplementOpc(Opc);
+ }
+
+ return CmpInfo(NewImm, Opc, getAdjustedCmp(Cmp));
+}
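+
+// For example, "cmp w8, #4" with GT adjusts to "cmp w8, #5" with GE
+// (a > 4 <=> a >= 5), while "cmp w8, #0" with LT crosses zero and becomes
+// "cmn w8, #1" with LE (a < 0 <=> a <= -1), switching SUBS to ADDS via
+// getComplementOpc().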
+
+// Applies changes to comparison instruction suggested by adjustCmp().
+void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
+ const CmpInfo &Info) {
+ int Imm;
+ int Opc;
+ AArch64CC::CondCode Cmp;
+ std::tie(Imm, Opc, Cmp) = Info;
+
+ MachineBasicBlock *const MBB = CmpMI->getParent();
+
+ // Change immediate in comparison instruction (ADDS or SUBS).
+ BuildMI(*MBB, CmpMI, CmpMI->getDebugLoc(), TII->get(Opc))
+ .addOperand(CmpMI->getOperand(0))
+ .addOperand(CmpMI->getOperand(1))
+ .addImm(Imm)
+ .addOperand(CmpMI->getOperand(3));
+ CmpMI->eraseFromParent();
+
+ // The fact that this comparison was picked ensures that it's related to the
+ // first terminator instruction.
+ MachineInstr *BrMI = MBB->getFirstTerminator();
+
+ // Change condition in branch instruction.
+ BuildMI(*MBB, BrMI, BrMI->getDebugLoc(), TII->get(AArch64::Bcc))
+ .addImm(Cmp)
+ .addOperand(BrMI->getOperand(1));
+ BrMI->eraseFromParent();
+
+ MBB->updateTerminator();
+
+ ++NumConditionsAdjusted;
+}
+
+// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
+// corresponding to TBB.
+// Returns true if parsing was successful, false otherwise.
+static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
+ // A normal br.cond simply has the condition code.
+ if (Cond[0].getImm() != -1) {
+ assert(Cond.size() == 1 && "Unknown Cond array format");
+ CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+ return true;
+ }
+ return false;
+}
+
+// Adjusts one cmp instruction to match another if the result of the
+// adjustment will allow CSE. Returns true if the compare instruction was
+// changed, false otherwise.
+bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI,
+                                         AArch64CC::CondCode Cmp,
+                                         MachineInstr *To, int ToImm) {
+ CmpInfo Info = adjustCmp(CmpMI, Cmp);
+ if (std::get<0>(Info) == ToImm && std::get<1>(Info) == To->getOpcode()) {
+ modifyCmp(CmpMI, Info);
+ return true;
+ }
+ return false;
+}
+
+bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+ TII = MF.getTarget().getSubtargetImpl()->getInstrInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+
+ bool Changed = false;
+
+ // Visit blocks in dominator tree pre-order. The pre-order enables multiple
+ // cmp-conversions from the same head block.
+ // Note that updateDomTree() modifies the children of the DomTree node
+ // currently being visited. The df_iterator supports that; it doesn't look at
+ // child_begin() / child_end() until after a node has been visited.
+ for (MachineDomTreeNode *I : depth_first(DomTree)) {
+ MachineBasicBlock *HBB = I->getBlock();
+
+ SmallVector<MachineOperand, 4> HeadCond;
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ if (TII->AnalyzeBranch(*HBB, TBB, FBB, HeadCond)) {
+ continue;
+ }
+
+    // The TBB == HBB check skips single-block loops.
+ if (!TBB || TBB == HBB) {
+ continue;
+ }
+
+ SmallVector<MachineOperand, 4> TrueCond;
+ MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr;
+ if (TII->AnalyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) {
+ continue;
+ }
+
+ MachineInstr *HeadCmpMI = findSuitableCompare(HBB);
+ if (!HeadCmpMI) {
+ continue;
+ }
+
+ MachineInstr *TrueCmpMI = findSuitableCompare(TBB);
+ if (!TrueCmpMI) {
+ continue;
+ }
+
+ AArch64CC::CondCode HeadCmp;
+ if (HeadCond.empty() || !parseCond(HeadCond, HeadCmp)) {
+ continue;
+ }
+
+ AArch64CC::CondCode TrueCmp;
+ if (TrueCond.empty() || !parseCond(TrueCond, TrueCmp)) {
+ continue;
+ }
+
+ const int HeadImm = (int)HeadCmpMI->getOperand(2).getImm();
+ const int TrueImm = (int)TrueCmpMI->getOperand(2).getImm();
+
+ DEBUG(dbgs() << "Head branch:\n");
+ DEBUG(dbgs() << "\tcondition: "
+ << AArch64CC::getCondCodeName(HeadCmp) << '\n');
+ DEBUG(dbgs() << "\timmediate: " << HeadImm << '\n');
+
+ DEBUG(dbgs() << "True branch:\n");
+ DEBUG(dbgs() << "\tcondition: "
+ << AArch64CC::getCondCodeName(TrueCmp) << '\n');
+ DEBUG(dbgs() << "\timmediate: " << TrueImm << '\n');
+
+ if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::LT) ||
+ (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::GT)) &&
+ std::abs(TrueImm - HeadImm) == 2) {
+ // This branch transforms machine instructions that correspond to
+ //
+ // 1) (a > {TrueImm} && ...) || (a < {HeadImm} && ...)
+ // 2) (a < {TrueImm} && ...) || (a > {HeadImm} && ...)
+ //
+ // into
+ //
+ // 1) (a >= {NewImm} && ...) || (a <= {NewImm} && ...)
+ // 2) (a <= {NewImm} && ...) || (a >= {NewImm} && ...)
+
+ CmpInfo HeadCmpInfo = adjustCmp(HeadCmpMI, HeadCmp);
+ CmpInfo TrueCmpInfo = adjustCmp(TrueCmpMI, TrueCmp);
+ if (std::get<0>(HeadCmpInfo) == std::get<0>(TrueCmpInfo) &&
+ std::get<1>(HeadCmpInfo) == std::get<1>(TrueCmpInfo)) {
+ modifyCmp(HeadCmpMI, HeadCmpInfo);
+ modifyCmp(TrueCmpMI, TrueCmpInfo);
+ Changed = true;
+ }
+ } else if (((HeadCmp == AArch64CC::GT && TrueCmp == AArch64CC::GT) ||
+ (HeadCmp == AArch64CC::LT && TrueCmp == AArch64CC::LT)) &&
+ std::abs(TrueImm - HeadImm) == 1) {
+ // This branch transforms machine instructions that correspond to
+ //
+ // 1) (a > {TrueImm} && ...) || (a > {HeadImm} && ...)
+ // 2) (a < {TrueImm} && ...) || (a < {HeadImm} && ...)
+ //
+ // into
+ //
+ // 1) (a <= {NewImm} && ...) || (a > {NewImm} && ...)
+ // 2) (a < {NewImm} && ...) || (a >= {NewImm} && ...)
+
+      // The GT -> GE transformation increases the immediate value, so pick
+      // the smaller one; LT -> LE decreases it, so invert the choice.
+ bool adjustHeadCond = (HeadImm < TrueImm);
+ if (HeadCmp == AArch64CC::LT) {
+ adjustHeadCond = !adjustHeadCond;
+ }
+
+ if (adjustHeadCond) {
+ Changed |= adjustTo(HeadCmpMI, HeadCmp, TrueCmpMI, TrueImm);
+ } else {
+ Changed |= adjustTo(TrueCmpMI, TrueCmp, HeadCmpMI, HeadImm);
+ }
+ }
+  // Other transformation cases almost never occur, because codegen emits
+  // < or > comparisons rather than <= and >=.
+ }
+
+ return Changed;
+}
diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 452cdec..54f53dc 100644
--- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -191,8 +191,8 @@ public:
/// runOnMachineFunction - Initialize per-function data structures.
void runOnMachineFunction(MachineFunction &MF) {
this->MF = &MF;
- TII = MF.getTarget().getInstrInfo();
- TRI = MF.getTarget().getRegisterInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
}
@@ -723,7 +723,7 @@ namespace {
class AArch64ConditionalCompares : public MachineFunctionPass {
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
- const MCSchedModel *SchedModel;
+ MCSchedModel SchedModel;
  // Whether the function being processed has the Oz attribute.
bool MinSize;
MachineRegisterInfo *MRI;
@@ -845,7 +845,7 @@ bool AArch64ConditionalCompares::shouldConvert() {
// the cost of a misprediction.
//
// Set a limit on the delay we will accept.
- unsigned DelayLimit = SchedModel->MispredictPenalty * 3 / 4;
+ unsigned DelayLimit = SchedModel.MispredictPenalty * 3 / 4;
// Instruction depths can be computed for all trace instructions above CmpBB.
unsigned HeadDepth =
@@ -891,8 +891,8 @@ bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
<< "********** Function: " << MF.getName() << '\n');
- TII = MF.getTarget().getInstrInfo();
- TRI = MF.getTarget().getRegisterInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
SchedModel =
MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
MRI = &MF.getRegInfo();
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index a2d853c..74fc167 100644
--- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -14,11 +14,12 @@
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
#define DEBUG_TYPE "aarch64-dead-defs"
@@ -36,11 +37,11 @@ public:
static char ID; // Pass identification, replacement for typeid.
explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {}
- virtual bool runOnMachineFunction(MachineFunction &F) override;
+ bool runOnMachineFunction(MachineFunction &F) override;
const char *getPassName() const override { return "Dead register definitions"; }
- virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -119,7 +120,7 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock(
// Scan the function for instructions that have a dead definition of a
// register. Replace that register with the zero register when possible.
bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
- TRI = MF.getTarget().getRegisterInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
bool Changed = false;
DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 8839085..c850680 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -16,6 +16,7 @@
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/MathExtras.h"
@@ -722,7 +723,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
}
bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
- TII = static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+ TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
bool Modified = false;
for (auto &MBB : MF)
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 2164d77..419fbc8 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -14,9 +14,11 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
-#include "AArch64TargetMachine.h"
+#include "AArch64CallingConvention.h"
#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -39,8 +41,7 @@ using namespace llvm;
namespace {
-class AArch64FastISel : public FastISel {
-
+class AArch64FastISel final : public FastISel {
class Address {
public:
typedef enum {
@@ -50,16 +51,23 @@ class AArch64FastISel : public FastISel {
private:
BaseKind Kind;
+ AArch64_AM::ShiftExtendType ExtType;
union {
unsigned Reg;
int FI;
} Base;
+ unsigned OffsetReg;
+ unsigned Shift;
int64_t Offset;
+ const GlobalValue *GV;
public:
- Address() : Kind(RegBase), Offset(0) { Base.Reg = 0; }
+ Address() : Kind(RegBase), ExtType(AArch64_AM::InvalidShiftExtend),
+ OffsetReg(0), Shift(0), Offset(0), GV(nullptr) { Base.Reg = 0; }
void setKind(BaseKind K) { Kind = K; }
BaseKind getKind() const { return Kind; }
+ void setExtendType(AArch64_AM::ShiftExtendType E) { ExtType = E; }
+ AArch64_AM::ShiftExtendType getExtendType() const { return ExtType; }
bool isRegBase() const { return Kind == RegBase; }
bool isFIBase() const { return Kind == FrameIndexBase; }
void setReg(unsigned Reg) {
@@ -70,6 +78,12 @@ class AArch64FastISel : public FastISel {
assert(isRegBase() && "Invalid base register access!");
return Base.Reg;
}
+ void setOffsetReg(unsigned Reg) {
+ OffsetReg = Reg;
+ }
+ unsigned getOffsetReg() const {
+ return OffsetReg;
+ }
void setFI(unsigned FI) {
assert(isFIBase() && "Invalid base frame index access!");
Base.FI = FI;
@@ -80,8 +94,11 @@ class AArch64FastISel : public FastISel {
}
void setOffset(int64_t O) { Offset = O; }
int64_t getOffset() { return Offset; }
+ void setShift(unsigned S) { Shift = S; }
+ unsigned getShift() { return Shift; }
- bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); }
+ void setGlobalValue(const GlobalValue *G) { GV = G; }
+ const GlobalValue *getGlobalValue() { return GV; }
};
/// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
@@ -89,74 +106,152 @@ class AArch64FastISel : public FastISel {
const AArch64Subtarget *Subtarget;
LLVMContext *Context;
+ bool fastLowerArguments() override;
+ bool fastLowerCall(CallLoweringInfo &CLI) override;
+ bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
+
private:
// Selection routines.
- bool SelectLoad(const Instruction *I);
- bool SelectStore(const Instruction *I);
- bool SelectBranch(const Instruction *I);
- bool SelectIndirectBr(const Instruction *I);
- bool SelectCmp(const Instruction *I);
- bool SelectSelect(const Instruction *I);
- bool SelectFPExt(const Instruction *I);
- bool SelectFPTrunc(const Instruction *I);
- bool SelectFPToInt(const Instruction *I, bool Signed);
- bool SelectIntToFP(const Instruction *I, bool Signed);
- bool SelectRem(const Instruction *I, unsigned ISDOpcode);
- bool SelectCall(const Instruction *I, const char *IntrMemName);
- bool SelectIntrinsicCall(const IntrinsicInst &I);
- bool SelectRet(const Instruction *I);
- bool SelectTrunc(const Instruction *I);
- bool SelectIntExt(const Instruction *I);
- bool SelectMul(const Instruction *I);
+ bool selectAddSub(const Instruction *I);
+ bool selectLogicalOp(const Instruction *I);
+ bool selectLoad(const Instruction *I);
+ bool selectStore(const Instruction *I);
+ bool selectBranch(const Instruction *I);
+ bool selectIndirectBr(const Instruction *I);
+ bool selectCmp(const Instruction *I);
+ bool selectSelect(const Instruction *I);
+ bool selectFPExt(const Instruction *I);
+ bool selectFPTrunc(const Instruction *I);
+ bool selectFPToInt(const Instruction *I, bool Signed);
+ bool selectIntToFP(const Instruction *I, bool Signed);
+ bool selectRem(const Instruction *I, unsigned ISDOpcode);
+ bool selectRet(const Instruction *I);
+ bool selectTrunc(const Instruction *I);
+ bool selectIntExt(const Instruction *I);
+ bool selectMul(const Instruction *I);
+ bool selectShift(const Instruction *I);
+ bool selectBitCast(const Instruction *I);
+ bool selectFRem(const Instruction *I);
+ bool selectSDiv(const Instruction *I);
+ bool selectGetElementPtr(const Instruction *I);
// Utility helper routines.
bool isTypeLegal(Type *Ty, MVT &VT);
- bool isLoadStoreTypeLegal(Type *Ty, MVT &VT);
- bool ComputeAddress(const Value *Obj, Address &Addr);
- bool SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor,
- bool UseUnscaled);
- void AddLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB,
- unsigned Flags, bool UseUnscaled);
- bool IsMemCpySmall(uint64_t Len, unsigned Alignment);
- bool TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
+ bool isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed = false);
+ bool isValueAvailable(const Value *V) const;
+ bool computeAddress(const Value *Obj, Address &Addr, Type *Ty = nullptr);
+ bool computeCallAddress(const Value *V, Address &Addr);
+ bool simplifyAddress(Address &Addr, MVT VT);
+ void addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB,
+ unsigned Flags, unsigned ScaleFactor,
+ MachineMemOperand *MMO);
+ bool isMemCpySmall(uint64_t Len, unsigned Alignment);
+ bool tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
unsigned Alignment);
- // Emit functions.
- bool EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt);
- bool EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
- bool UseUnscaled = false);
- bool EmitStore(MVT VT, unsigned SrcReg, Address Addr,
- bool UseUnscaled = false);
- unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
- unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt);
+ bool foldXALUIntrinsic(AArch64CC::CondCode &CC, const Instruction *I,
+ const Value *Cond);
+ bool optimizeIntExtLoad(const Instruction *I, MVT RetVT, MVT SrcVT);
+ bool optimizeSelect(const SelectInst *SI);
+ std::pair<unsigned, bool> getRegForGEPIndex(const Value *Idx);
+
+ // Emit helper routines.
+ unsigned emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
+ const Value *RHS, bool SetFlags = false,
+ bool WantResult = true, bool IsZExt = false);
+ unsigned emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+ bool SetFlags = false, bool WantResult = true);
+ unsigned emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, uint64_t Imm, bool SetFlags = false,
+ bool WantResult = true);
+ unsigned emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+ AArch64_AM::ShiftExtendType ShiftType,
+ uint64_t ShiftImm, bool SetFlags = false,
+ bool WantResult = true);
+ unsigned emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+ AArch64_AM::ShiftExtendType ExtType,
+ uint64_t ShiftImm, bool SetFlags = false,
+ bool WantResult = true);
- unsigned AArch64MaterializeFP(const ConstantFP *CFP, MVT VT);
- unsigned AArch64MaterializeGV(const GlobalValue *GV);
+ // Emit functions.
+ bool emitCompareAndBranch(const BranchInst *BI);
+ bool emitCmp(const Value *LHS, const Value *RHS, bool IsZExt);
+ bool emitICmp(MVT RetVT, const Value *LHS, const Value *RHS, bool IsZExt);
+ bool emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm);
+ bool emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS);
+ unsigned emitLoad(MVT VT, MVT ResultVT, Address Addr, bool WantZExt = true,
+ MachineMemOperand *MMO = nullptr);
+ bool emitStore(MVT VT, unsigned SrcReg, Address Addr,
+ MachineMemOperand *MMO = nullptr);
+ unsigned emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
+ unsigned emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt);
+ unsigned emitAdd(MVT RetVT, const Value *LHS, const Value *RHS,
+ bool SetFlags = false, bool WantResult = true,
+ bool IsZExt = false);
+ unsigned emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill, int64_t Imm);
+ unsigned emitSub(MVT RetVT, const Value *LHS, const Value *RHS,
+ bool SetFlags = false, bool WantResult = true,
+ bool IsZExt = false);
+ unsigned emitSubs_rr(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+ unsigned RHSReg, bool RHSIsKill, bool WantResult = true);
+ unsigned emitSubs_rs(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+ unsigned RHSReg, bool RHSIsKill,
+ AArch64_AM::ShiftExtendType ShiftType, uint64_t ShiftImm,
+ bool WantResult = true);
+ unsigned emitLogicalOp(unsigned ISDOpc, MVT RetVT, const Value *LHS,
+ const Value *RHS);
+ unsigned emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, uint64_t Imm);
+ unsigned emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg, bool RHSIsKill,
+ uint64_t ShiftImm);
+ unsigned emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm);
+ unsigned emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill);
+ unsigned emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill);
+ unsigned emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill);
+ unsigned emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill);
+ unsigned emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+ uint64_t Imm, bool IsZExt = true);
+ unsigned emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill);
+ unsigned emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+ uint64_t Imm, bool IsZExt = true);
+ unsigned emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill);
+ unsigned emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0Reg, bool Op0IsKill,
+ uint64_t Imm, bool IsZExt = false);
+
+ unsigned materializeInt(const ConstantInt *CI, MVT VT);
+ unsigned materializeFP(const ConstantFP *CFP, MVT VT);
+ unsigned materializeGV(const GlobalValue *GV);
// Call handling routines.
private:
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC) const;
- bool ProcessCallArgs(SmallVectorImpl<Value *> &Args,
- SmallVectorImpl<unsigned> &ArgRegs,
- SmallVectorImpl<MVT> &ArgVTs,
- SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
- SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC,
+ bool processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &ArgVTs,
unsigned &NumBytes);
- bool FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
- const Instruction *I, CallingConv::ID CC, unsigned &NumBytes);
+ bool finishCall(CallLoweringInfo &CLI, MVT RetVT, unsigned NumBytes);
public:
// Backend specific FastISel code.
- unsigned TargetMaterializeAlloca(const AllocaInst *AI) override;
- unsigned TargetMaterializeConstant(const Constant *C) override;
+ unsigned fastMaterializeAlloca(const AllocaInst *AI) override;
+ unsigned fastMaterializeConstant(const Constant *C) override;
+ unsigned fastMaterializeFloatZero(const ConstantFP* CF) override;
- explicit AArch64FastISel(FunctionLoweringInfo &funcInfo,
- const TargetLibraryInfo *libInfo)
- : FastISel(funcInfo, libInfo) {
+ explicit AArch64FastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo)
+ : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) {
Subtarget = &TM.getSubtarget<AArch64Subtarget>();
- Context = &funcInfo.Fn->getContext();
+ Context = &FuncInfo.Fn->getContext();
}
- bool TargetSelectInstruction(const Instruction *I) override;
+ bool fastSelectInstruction(const Instruction *I) override;
#include "AArch64GenFastISel.inc"
};
@@ -165,13 +260,52 @@ public:
#include "AArch64GenCallingConv.inc"
+/// \brief Check if the sign-/zero-extend will be a noop.
+static bool isIntExtFree(const Instruction *I) {
+ assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+ "Unexpected integer extend instruction.");
+ assert(!I->getType()->isVectorTy() && I->getType()->isIntegerTy() &&
+ "Unexpected value type.");
+ bool IsZExt = isa<ZExtInst>(I);
+
+ if (const auto *LI = dyn_cast<LoadInst>(I->getOperand(0)))
+ if (LI->hasOneUse())
+ return true;
+
+ if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0)))
+ if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr()))
+ return true;
+
+ return false;
+}
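+
+// For example, "zext i32 %x to i64" is free under this test when %x is a
+// single-use load (the extend can be folded into the load) or an argument
+// that already carries the zeroext attribute.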
+
+/// \brief Determine the implicit scale factor that is applied by a memory
+/// operation for a given value type.
+static unsigned getImplicitScaleFactor(MVT VT) {
+ switch (VT.SimpleTy) {
+ default:
+ return 0; // invalid
+ case MVT::i1: // fall-through
+ case MVT::i8:
+ return 1;
+ case MVT::i16:
+ return 2;
+ case MVT::i32: // fall-through
+ case MVT::f32:
+ return 4;
+ case MVT::i64: // fall-through
+ case MVT::f64:
+ return 8;
+ }
+}
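+
+// For example, MVT::i32 yields 4: the scaled form LDRWui encodes its
+// immediate in 4-byte units, so an encoded offset of 3 addresses byte 12.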
+
CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
if (CC == CallingConv::WebKit_JS)
return CC_AArch64_WebKit_JS;
return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS;
}
-unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+unsigned AArch64FastISel::fastMaterializeAlloca(const AllocaInst *AI) {
assert(TLI.getValueType(AI->getType(), true) == MVT::i64 &&
"Alloca should always return a pointer.");
@@ -183,7 +317,7 @@ unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end()) {
- unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+ unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
ResultReg)
.addFrameIndex(SI->second)
@@ -195,28 +329,59 @@ unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
return 0;
}
-unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) {
+unsigned AArch64FastISel::materializeInt(const ConstantInt *CI, MVT VT) {
+ if (VT > MVT::i64)
+ return 0;
+
+ if (!CI->isZero())
+ return fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
+
+ // Create a copy from the zero register to materialize a "0" value.
+ const TargetRegisterClass *RC = (VT == MVT::i64) ? &AArch64::GPR64RegClass
+ : &AArch64::GPR32RegClass;
+ unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
+ ResultReg).addReg(ZeroReg, getKillRegState(true));
+ return ResultReg;
+}
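+
+// For example, materializing "i32 0" costs a single COPY from WZR, while any
+// non-zero constant goes through the generic fastEmit_i path.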
+
+unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
+ // Positive zero (+0.0) has to be materialized with a fmov from the zero
+ // register, because the immediate version of fmov cannot encode zero.
+ if (CFP->isNullValue())
+ return fastMaterializeFloatZero(CFP);
+
if (VT != MVT::f32 && VT != MVT::f64)
return 0;
const APFloat Val = CFP->getValueAPF();
- bool is64bit = (VT == MVT::f64);
-
+ bool Is64Bit = (VT == MVT::f64);
// This checks to see if we can use FMOV instructions to materialize
// a constant, otherwise we have to materialize via the constant pool.
if (TLI.isFPImmLegal(Val, VT)) {
- int Imm;
- unsigned Opc;
- if (is64bit) {
- Imm = AArch64_AM::getFP64Imm(Val);
- Opc = AArch64::FMOVDi;
- } else {
- Imm = AArch64_AM::getFP32Imm(Val);
- Opc = AArch64::FMOVSi;
- }
+ int Imm =
+ Is64Bit ? AArch64_AM::getFP64Imm(Val) : AArch64_AM::getFP32Imm(Val);
+ assert((Imm != -1) && "Cannot encode floating-point constant.");
+ unsigned Opc = Is64Bit ? AArch64::FMOVDi : AArch64::FMOVSi;
+ return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
+ }
+
+ // For the MachO large code model materialize the FP constant in code.
+ if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+ unsigned Opc1 = Is64Bit ? AArch64::MOVi64imm : AArch64::MOVi32imm;
+ const TargetRegisterClass *RC = Is64Bit ?
+ &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ unsigned TmpReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc1), TmpReg)
+ .addImm(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addImm(Imm);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(TmpReg, getKillRegState(true));
+
return ResultReg;
}
@@ -226,20 +391,20 @@ unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) {
if (Align == 0)
Align = DL.getTypeAllocSize(CFP->getType());
- unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+ unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
- ADRPReg).addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGE);
+ ADRPReg).addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE);
- unsigned Opc = is64bit ? AArch64::LDRDui : AArch64::LDRSui;
+ unsigned Opc = Is64Bit ? AArch64::LDRDui : AArch64::LDRSui;
unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(ADRPReg)
- .addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ .addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
return ResultReg;
}
-unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) {
+unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
// We can't handle thread-local variables quickly yet.
if (GV->isThreadLocal())
return 0;
@@ -262,30 +427,34 @@ unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) {
// ADRP + LDRX
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
ADRPReg)
- .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+ .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE);
ResultReg = createResultReg(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
ResultReg)
- .addReg(ADRPReg)
- .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
- AArch64II::MO_NC);
+ .addReg(ADRPReg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC);
+ } else if (OpFlags & AArch64II::MO_CONSTPOOL) {
+ // We can't handle addresses loaded from a constant pool quickly yet.
+ return 0;
} else {
// ADRP + ADDX
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
- ADRPReg).addGlobalAddress(GV, 0, AArch64II::MO_PAGE);
+ ADRPReg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_PAGE);
ResultReg = createResultReg(&AArch64::GPR64spRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
ResultReg)
- .addReg(ADRPReg)
- .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
- .addImm(0);
+ .addReg(ADRPReg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
+ .addImm(0);
}
return ResultReg;
}
-unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) {
+unsigned AArch64FastISel::fastMaterializeConstant(const Constant *C) {
EVT CEVT = TLI.getValueType(C->getType(), true);
// Only handle simple types.
@@ -293,17 +462,48 @@ unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) {
return 0;
MVT VT = CEVT.getSimpleVT();
- // FIXME: Handle ConstantInt.
- if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
- return AArch64MaterializeFP(CFP, VT);
+ if (const auto *CI = dyn_cast<ConstantInt>(C))
+ return materializeInt(CI, VT);
+ else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+ return materializeFP(CFP, VT);
else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
- return AArch64MaterializeGV(GV);
+ return materializeGV(GV);
return 0;
}
+unsigned AArch64FastISel::fastMaterializeFloatZero(const ConstantFP* CFP) {
+ assert(CFP->isNullValue() &&
+ "Floating-point constant is not a positive zero.");
+ MVT VT;
+ if (!isTypeLegal(CFP->getType(), VT))
+ return 0;
+
+ if (VT != MVT::f32 && VT != MVT::f64)
+ return 0;
+
+ bool Is64Bit = (VT == MVT::f64);
+ unsigned ZReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+ unsigned Opc = Is64Bit ? AArch64::FMOVXDr : AArch64::FMOVWSr;
+ return fastEmitInst_r(Opc, TLI.getRegClassFor(VT), ZReg, /*IsKill=*/true);
+}
+
+/// \brief Check if the multiply is by a power-of-2 constant.
+static bool isMulPowOf2(const Value *I) {
+ if (const auto *MI = dyn_cast<MulOperator>(I)) {
+ if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(0)))
+ if (C->getValue().isPowerOf2())
+ return true;
+ if (const auto *C = dyn_cast<ConstantInt>(MI->getOperand(1)))
+ if (C->getValue().isPowerOf2())
+ return true;
+ }
+ return false;
+}
+
// Computes the address to get to an object.
-bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
+bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr,
+                                     Type *Ty) {
const User *U = nullptr;
unsigned Opcode = Instruction::UserOp1;
if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
@@ -330,18 +530,18 @@ bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
break;
case Instruction::BitCast: {
// Look through bitcasts.
- return ComputeAddress(U->getOperand(0), Addr);
+ return computeAddress(U->getOperand(0), Addr, Ty);
}
case Instruction::IntToPtr: {
// Look past no-op inttoptrs.
if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
- return ComputeAddress(U->getOperand(0), Addr);
+ return computeAddress(U->getOperand(0), Addr, Ty);
break;
}
case Instruction::PtrToInt: {
// Look past no-op ptrtoints.
if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
- return ComputeAddress(U->getOperand(0), Addr);
+ return computeAddress(U->getOperand(0), Addr, Ty);
break;
}
case Instruction::GetElementPtr: {
@@ -383,7 +583,7 @@ bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
// Try to grab the base operand now.
Addr.setOffset(TmpOffset);
- if (ComputeAddress(U->getOperand(0), Addr))
+ if (computeAddress(U->getOperand(0), Addr, Ty))
return true;
// We failed, restore everything and try the other options.
@@ -403,14 +603,301 @@ bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
}
break;
}
+ case Instruction::Add: {
+ // Adds of constants are common and easy enough.
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (isa<ConstantInt>(LHS))
+ std::swap(LHS, RHS);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ Addr.setOffset(Addr.getOffset() + CI->getSExtValue());
+ return computeAddress(LHS, Addr, Ty);
+ }
+
+ Address Backup = Addr;
+ if (computeAddress(LHS, Addr, Ty) && computeAddress(RHS, Addr, Ty))
+ return true;
+ Addr = Backup;
+
+ break;
+ }
+ case Instruction::Sub: {
+ // Subs of constants are common and easy enough.
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ Addr.setOffset(Addr.getOffset() - CI->getSExtValue());
+ return computeAddress(LHS, Addr, Ty);
+ }
+ break;
+ }
+ case Instruction::Shl: {
+ if (Addr.getOffsetReg())
+ break;
+
+ const auto *CI = dyn_cast<ConstantInt>(U->getOperand(1));
+ if (!CI)
+ break;
+
+ unsigned Val = CI->getZExtValue();
+ if (Val < 1 || Val > 3)
+ break;
+
+ uint64_t NumBytes = 0;
+ if (Ty && Ty->isSized()) {
+ uint64_t NumBits = DL.getTypeSizeInBits(Ty);
+ NumBytes = NumBits / 8;
+ if (!isPowerOf2_64(NumBits))
+ NumBytes = 0;
+ }
+
+ if (NumBytes != (1ULL << Val))
+ break;
+
+ Addr.setShift(Val);
+ Addr.setExtendType(AArch64_AM::LSL);
+
+ const Value *Src = U->getOperand(0);
+ if (const auto *I = dyn_cast<Instruction>(Src))
+ if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB)
+ Src = I;
+
+ // Fold the zext or sext when it won't become a noop.
+ if (const auto *ZE = dyn_cast<ZExtInst>(Src)) {
+ if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::UXTW);
+ Src = ZE->getOperand(0);
+ }
+ } else if (const auto *SE = dyn_cast<SExtInst>(Src)) {
+ if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::SXTW);
+ Src = SE->getOperand(0);
+ }
+ }
+
+ if (const auto *AI = dyn_cast<BinaryOperator>(Src))
+ if (AI->getOpcode() == Instruction::And) {
+ const Value *LHS = AI->getOperand(0);
+ const Value *RHS = AI->getOperand(1);
+
+ if (const auto *C = dyn_cast<ConstantInt>(LHS))
+ if (C->getValue() == 0xffffffff)
+ std::swap(LHS, RHS);
+
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 0xffffffff) {
+ Addr.setExtendType(AArch64_AM::UXTW);
+ unsigned Reg = getRegForValue(LHS);
+ if (!Reg)
+ return false;
+ bool RegIsKill = hasTrivialKill(LHS);
+ Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill,
+ AArch64::sub_32);
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+ }
+
+ unsigned Reg = getRegForValue(Src);
+ if (!Reg)
+ return false;
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
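+  // For the Shl case above: with Ty = i64, "shl %idx, 3" folds into the
+  // register-offset form [Xbase, Xidx, LSL #3] instead of a separate shift.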
+ case Instruction::Mul: {
+ if (Addr.getOffsetReg())
+ break;
+
+ if (!isMulPowOf2(U))
+ break;
+
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ // Canonicalize power-of-2 value to the RHS.
+ if (const auto *C = dyn_cast<ConstantInt>(LHS))
+ if (C->getValue().isPowerOf2())
+ std::swap(LHS, RHS);
+
+    assert(isa<ConstantInt>(RHS) && "Expected a ConstantInt.");
+ const auto *C = cast<ConstantInt>(RHS);
+ unsigned Val = C->getValue().logBase2();
+ if (Val < 1 || Val > 3)
+ break;
+
+ uint64_t NumBytes = 0;
+ if (Ty && Ty->isSized()) {
+ uint64_t NumBits = DL.getTypeSizeInBits(Ty);
+ NumBytes = NumBits / 8;
+ if (!isPowerOf2_64(NumBits))
+ NumBytes = 0;
+ }
+
+ if (NumBytes != (1ULL << Val))
+ break;
+
+ Addr.setShift(Val);
+ Addr.setExtendType(AArch64_AM::LSL);
+
+ const Value *Src = LHS;
+ if (const auto *I = dyn_cast<Instruction>(Src))
+ if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB)
+ Src = I;
+
+ // Fold the zext or sext when it won't become a noop.
+ if (const auto *ZE = dyn_cast<ZExtInst>(Src)) {
+ if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::UXTW);
+ Src = ZE->getOperand(0);
+ }
+ } else if (const auto *SE = dyn_cast<SExtInst>(Src)) {
+ if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::SXTW);
+ Src = SE->getOperand(0);
+ }
+ }
+
+ unsigned Reg = getRegForValue(Src);
+ if (!Reg)
+ return false;
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+ case Instruction::And: {
+ if (Addr.getOffsetReg())
+ break;
+
+ if (!Ty || DL.getTypeSizeInBits(Ty) != 8)
+ break;
+
+ const Value *LHS = U->getOperand(0);
+ const Value *RHS = U->getOperand(1);
+
+ if (const auto *C = dyn_cast<ConstantInt>(LHS))
+ if (C->getValue() == 0xffffffff)
+ std::swap(LHS, RHS);
+
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 0xffffffff) {
+ Addr.setShift(0);
+ Addr.setExtendType(AArch64_AM::LSL);
+ Addr.setExtendType(AArch64_AM::UXTW);
+
+ unsigned Reg = getRegForValue(LHS);
+ if (!Reg)
+ return false;
+ bool RegIsKill = hasTrivialKill(LHS);
+ Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, RegIsKill,
+ AArch64::sub_32);
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+ break;
+ }
+ case Instruction::SExt:
+ case Instruction::ZExt: {
+ if (!Addr.getReg() || Addr.getOffsetReg())
+ break;
+
+ const Value *Src = nullptr;
+ // Fold the zext or sext when it won't become a noop.
+ if (const auto *ZE = dyn_cast<ZExtInst>(U)) {
+ if (!isIntExtFree(ZE) && ZE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::UXTW);
+ Src = ZE->getOperand(0);
+ }
+ } else if (const auto *SE = dyn_cast<SExtInst>(U)) {
+ if (!isIntExtFree(SE) && SE->getOperand(0)->getType()->isIntegerTy(32)) {
+ Addr.setExtendType(AArch64_AM::SXTW);
+ Src = SE->getOperand(0);
+ }
+ }
+
+ if (!Src)
+ break;
+
+ Addr.setShift(0);
+ unsigned Reg = getRegForValue(Src);
+ if (!Reg)
+ return false;
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+ } // end switch
+
+ if (Addr.isRegBase() && !Addr.getReg()) {
+ unsigned Reg = getRegForValue(Obj);
+ if (!Reg)
+ return false;
+ Addr.setReg(Reg);
+ return true;
+ }
+
+ if (!Addr.getOffsetReg()) {
+ unsigned Reg = getRegForValue(Obj);
+ if (!Reg)
+ return false;
+ Addr.setOffsetReg(Reg);
+ return true;
+ }
+
+ return false;
+}
+
+bool AArch64FastISel::computeCallAddress(const Value *V, Address &Addr) {
+ const User *U = nullptr;
+ unsigned Opcode = Instruction::UserOp1;
+ bool InMBB = true;
+
+ if (const auto *I = dyn_cast<Instruction>(V)) {
+ Opcode = I->getOpcode();
+ U = I;
+ InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
+ } else if (const auto *C = dyn_cast<ConstantExpr>(V)) {
+ Opcode = C->getOpcode();
+ U = C;
+ }
+
+ switch (Opcode) {
+ default: break;
+ case Instruction::BitCast:
+    // Look past the bitcast if its operand is in the same BB.
+ if (InMBB)
+ return computeCallAddress(U->getOperand(0), Addr);
+ break;
+ case Instruction::IntToPtr:
+    // Look past a no-op inttoptr if its operand is in the same BB.
+ if (InMBB &&
+ TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+ return computeCallAddress(U->getOperand(0), Addr);
+ break;
+ case Instruction::PtrToInt:
+    // Look past a no-op ptrtoint if its operand is in the same BB.
+ if (InMBB &&
+ TLI.getValueType(U->getType()) == TLI.getPointerTy())
+ return computeCallAddress(U->getOperand(0), Addr);
+ break;
+ }
+
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+ Addr.setGlobalValue(GV);
+ return true;
+ }
+
+ // If all else fails, try to materialize the value in a register.
+ if (!Addr.getGlobalValue()) {
+ Addr.setReg(getRegForValue(V));
+ return Addr.getReg() != 0;
}
- // Try to get this in a register if nothing else has worked.
- if (!Addr.isValid())
- Addr.setReg(getRegForValue(Obj));
- return Addr.isValid();
+ return false;
}
+
bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
EVT evt = TLI.getValueType(Ty, true);
@@ -428,62 +915,122 @@ bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
return TLI.isTypeLegal(VT);
}
-bool AArch64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) {
+/// \brief Determine if the value type is supported by FastISel.
+///
+/// FastISel for AArch64 can handle more value types than are legal. This adds
+/// simple value types such as i1, i8, and i16.
+bool AArch64FastISel::isTypeSupported(Type *Ty, MVT &VT, bool IsVectorAllowed) {
+ if (Ty->isVectorTy() && !IsVectorAllowed)
+ return false;
+
if (isTypeLegal(Ty, VT))
return true;
  // If this is a type that can be sign- or zero-extended to a basic operation,
- // go ahead and accept it now. For stores, this reflects truncation.
+ // go ahead and accept it now.
if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
return true;
return false;
}
-bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT,
- int64_t ScaleFactor, bool UseUnscaled) {
- bool needsLowering = false;
- int64_t Offset = Addr.getOffset();
- switch (VT.SimpleTy) {
- default:
+bool AArch64FastISel::isValueAvailable(const Value *V) const {
+ if (!isa<Instruction>(V))
+ return true;
+
+ const auto *I = cast<Instruction>(V);
+ if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB)
+ return true;
+
+ return false;
+}
+
+bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
+ unsigned ScaleFactor = getImplicitScaleFactor(VT);
+ if (!ScaleFactor)
return false;
- case MVT::i1:
- case MVT::i8:
- case MVT::i16:
- case MVT::i32:
- case MVT::i64:
- case MVT::f32:
- case MVT::f64:
- if (!UseUnscaled)
- // Using scaled, 12-bit, unsigned immediate offsets.
- needsLowering = ((Offset & 0xfff) != Offset);
- else
- // Using unscaled, 9-bit, signed immediate offsets.
- needsLowering = (Offset > 256 || Offset < -256);
- break;
- }
- //If this is a stack pointer and the offset needs to be simplified then put
+ bool ImmediateOffsetNeedsLowering = false;
+ bool RegisterOffsetNeedsLowering = false;
+ int64_t Offset = Addr.getOffset();
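+ // Loads and stores encode either a scaled, unsigned 12-bit immediate or an
+ // unscaled, signed 9-bit immediate. E.g. for an i32 access (ScaleFactor 4),
+ // offset 8 encodes as the scaled immediate #2, offset -4 only fits the
+ // unscaled form, and offset 16388 fits neither and must be lowered.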
+ if (((Offset < 0) || (Offset & (ScaleFactor - 1))) && !isInt<9>(Offset))
+ ImmediateOffsetNeedsLowering = true;
+ else if (Offset > 0 && !(Offset & (ScaleFactor - 1)) &&
+ !isUInt<12>(Offset / ScaleFactor))
+ ImmediateOffsetNeedsLowering = true;
+
+ // Cannot encode an offset register and an immediate offset in the same
+ // instruction. Fold the immediate offset into the load/store instruction and
+ // emit an additional add to take care of the offset register.
+ if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg())
+ RegisterOffsetNeedsLowering = true;
+
+ // Cannot encode zero register as base.
+ if (Addr.isRegBase() && Addr.getOffsetReg() && !Addr.getReg())
+ RegisterOffsetNeedsLowering = true;
+
+ // If this is a stack pointer and the offset needs to be simplified then put
// the alloca address into a register, set the base type back to register and
// continue. This should almost never happen.
- if (needsLowering && Addr.getKind() == Address::FrameIndexBase) {
- unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+ if ((ImmediateOffsetNeedsLowering || Addr.getOffsetReg()) && Addr.isFIBase()) {
+ unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
ResultReg)
- .addFrameIndex(Addr.getFI())
- .addImm(0)
- .addImm(0);
+ .addFrameIndex(Addr.getFI())
+ .addImm(0)
+ .addImm(0);
Addr.setKind(Address::RegBase);
Addr.setReg(ResultReg);
}
+ if (RegisterOffsetNeedsLowering) {
+ unsigned ResultReg = 0;
+ if (Addr.getReg()) {
+ if (Addr.getExtendType() == AArch64_AM::SXTW ||
+ Addr.getExtendType() == AArch64_AM::UXTW)
+ ResultReg = emitAddSub_rx(/*UseAdd=*/true, MVT::i64, Addr.getReg(),
+ /*TODO:IsKill=*/false, Addr.getOffsetReg(),
+ /*TODO:IsKill=*/false, Addr.getExtendType(),
+ Addr.getShift());
+ else
+ ResultReg = emitAddSub_rs(/*UseAdd=*/true, MVT::i64, Addr.getReg(),
+ /*TODO:IsKill=*/false, Addr.getOffsetReg(),
+ /*TODO:IsKill=*/false, AArch64_AM::LSL,
+ Addr.getShift());
+ } else {
+ if (Addr.getExtendType() == AArch64_AM::UXTW)
+ ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(),
+ /*Op0IsKill=*/false, Addr.getShift(),
+ /*IsZExt=*/true);
+ else if (Addr.getExtendType() == AArch64_AM::SXTW)
+ ResultReg = emitLSL_ri(MVT::i64, MVT::i32, Addr.getOffsetReg(),
+ /*Op0IsKill=*/false, Addr.getShift(),
+ /*IsZExt=*/false);
+ else
+ ResultReg = emitLSL_ri(MVT::i64, MVT::i64, Addr.getOffsetReg(),
+ /*Op0IsKill=*/false, Addr.getShift());
+ }
+ if (!ResultReg)
+ return false;
+
+ Addr.setReg(ResultReg);
+ Addr.setOffsetReg(0);
+ Addr.setShift(0);
+ Addr.setExtendType(AArch64_AM::InvalidShiftExtend);
+ }
+
// Since the offset is too large for the load/store instruction, get the
// reg+offset into a register.
- if (needsLowering) {
- uint64_t UnscaledOffset = Addr.getOffset() * ScaleFactor;
- unsigned ResultReg = FastEmit_ri_(MVT::i64, ISD::ADD, Addr.getReg(), false,
- UnscaledOffset, MVT::i64);
- if (ResultReg == 0)
+ if (ImmediateOffsetNeedsLowering) {
+ unsigned ResultReg;
+ if (Addr.getReg())
+ // Try to fold the immediate into the add instruction.
+ ResultReg = emitAdd_ri_(MVT::i64, Addr.getReg(), /*IsKill=*/false, Offset);
+ else
+ ResultReg = fastEmit_i(MVT::i64, MVT::i64, ISD::Constant, Offset);
+
+ if (!ResultReg)
return false;
Addr.setReg(ResultReg);
Addr.setOffset(0);
@@ -491,222 +1038,1021 @@ bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT,
return true;
}
-void AArch64FastISel::AddLoadStoreOperands(Address &Addr,
+void AArch64FastISel::addLoadStoreOperands(Address &Addr,
const MachineInstrBuilder &MIB,
- unsigned Flags, bool UseUnscaled) {
- int64_t Offset = Addr.getOffset();
+ unsigned Flags,
+ unsigned ScaleFactor,
+ MachineMemOperand *MMO) {
+ int64_t Offset = Addr.getOffset() / ScaleFactor;
// Frame base works a bit differently. Handle it separately.
- if (Addr.getKind() == Address::FrameIndexBase) {
+ if (Addr.isFIBase()) {
int FI = Addr.getFI();
// FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size
// and alignment should be based on the VT.
- MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(FI, Offset), Flags,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(FI, Offset), Flags,
+ MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
// Now add the rest of the operands.
- MIB.addFrameIndex(FI).addImm(Offset).addMemOperand(MMO);
+ MIB.addFrameIndex(FI).addImm(Offset);
} else {
- // Now add the rest of the operands.
- MIB.addReg(Addr.getReg());
- MIB.addImm(Offset);
+ assert(Addr.isRegBase() && "Unexpected address kind.");
+ const MCInstrDesc &II = MIB->getDesc();
+ unsigned Idx = (Flags & MachineMemOperand::MOStore) ? 1 : 0;
+ Addr.setReg(
+ constrainOperandRegClass(II, Addr.getReg(), II.getNumDefs()+Idx));
+ Addr.setOffsetReg(
+ constrainOperandRegClass(II, Addr.getOffsetReg(), II.getNumDefs()+Idx+1));
+ if (Addr.getOffsetReg()) {
+ assert(Addr.getOffset() == 0 && "Unexpected offset");
+ bool IsSigned = Addr.getExtendType() == AArch64_AM::SXTW ||
+ Addr.getExtendType() == AArch64_AM::SXTX;
+ MIB.addReg(Addr.getReg());
+ MIB.addReg(Addr.getOffsetReg());
+ MIB.addImm(IsSigned);
+ MIB.addImm(Addr.getShift() != 0);
+ } else
+ MIB.addReg(Addr.getReg()).addImm(Offset);
}
+
+ if (MMO)
+ MIB.addMemOperand(MMO);
}
-bool AArch64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
- bool UseUnscaled) {
- // Negative offsets require unscaled, 9-bit, signed immediate offsets.
- // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
- if (!UseUnscaled && Addr.getOffset() < 0)
- UseUnscaled = true;
+unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
+ const Value *RHS, bool SetFlags,
+ bool WantResult, bool IsZExt) {
+ AArch64_AM::ShiftExtendType ExtendType = AArch64_AM::InvalidShiftExtend;
+ bool NeedExtend = false;
+ switch (RetVT.SimpleTy) {
+ default:
+ return 0;
+ case MVT::i1:
+ NeedExtend = true;
+ break;
+ case MVT::i8:
+ NeedExtend = true;
+ ExtendType = IsZExt ? AArch64_AM::UXTB : AArch64_AM::SXTB;
+ break;
+ case MVT::i16:
+ NeedExtend = true;
+ ExtendType = IsZExt ? AArch64_AM::UXTH : AArch64_AM::SXTH;
+ break;
+ case MVT::i32: // fall-through
+ case MVT::i64:
+ break;
+ }
+ MVT SrcVT = RetVT;
+ RetVT.SimpleTy = std::max(RetVT.SimpleTy, MVT::i32);
+
+ // Canonicalize immediates to the RHS first.
+ if (UseAdd && isa<Constant>(LHS) && !isa<Constant>(RHS))
+ std::swap(LHS, RHS);
+
+ // Canonicalize mul by power of 2 to the RHS.
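+ // E.g. "add (mul x, 4), y" becomes "add y, (mul x, 4)" so the multiply can
+ // be folded into a shifted-register add below.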
+ if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS))
+ if (isMulPowOf2(LHS))
+ std::swap(LHS, RHS);
+
+ // Canonicalize shift immediate to the RHS.
+ if (UseAdd && LHS->hasOneUse() && isValueAvailable(LHS))
+ if (const auto *SI = dyn_cast<BinaryOperator>(LHS))
+ if (isa<ConstantInt>(SI->getOperand(1)))
+ if (SI->getOpcode() == Instruction::Shl ||
+ SI->getOpcode() == Instruction::LShr ||
+ SI->getOpcode() == Instruction::AShr)
+ std::swap(LHS, RHS);
+
+ unsigned LHSReg = getRegForValue(LHS);
+ if (!LHSReg)
+ return 0;
+ bool LHSIsKill = hasTrivialKill(LHS);
- unsigned Opc;
+ if (NeedExtend)
+ LHSReg = emitIntExt(SrcVT, LHSReg, RetVT, IsZExt);
+
+ unsigned ResultReg = 0;
+ if (const auto *C = dyn_cast<ConstantInt>(RHS)) {
+ uint64_t Imm = IsZExt ? C->getZExtValue() : C->getSExtValue();
+ if (C->isNegative())
+ ResultReg = emitAddSub_ri(!UseAdd, RetVT, LHSReg, LHSIsKill, -Imm,
+ SetFlags, WantResult);
+ else
+ ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, Imm, SetFlags,
+ WantResult);
+ } else if (const auto *C = dyn_cast<Constant>(RHS))
+ if (C->isNullValue())
+ ResultReg = emitAddSub_ri(UseAdd, RetVT, LHSReg, LHSIsKill, 0, SetFlags,
+ WantResult);
+
+ if (ResultReg)
+ return ResultReg;
+
+ // Only extend the RHS within the instruction if there is a valid extend type.
+ if (ExtendType != AArch64_AM::InvalidShiftExtend && RHS->hasOneUse() &&
+ isValueAvailable(RHS)) {
+ if (const auto *SI = dyn_cast<BinaryOperator>(RHS))
+ if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1)))
+ if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) {
+ unsigned RHSReg = getRegForValue(SI->getOperand(0));
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
+ return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ExtendType, C->getZExtValue(),
+ SetFlags, WantResult);
+ }
+ unsigned RHSReg = getRegForValue(RHS);
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(RHS);
+ return emitAddSub_rx(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+ ExtendType, 0, SetFlags, WantResult);
+ }
+
+ // Check if the mul can be folded into the instruction.
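+ // A multiply by a power of two is a left shift, which the shifted-register
+ // form can absorb, e.g. "add w0, w1, w2, lsl #2" computes w1 + w2 * 4.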
+ if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (isMulPowOf2(RHS)) {
+ const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
+ const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
+
+ if (const auto *C = dyn_cast<ConstantInt>(MulLHS))
+ if (C->getValue().isPowerOf2())
+ std::swap(MulLHS, MulRHS);
+
+ assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt.");
+ uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2();
+ unsigned RHSReg = getRegForValue(MulLHS);
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(MulLHS);
+ return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+ AArch64_AM::LSL, ShiftVal, SetFlags, WantResult);
+ }
+
+ // Check if the shift can be folded into the instruction.
+ if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) {
+ if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+ AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend;
+ switch (SI->getOpcode()) {
+ default: break;
+ case Instruction::Shl: ShiftType = AArch64_AM::LSL; break;
+ case Instruction::LShr: ShiftType = AArch64_AM::LSR; break;
+ case Instruction::AShr: ShiftType = AArch64_AM::ASR; break;
+ }
+ uint64_t ShiftVal = C->getZExtValue();
+ if (ShiftType != AArch64_AM::InvalidShiftExtend) {
+ unsigned RHSReg = getRegForValue(SI->getOperand(0));
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
+ return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftType, ShiftVal, SetFlags,
+ WantResult);
+ }
+ }
+ }
+
+ unsigned RHSReg = getRegForValue(RHS);
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ if (NeedExtend)
+ RHSReg = emitIntExt(SrcVT, RHSReg, RetVT, IsZExt);
+
+ return emitAddSub_rr(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+ SetFlags, WantResult);
+}
+
+unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg,
+ bool RHSIsKill, bool SetFlags,
+ bool WantResult) {
+ assert(LHSReg && RHSReg && "Invalid register number.");
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return 0;
+
+ static const unsigned OpcTable[2][2][2] = {
+ { { AArch64::SUBWrr, AArch64::SUBXrr },
+ { AArch64::ADDWrr, AArch64::ADDXrr } },
+ { { AArch64::SUBSWrr, AArch64::SUBSXrr },
+ { AArch64::ADDSWrr, AArch64::ADDSXrr } }
+ };
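+ // Indexed as OpcTable[SetFlags][UseAdd][Is64Bit]; e.g. OpcTable[1][1][1]
+ // selects ADDSXrr. The _ri/_rs/_rx variants below use the same layout.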
+ bool Is64Bit = RetVT == MVT::i64;
+ unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit];
+ const TargetRegisterClass *RC =
+ Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ unsigned ResultReg;
+ if (WantResult)
+ ResultReg = createResultReg(RC);
+ else
+ ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+ const MCInstrDesc &II = TII.get(Opc);
+ LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
+ RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(LHSReg, getKillRegState(LHSIsKill))
+ .addReg(RHSReg, getKillRegState(RHSIsKill));
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAddSub_ri(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, uint64_t Imm,
+ bool SetFlags, bool WantResult) {
+ assert(LHSReg && "Invalid register number.");
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return 0;
+
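+ // The arithmetic immediate is a 12-bit value, optionally shifted left by 12,
+ // so e.g. 0x123 encodes directly and 0x123000 encodes as (0x123, LSL #12).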
+ unsigned ShiftImm;
+ if (isUInt<12>(Imm))
+ ShiftImm = 0;
+ else if ((Imm & 0xfff000) == Imm) {
+ ShiftImm = 12;
+ Imm >>= 12;
+ } else
+ return 0;
+
+ static const unsigned OpcTable[2][2][2] = {
+ { { AArch64::SUBWri, AArch64::SUBXri },
+ { AArch64::ADDWri, AArch64::ADDXri } },
+ { { AArch64::SUBSWri, AArch64::SUBSXri },
+ { AArch64::ADDSWri, AArch64::ADDSXri } }
+ };
+ bool Is64Bit = RetVT == MVT::i64;
+ unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit];
const TargetRegisterClass *RC;
- bool VTIsi1 = false;
- int64_t ScaleFactor = 0;
+ if (SetFlags)
+ RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ else
+ RC = Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass;
+ unsigned ResultReg;
+ if (WantResult)
+ ResultReg = createResultReg(RC);
+ else
+ ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+ const MCInstrDesc &II = TII.get(Opc);
+ LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(LHSReg, getKillRegState(LHSIsKill))
+ .addImm(Imm)
+ .addImm(getShifterImm(AArch64_AM::LSL, ShiftImm));
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg,
+ bool RHSIsKill,
+ AArch64_AM::ShiftExtendType ShiftType,
+ uint64_t ShiftImm, bool SetFlags,
+ bool WantResult) {
+ assert(LHSReg && RHSReg && "Invalid register number.");
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return 0;
+
+ static const unsigned OpcTable[2][2][2] = {
+ { { AArch64::SUBWrs, AArch64::SUBXrs },
+ { AArch64::ADDWrs, AArch64::ADDXrs } },
+ { { AArch64::SUBSWrs, AArch64::SUBSXrs },
+ { AArch64::ADDSWrs, AArch64::ADDSXrs } }
+ };
+ bool Is64Bit = RetVT == MVT::i64;
+ unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit];
+ const TargetRegisterClass *RC =
+ Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ unsigned ResultReg;
+ if (WantResult)
+ ResultReg = createResultReg(RC);
+ else
+ ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+ const MCInstrDesc &II = TII.get(Opc);
+ LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
+ RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(LHSReg, getKillRegState(LHSIsKill))
+ .addReg(RHSReg, getKillRegState(RHSIsKill))
+ .addImm(getShifterImm(ShiftType, ShiftImm));
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg,
+ bool RHSIsKill,
+ AArch64_AM::ShiftExtendType ExtType,
+ uint64_t ShiftImm, bool SetFlags,
+ bool WantResult) {
+ assert(LHSReg && RHSReg && "Invalid register number.");
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return 0;
+
+ static const unsigned OpcTable[2][2][2] = {
+ { { AArch64::SUBWrx, AArch64::SUBXrx },
+ { AArch64::ADDWrx, AArch64::ADDXrx } },
+ { { AArch64::SUBSWrx, AArch64::SUBSXrx },
+ { AArch64::ADDSWrx, AArch64::ADDSXrx } }
+ };
+ bool Is64Bit = RetVT == MVT::i64;
+ unsigned Opc = OpcTable[SetFlags][UseAdd][Is64Bit];
+ const TargetRegisterClass *RC = nullptr;
+ if (SetFlags)
+ RC = Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ else
+ RC = Is64Bit ? &AArch64::GPR64spRegClass : &AArch64::GPR32spRegClass;
+ unsigned ResultReg;
+ if (WantResult)
+ ResultReg = createResultReg(RC);
+ else
+ ResultReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+ const MCInstrDesc &II = TII.get(Opc);
+ LHSReg = constrainOperandRegClass(II, LHSReg, II.getNumDefs());
+ RHSReg = constrainOperandRegClass(II, RHSReg, II.getNumDefs() + 1);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(LHSReg, getKillRegState(LHSIsKill))
+ .addReg(RHSReg, getKillRegState(RHSIsKill))
+ .addImm(getArithExtendImm(ExtType, ShiftImm));
+ return ResultReg;
+}
+
+bool AArch64FastISel::emitCmp(const Value *LHS, const Value *RHS, bool IsZExt) {
+ Type *Ty = LHS->getType();
+ EVT CmpEVT = TLI.getValueType(Ty, true);
+ if (!CmpEVT.isSimple())
+ return false;
+ MVT VT = CmpEVT.getSimpleVT();
+
switch (VT.SimpleTy) {
default:
return false;
case MVT::i1:
- VTIsi1 = true;
- // Intentional fall-through.
case MVT::i8:
- Opc = UseUnscaled ? AArch64::LDURBBi : AArch64::LDRBBui;
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ return emitICmp(VT, LHS, RHS, IsZExt);
+ case MVT::f32:
+ case MVT::f64:
+ return emitFCmp(VT, LHS, RHS);
+ }
+}
+
+bool AArch64FastISel::emitICmp(MVT RetVT, const Value *LHS, const Value *RHS,
+ bool IsZExt) {
+ return emitSub(RetVT, LHS, RHS, /*SetFlags=*/true, /*WantResult=*/false,
+ IsZExt) != 0;
+}
+
+bool AArch64FastISel::emitICmp_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+ uint64_t Imm) {
+ return emitAddSub_ri(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, Imm,
+ /*SetFlags=*/true, /*WantResult=*/false) != 0;
+}
+
+bool AArch64FastISel::emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS) {
+ if (RetVT != MVT::f32 && RetVT != MVT::f64)
+ return false;
+
+ // Check to see if the 2nd operand is a constant that we can encode directly
+ // in the compare.
+ bool UseImm = false;
+ if (const auto *CFP = dyn_cast<ConstantFP>(RHS))
+ if (CFP->isZero() && !CFP->isNegative())
+ UseImm = true;
+
+ unsigned LHSReg = getRegForValue(LHS);
+ if (!LHSReg)
+ return false;
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ if (UseImm) {
+ unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDri : AArch64::FCMPSri;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+ .addReg(LHSReg, getKillRegState(LHSIsKill));
+ return true;
+ }
+
+ unsigned RHSReg = getRegForValue(RHS);
+ if (!RHSReg)
+ return false;
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ unsigned Opc = (RetVT == MVT::f64) ? AArch64::FCMPDrr : AArch64::FCMPSrr;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+ .addReg(LHSReg, getKillRegState(LHSIsKill))
+ .addReg(RHSReg, getKillRegState(RHSIsKill));
+ return true;
+}
+
+unsigned AArch64FastISel::emitAdd(MVT RetVT, const Value *LHS, const Value *RHS,
+ bool SetFlags, bool WantResult, bool IsZExt) {
+ return emitAddSub(/*UseAdd=*/true, RetVT, LHS, RHS, SetFlags, WantResult,
+ IsZExt);
+}
+
+/// \brief This method is a wrapper to simplify add emission.
+///
+/// First try to emit an add with an immediate operand using emitAddSub_ri. If
+/// that fails, then try to materialize the immediate into a register and use
+/// emitAddSub_rr instead.
+unsigned AArch64FastISel::emitAdd_ri_(MVT VT, unsigned Op0, bool Op0IsKill,
+ int64_t Imm) {
+ unsigned ResultReg;
+ if (Imm < 0)
+ ResultReg = emitAddSub_ri(false, VT, Op0, Op0IsKill, -Imm);
+ else
+ ResultReg = emitAddSub_ri(true, VT, Op0, Op0IsKill, Imm);
+
+ if (ResultReg)
+ return ResultReg;
+
+ unsigned CReg = fastEmit_i(VT, VT, ISD::Constant, Imm);
+ if (!CReg)
+ return 0;
+
+ ResultReg = emitAddSub_rr(true, VT, Op0, Op0IsKill, CReg, true);
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitSub(MVT RetVT, const Value *LHS, const Value *RHS,
+ bool SetFlags, bool WantResult, bool IsZExt) {
+ return emitAddSub(/*UseAdd=*/false, RetVT, LHS, RHS, SetFlags, WantResult,
+ IsZExt);
+}
+
+unsigned AArch64FastISel::emitSubs_rr(MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg,
+ bool RHSIsKill, bool WantResult) {
+ return emitAddSub_rr(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, /*SetFlags=*/true, WantResult);
+}
+
+unsigned AArch64FastISel::emitSubs_rs(MVT RetVT, unsigned LHSReg,
+ bool LHSIsKill, unsigned RHSReg,
+ bool RHSIsKill,
+ AArch64_AM::ShiftExtendType ShiftType,
+ uint64_t ShiftImm, bool WantResult) {
+ return emitAddSub_rs(/*UseAdd=*/false, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftType, ShiftImm, /*SetFlags=*/true,
+ WantResult);
+}
+
+unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
+ const Value *LHS, const Value *RHS) {
+ // Canonicalize immediates to the RHS first.
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS))
+ std::swap(LHS, RHS);
+
+ // Canonicalize mul by power-of-2 to the RHS.
+ if (LHS->hasOneUse() && isValueAvailable(LHS))
+ if (isMulPowOf2(LHS))
+ std::swap(LHS, RHS);
+
+ // Canonicalize shift immediate to the RHS.
+ if (LHS->hasOneUse() && isValueAvailable(LHS))
+ if (const auto *SI = dyn_cast<ShlOperator>(LHS))
+ if (isa<ConstantInt>(SI->getOperand(1)))
+ std::swap(LHS, RHS);
+
+ unsigned LHSReg = getRegForValue(LHS);
+ if (!LHSReg)
+ return 0;
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned ResultReg = 0;
+ if (const auto *C = dyn_cast<ConstantInt>(RHS)) {
+ uint64_t Imm = C->getZExtValue();
+ ResultReg = emitLogicalOp_ri(ISDOpc, RetVT, LHSReg, LHSIsKill, Imm);
+ }
+ if (ResultReg)
+ return ResultReg;
+
+ // Check if the mul can be folded into the instruction.
+ if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (isMulPowOf2(RHS)) {
+ const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0);
+ const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1);
+
+ if (const auto *C = dyn_cast<ConstantInt>(MulLHS))
+ if (C->getValue().isPowerOf2())
+ std::swap(MulLHS, MulRHS);
+
+ assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt.");
+ uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2();
+
+ unsigned RHSReg = getRegForValue(MulLHS);
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(MulLHS);
+ return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftVal);
+ }
+
+ // Check if the shift can be folded into the instruction.
+ if (RHS->hasOneUse() && isValueAvailable(RHS))
+ if (const auto *SI = dyn_cast<ShlOperator>(RHS))
+ if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+ uint64_t ShiftVal = C->getZExtValue();
+ unsigned RHSReg = getRegForValue(SI->getOperand(0));
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(SI->getOperand(0));
+ return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg,
+ RHSIsKill, ShiftVal);
+ }
+
+ unsigned RHSReg = getRegForValue(RHS);
+ if (!RHSReg)
+ return 0;
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ MVT VT = std::max(MVT::i32, RetVT.SimpleTy);
+ ResultReg = fastEmit_rr(VT, VT, ISDOpc, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+ if (RetVT >= MVT::i8 && RetVT <= MVT::i16) {
+ uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ }
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT,
+ unsigned LHSReg, bool LHSIsKill,
+ uint64_t Imm) {
+ assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) &&
+ "ISD nodes are not consecutive!");
+ static const unsigned OpcTable[3][2] = {
+ { AArch64::ANDWri, AArch64::ANDXri },
+ { AArch64::ORRWri, AArch64::ORRXri },
+ { AArch64::EORWri, AArch64::EORXri }
+ };
+ const TargetRegisterClass *RC;
+ unsigned Opc;
+ unsigned RegSize;
+ switch (RetVT.SimpleTy) {
+ default:
+ return 0;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32: {
+ unsigned Idx = ISDOpc - ISD::AND;
+ Opc = OpcTable[Idx][0];
+ RC = &AArch64::GPR32spRegClass;
+ RegSize = 32;
+ break;
+ }
+ case MVT::i64:
+ Opc = OpcTable[ISDOpc - ISD::AND][1];
+ RC = &AArch64::GPR64spRegClass;
+ RegSize = 64;
+ break;
+ }
+
+ if (!AArch64_AM::isLogicalImmediate(Imm, RegSize))
+ return 0;
+
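+ // Emit the operation on the full 32/64-bit register. For i8/i16 the upper
+ // bits must be cleared again afterwards to keep the value zero-extended,
+ // unless the operation was an AND, whose immediate already masked them.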
+ unsigned ResultReg =
+ fastEmitInst_ri(Opc, RC, LHSReg, LHSIsKill,
+ AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
+ if (RetVT >= MVT::i8 && RetVT <= MVT::i16 && ISDOpc != ISD::AND) {
+ uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ }
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT,
+ unsigned LHSReg, bool LHSIsKill,
+ unsigned RHSReg, bool RHSIsKill,
+ uint64_t ShiftImm) {
+ assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) &&
+ "ISD nodes are not consecutive!");
+ static const unsigned OpcTable[3][2] = {
+ { AArch64::ANDWrs, AArch64::ANDXrs },
+ { AArch64::ORRWrs, AArch64::ORRXrs },
+ { AArch64::EORWrs, AArch64::EORXrs }
+ };
+ const TargetRegisterClass *RC;
+ unsigned Opc;
+ switch (RetVT.SimpleTy) {
+ default:
+ return 0;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ Opc = OpcTable[ISDOpc - ISD::AND][0];
RC = &AArch64::GPR32RegClass;
+ break;
+ case MVT::i64:
+ Opc = OpcTable[ISDOpc - ISD::AND][1];
+ RC = &AArch64::GPR64RegClass;
+ break;
+ }
+ unsigned ResultReg =
+ fastEmitInst_rri(Opc, RC, LHSReg, LHSIsKill, RHSReg, RHSIsKill,
+ AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftImm));
+ if (RetVT >= MVT::i8 && RetVT <= MVT::i16) {
+ uint64_t Mask = (RetVT == MVT::i8) ? 0xff : 0xffff;
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ }
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitAnd_ri(MVT RetVT, unsigned LHSReg, bool LHSIsKill,
+ uint64_t Imm) {
+ return emitLogicalOp_ri(ISD::AND, RetVT, LHSReg, LHSIsKill, Imm);
+}
+
+unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr,
+ bool WantZExt, MachineMemOperand *MMO) {
+ // Simplify this down to something we can handle.
+ if (!simplifyAddress(Addr, VT))
+ return 0;
+
+ unsigned ScaleFactor = getImplicitScaleFactor(VT);
+ if (!ScaleFactor)
+ llvm_unreachable("Unexpected value type.");
+
+ // Negative offsets require unscaled, 9-bit, signed immediate offsets.
+ // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
+ bool UseScaled = true;
+ if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) {
+ UseScaled = false;
ScaleFactor = 1;
+ }
+
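+ // Each addressing mode (unscaled, scaled, register offset (X), register
+ // offset (extended W)) has one row for a 32-bit and one for a 64-bit
+ // destination; see the GPOpcTable[WantZExt][2 * Idx + IsRet64Bit] uses below.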
+ static const unsigned GPOpcTable[2][8][4] = {
+ // Sign-extend.
+ { { AArch64::LDURSBWi, AArch64::LDURSHWi, AArch64::LDURWi,
+ AArch64::LDURXi },
+ { AArch64::LDURSBXi, AArch64::LDURSHXi, AArch64::LDURSWi,
+ AArch64::LDURXi },
+ { AArch64::LDRSBWui, AArch64::LDRSHWui, AArch64::LDRWui,
+ AArch64::LDRXui },
+ { AArch64::LDRSBXui, AArch64::LDRSHXui, AArch64::LDRSWui,
+ AArch64::LDRXui },
+ { AArch64::LDRSBWroX, AArch64::LDRSHWroX, AArch64::LDRWroX,
+ AArch64::LDRXroX },
+ { AArch64::LDRSBXroX, AArch64::LDRSHXroX, AArch64::LDRSWroX,
+ AArch64::LDRXroX },
+ { AArch64::LDRSBWroW, AArch64::LDRSHWroW, AArch64::LDRWroW,
+ AArch64::LDRXroW },
+ { AArch64::LDRSBXroW, AArch64::LDRSHXroW, AArch64::LDRSWroW,
+ AArch64::LDRXroW }
+ },
+ // Zero-extend.
+ { { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi,
+ AArch64::LDURXi },
+ { AArch64::LDURBBi, AArch64::LDURHHi, AArch64::LDURWi,
+ AArch64::LDURXi },
+ { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui,
+ AArch64::LDRXui },
+ { AArch64::LDRBBui, AArch64::LDRHHui, AArch64::LDRWui,
+ AArch64::LDRXui },
+ { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX,
+ AArch64::LDRXroX },
+ { AArch64::LDRBBroX, AArch64::LDRHHroX, AArch64::LDRWroX,
+ AArch64::LDRXroX },
+ { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW,
+ AArch64::LDRXroW },
+ { AArch64::LDRBBroW, AArch64::LDRHHroW, AArch64::LDRWroW,
+ AArch64::LDRXroW }
+ }
+ };
+
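+ // FP loads only vary by addressing mode: unscaled, scaled, register offset
+ // (X), and register offset (extended W).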
+ static const unsigned FPOpcTable[4][2] = {
+ { AArch64::LDURSi, AArch64::LDURDi },
+ { AArch64::LDRSui, AArch64::LDRDui },
+ { AArch64::LDRSroX, AArch64::LDRDroX },
+ { AArch64::LDRSroW, AArch64::LDRDroW }
+ };
+
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() &&
+ Addr.getOffsetReg();
+ unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0;
+ if (Addr.getExtendType() == AArch64_AM::UXTW ||
+ Addr.getExtendType() == AArch64_AM::SXTW)
+ Idx++;
+
+ bool IsRet64Bit = RetVT == MVT::i64;
+ switch (VT.SimpleTy) {
+ default:
+ llvm_unreachable("Unexpected value type.");
+ case MVT::i1: // Intentional fall-through.
+ case MVT::i8:
+ Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][0];
+ RC = (IsRet64Bit && !WantZExt) ?
+ &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
break;
case MVT::i16:
- Opc = UseUnscaled ? AArch64::LDURHHi : AArch64::LDRHHui;
- RC = &AArch64::GPR32RegClass;
- ScaleFactor = 2;
+ Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][1];
+ RC = (IsRet64Bit && !WantZExt) ?
+ &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
break;
case MVT::i32:
- Opc = UseUnscaled ? AArch64::LDURWi : AArch64::LDRWui;
- RC = &AArch64::GPR32RegClass;
- ScaleFactor = 4;
+ Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][2];
+ RC = (IsRet64Bit && !WantZExt) ?
+ &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
break;
case MVT::i64:
- Opc = UseUnscaled ? AArch64::LDURXi : AArch64::LDRXui;
+ Opc = GPOpcTable[WantZExt][2 * Idx + IsRet64Bit][3];
RC = &AArch64::GPR64RegClass;
- ScaleFactor = 8;
break;
case MVT::f32:
- Opc = UseUnscaled ? AArch64::LDURSi : AArch64::LDRSui;
- RC = TLI.getRegClassFor(VT);
- ScaleFactor = 4;
+ Opc = FPOpcTable[Idx][0];
+ RC = &AArch64::FPR32RegClass;
break;
case MVT::f64:
- Opc = UseUnscaled ? AArch64::LDURDi : AArch64::LDRDui;
- RC = TLI.getRegClassFor(VT);
- ScaleFactor = 8;
+ Opc = FPOpcTable[Idx][1];
+ RC = &AArch64::FPR64RegClass;
break;
}
- // Scale the offset.
- if (!UseUnscaled) {
- int64_t Offset = Addr.getOffset();
- if (Offset & (ScaleFactor - 1))
- // Retry using an unscaled, 9-bit, signed immediate offset.
- return EmitLoad(VT, ResultReg, Addr, /*UseUnscaled*/ true);
-
- Addr.setOffset(Offset / ScaleFactor);
- }
-
- // Simplify this down to something we can handle.
- if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled))
- return false;
// Create the base instruction, then add the operands.
- ResultReg = createResultReg(RC);
+ unsigned ResultReg = createResultReg(RC);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg);
- AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, UseUnscaled);
+ addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, ScaleFactor, MMO);
// Loading an i1 requires special handling.
- if (VTIsi1) {
- MRI.constrainRegClass(ResultReg, &AArch64::GPR32RegClass);
- unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
- ANDReg)
- .addReg(ResultReg)
- .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+ if (VT == MVT::i1) {
+ unsigned ANDReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, 1);
+ assert(ANDReg && "Unexpected AND instruction emission failure.");
ResultReg = ANDReg;
}
+
+ // For zero-extending loads to 64 bits we emit a 32-bit load and then convert
+ // the 32-bit reg to a 64-bit reg.
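+ // On AArch64 a 32-bit load already zeroes the upper 32 bits of the X
+ // register, so the SUBREG_TO_REG pseudo just retypes the value and normally
+ // folds away rather than emitting an extra instruction.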
+ if (WantZExt && RetVT == MVT::i64 && VT <= MVT::i32) {
+ unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), Reg64)
+ .addImm(0)
+ .addReg(ResultReg, getKillRegState(true))
+ .addImm(AArch64::sub_32);
+ ResultReg = Reg64;
+ }
+ return ResultReg;
+}
+
+bool AArch64FastISel::selectAddSub(const Instruction *I) {
+ MVT VT;
+ if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
+ return false;
+
+ if (VT.isVector())
+ return selectOperator(I, I->getOpcode());
+
+ unsigned ResultReg;
+ switch (I->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ case Instruction::Add:
+ ResultReg = emitAdd(VT, I->getOperand(0), I->getOperand(1));
+ break;
+ case Instruction::Sub:
+ ResultReg = emitSub(VT, I->getOperand(0), I->getOperand(1));
+ break;
+ }
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::selectLogicalOp(const Instruction *I) {
+ MVT VT;
+ if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
+ return false;
+
+ if (VT.isVector())
+ return selectOperator(I, I->getOpcode());
+
+ unsigned ResultReg;
+ switch (I->getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ case Instruction::And:
+ ResultReg = emitLogicalOp(ISD::AND, VT, I->getOperand(0), I->getOperand(1));
+ break;
+ case Instruction::Or:
+ ResultReg = emitLogicalOp(ISD::OR, VT, I->getOperand(0), I->getOperand(1));
+ break;
+ case Instruction::Xor:
+ ResultReg = emitLogicalOp(ISD::XOR, VT, I->getOperand(0), I->getOperand(1));
+ break;
+ }
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
return true;
}
-bool AArch64FastISel::SelectLoad(const Instruction *I) {
+bool AArch64FastISel::selectLoad(const Instruction *I) {
MVT VT;
// Verify we have a legal type before going any further. Currently, we handle
// simple types that will directly fit in a register (i32/f32/i64/f64) or
// those that can be sign or zero-extended to a basic operation (i1/i8/i16).
- if (!isLoadStoreTypeLegal(I->getType(), VT) || cast<LoadInst>(I)->isAtomic())
+ if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true) ||
+ cast<LoadInst>(I)->isAtomic())
return false;
// See if we can handle this address.
Address Addr;
- if (!ComputeAddress(I->getOperand(0), Addr))
+ if (!computeAddress(I->getOperand(0), Addr, I->getType()))
return false;
- unsigned ResultReg;
- if (!EmitLoad(VT, ResultReg, Addr))
+ // Fold the following sign-/zero-extend into the load instruction.
+ bool WantZExt = true;
+ MVT RetVT = VT;
+ const Value *IntExtVal = nullptr;
+ if (I->hasOneUse()) {
+ if (const auto *ZE = dyn_cast<ZExtInst>(I->use_begin()->getUser())) {
+ if (isTypeSupported(ZE->getType(), RetVT))
+ IntExtVal = ZE;
+ else
+ RetVT = VT;
+ } else if (const auto *SE = dyn_cast<SExtInst>(I->use_begin()->getUser())) {
+ if (isTypeSupported(SE->getType(), RetVT))
+ IntExtVal = SE;
+ else
+ RetVT = VT;
+ WantZExt = false;
+ }
+ }
+
+ unsigned ResultReg =
+ emitLoad(VT, RetVT, Addr, WantZExt, createMachineMemOperandFor(I));
+ if (!ResultReg)
return false;
- UpdateValueMap(I, ResultReg);
+ // There are a few different cases we have to handle, because the load or the
+ // sign-/zero-extend might not be selected by FastISel if we fall back to
+ // SelectionDAG. There is also an ordering issue when both instructions are in
+ // different basic blocks.
+ // 1.) The load instruction is selected by FastISel, but not the integer
+ // extend. This usually happens when the integer extend is in a different
+ // basic block and SelectionDAG took over for that basic block.
+ // 2.) The load instruction is selected before the integer extend. This only
+ // happens when the integer extend is in a different basic block.
+ // 3.) The load instruction is selected by SelectionDAG and the integer extend
+ // by FastISel. This happens if there are instructions between the load
+ // and the integer extend that couldn't be selected by FastISel.
+ if (IntExtVal) {
+ // The integer extend hasn't been emitted yet. FastISel or SelectionDAG
+ // could select it. Emit a copy to subreg if necessary. FastISel will remove
+ // it when it selects the integer extend.
+ unsigned Reg = lookUpRegForValue(IntExtVal);
+ if (!Reg) {
+ if (RetVT == MVT::i64 && VT <= MVT::i32) {
+ if (WantZExt) {
+ // Delete the last emitted instruction from emitLoad (SUBREG_TO_REG).
+ std::prev(FuncInfo.InsertPt)->eraseFromParent();
+ ResultReg = std::prev(FuncInfo.InsertPt)->getOperand(0).getReg();
+ } else
+ ResultReg = fastEmitInst_extractsubreg(MVT::i32, ResultReg,
+ /*IsKill=*/true,
+ AArch64::sub_32);
+ }
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ // The integer extend has already been emitted - delete all the instructions
+ // that have been emitted by the integer extend lowering code and use the
+ // result from the load instruction directly.
+ while (Reg) {
+ auto *MI = MRI.getUniqueVRegDef(Reg);
+ if (!MI)
+ break;
+ Reg = 0;
+ for (auto &Opnd : MI->uses()) {
+ if (Opnd.isReg()) {
+ Reg = Opnd.getReg();
+ break;
+ }
+ }
+ MI->eraseFromParent();
+ }
+ updateValueMap(IntExtVal, ResultReg);
+ return true;
+ }
+
+ updateValueMap(I, ResultReg);
return true;
}
-bool AArch64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr,
- bool UseUnscaled) {
+bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr,
+ MachineMemOperand *MMO) {
+ // Simplify this down to something we can handle.
+ if (!simplifyAddress(Addr, VT))
+ return false;
+
+ unsigned ScaleFactor = getImplicitScaleFactor(VT);
+ if (!ScaleFactor)
+ llvm_unreachable("Unexpected value type.");
+
// Negative offsets require unscaled, 9-bit, signed immediate offsets.
// Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
- if (!UseUnscaled && Addr.getOffset() < 0)
- UseUnscaled = true;
-
- unsigned StrOpc;
- bool VTIsi1 = false;
- int64_t ScaleFactor = 0;
- // Using scaled, 12-bit, unsigned immediate offsets.
- switch (VT.SimpleTy) {
- default:
- return false;
- case MVT::i1:
- VTIsi1 = true;
- case MVT::i8:
- StrOpc = UseUnscaled ? AArch64::STURBBi : AArch64::STRBBui;
+ bool UseScaled = true;
+ if ((Addr.getOffset() < 0) || (Addr.getOffset() & (ScaleFactor - 1))) {
+ UseScaled = false;
ScaleFactor = 1;
- break;
- case MVT::i16:
- StrOpc = UseUnscaled ? AArch64::STURHHi : AArch64::STRHHui;
- ScaleFactor = 2;
- break;
- case MVT::i32:
- StrOpc = UseUnscaled ? AArch64::STURWi : AArch64::STRWui;
- ScaleFactor = 4;
- break;
- case MVT::i64:
- StrOpc = UseUnscaled ? AArch64::STURXi : AArch64::STRXui;
- ScaleFactor = 8;
- break;
- case MVT::f32:
- StrOpc = UseUnscaled ? AArch64::STURSi : AArch64::STRSui;
- ScaleFactor = 4;
- break;
- case MVT::f64:
- StrOpc = UseUnscaled ? AArch64::STURDi : AArch64::STRDui;
- ScaleFactor = 8;
- break;
}
- // Scale the offset.
- if (!UseUnscaled) {
- int64_t Offset = Addr.getOffset();
- if (Offset & (ScaleFactor - 1))
- // Retry using an unscaled, 9-bit, signed immediate offset.
- return EmitStore(VT, SrcReg, Addr, /*UseUnscaled*/ true);
- Addr.setOffset(Offset / ScaleFactor);
- }
+ static const unsigned OpcTable[4][6] = {
+ { AArch64::STURBBi, AArch64::STURHHi, AArch64::STURWi, AArch64::STURXi,
+ AArch64::STURSi, AArch64::STURDi },
+ { AArch64::STRBBui, AArch64::STRHHui, AArch64::STRWui, AArch64::STRXui,
+ AArch64::STRSui, AArch64::STRDui },
+ { AArch64::STRBBroX, AArch64::STRHHroX, AArch64::STRWroX, AArch64::STRXroX,
+ AArch64::STRSroX, AArch64::STRDroX },
+ { AArch64::STRBBroW, AArch64::STRHHroW, AArch64::STRWroW, AArch64::STRXroW,
+ AArch64::STRSroW, AArch64::STRDroW }
+ };
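+ // Rows are indexed by addressing mode (unscaled, scaled, register offset
+ // (X), register offset (extended W)); columns by type (i8, i16, i32, i64,
+ // f32, f64).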
- // Simplify this down to something we can handle.
- if (!SimplifyAddress(Addr, VT, UseUnscaled ? 1 : ScaleFactor, UseUnscaled))
- return false;
+ unsigned Opc;
+ bool VTIsi1 = false;
+ bool UseRegOffset = Addr.isRegBase() && !Addr.getOffset() && Addr.getReg() &&
+ Addr.getOffsetReg();
+ unsigned Idx = UseRegOffset ? 2 : UseScaled ? 1 : 0;
+ if (Addr.getExtendType() == AArch64_AM::UXTW ||
+ Addr.getExtendType() == AArch64_AM::SXTW)
+ Idx++;
+
+ switch (VT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type.");
+ case MVT::i1: VTIsi1 = true; // Intentional fall-through.
+ case MVT::i8: Opc = OpcTable[Idx][0]; break;
+ case MVT::i16: Opc = OpcTable[Idx][1]; break;
+ case MVT::i32: Opc = OpcTable[Idx][2]; break;
+ case MVT::i64: Opc = OpcTable[Idx][3]; break;
+ case MVT::f32: Opc = OpcTable[Idx][4]; break;
+ case MVT::f64: Opc = OpcTable[Idx][5]; break;
+ }
// Storing an i1 requires special handling.
- if (VTIsi1) {
- MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
- unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
- ANDReg)
- .addReg(SrcReg)
- .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+ if (VTIsi1 && SrcReg != AArch64::WZR) {
+ unsigned ANDReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1);
+ assert(ANDReg && "Unexpected AND instruction emission failure.");
SrcReg = ANDReg;
}
// Create the base instruction, then add the operands.
- MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(StrOpc)).addReg(SrcReg);
- AddLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, UseUnscaled);
+ const MCInstrDesc &II = TII.get(Opc);
+ SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs());
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(SrcReg);
+ addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOStore, ScaleFactor, MMO);
+
return true;
}
-bool AArch64FastISel::SelectStore(const Instruction *I) {
+bool AArch64FastISel::selectStore(const Instruction *I) {
MVT VT;
- Value *Op0 = I->getOperand(0);
+ const Value *Op0 = I->getOperand(0);
// Verify we have a legal type before going any further. Currently, we handle
// simple types that will directly fit in a register (i32/f32/i64/f64) or
// those that can be sign or zero-extended to a basic operation (i1/i8/i16).
- if (!isLoadStoreTypeLegal(Op0->getType(), VT) ||
+ if (!isTypeSupported(Op0->getType(), VT, /*IsVectorAllowed=*/true) ||
cast<StoreInst>(I)->isAtomic())
return false;
- // Get the value to be stored into a register.
- unsigned SrcReg = getRegForValue(Op0);
- if (SrcReg == 0)
+ // Get the value to be stored into a register. Use the zero register directly
+ // when possible to avoid an unnecessary copy and a wasted register.
+ unsigned SrcReg = 0;
+ if (const auto *CI = dyn_cast<ConstantInt>(Op0)) {
+ if (CI->isZero())
+ SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+ } else if (const auto *CF = dyn_cast<ConstantFP>(Op0)) {
+ if (CF->isZero() && !CF->isNegative()) {
+ VT = MVT::getIntegerVT(VT.getSizeInBits());
+ SrcReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+ }
+ }
+
+ if (!SrcReg)
+ SrcReg = getRegForValue(Op0);
+
+ if (!SrcReg)
return false;
// See if we can handle this address.
Address Addr;
- if (!ComputeAddress(I->getOperand(1), Addr))
+ if (!computeAddress(I->getOperand(1), Addr, I->getOperand(0)->getType()))
return false;
- if (!EmitStore(VT, SrcReg, Addr))
+ if (!emitStore(VT, SrcReg, Addr, createMachineMemOperandFor(I)))
return false;
return true;
}
@@ -757,58 +2103,235 @@ static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) {
}
}
-bool AArch64FastISel::SelectBranch(const Instruction *I) {
+/// \brief Try to emit a combined compare-and-branch instruction.
+bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) {
+ assert(isa<CmpInst>(BI->getCondition()) && "Expected cmp instruction");
+ const CmpInst *CI = cast<CmpInst>(BI->getCondition());
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+
+ const Value *LHS = CI->getOperand(0);
+ const Value *RHS = CI->getOperand(1);
+
+ MVT VT;
+ if (!isTypeSupported(LHS->getType(), VT))
+ return false;
+
+ unsigned BW = VT.getSizeInBits();
+ if (BW > 64)
+ return false;
+
+ MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+ // Try to take advantage of fallthrough opportunities.
+ if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+ std::swap(TBB, FBB);
+ Predicate = CmpInst::getInversePredicate(Predicate);
+ }
+
+ int TestBit = -1;
+ bool IsCmpNE;
+ switch (Predicate) {
+ default:
+ return false;
+ case CmpInst::ICMP_EQ:
+ case CmpInst::ICMP_NE:
+ if (isa<Constant>(LHS) && cast<Constant>(LHS)->isNullValue())
+ std::swap(LHS, RHS);
+
+ if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue())
+ return false;
+
+ if (const auto *AI = dyn_cast<BinaryOperator>(LHS))
+ if (AI->getOpcode() == Instruction::And && isValueAvailable(AI)) {
+ const Value *AndLHS = AI->getOperand(0);
+ const Value *AndRHS = AI->getOperand(1);
+
+ if (const auto *C = dyn_cast<ConstantInt>(AndLHS))
+ if (C->getValue().isPowerOf2())
+ std::swap(AndLHS, AndRHS);
+
+ if (const auto *C = dyn_cast<ConstantInt>(AndRHS))
+ if (C->getValue().isPowerOf2()) {
+ TestBit = C->getValue().logBase2();
+ LHS = AndLHS;
+ }
+ }
+
+ if (VT == MVT::i1)
+ TestBit = 0;
+
+ IsCmpNE = Predicate == CmpInst::ICMP_NE;
+ break;
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SGE:
+ if (!isa<Constant>(RHS) || !cast<Constant>(RHS)->isNullValue())
+ return false;
+
+ TestBit = BW - 1;
+ IsCmpNE = Predicate == CmpInst::ICMP_SLT;
+ break;
+ case CmpInst::ICMP_SGT:
+ case CmpInst::ICMP_SLE:
+ if (!isa<ConstantInt>(RHS))
+ return false;
+
+ if (cast<ConstantInt>(RHS)->getValue() != APInt(BW, -1, true))
+ return false;
+
+ TestBit = BW - 1;
+ IsCmpNE = Predicate == CmpInst::ICMP_SLE;
+ break;
+ } // end switch
+
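+ // Select between compare-and-branch (CBZ/CBNZ) and test-bit-and-branch
+ // (TBZ/TBNZ); indexed as OpcTable[IsBitTest][IsCmpNE][Is64Bit].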
+ static const unsigned OpcTable[2][2][2] = {
+ { {AArch64::CBZW, AArch64::CBZX },
+ {AArch64::CBNZW, AArch64::CBNZX} },
+ { {AArch64::TBZW, AArch64::TBZX },
+ {AArch64::TBNZW, AArch64::TBNZX} }
+ };
+
+ bool IsBitTest = TestBit != -1;
+ bool Is64Bit = BW == 64;
+ if (TestBit < 32 && TestBit >= 0)
+ Is64Bit = false;
+
+ unsigned Opc = OpcTable[IsBitTest][IsCmpNE][Is64Bit];
+ const MCInstrDesc &II = TII.get(Opc);
+
+ unsigned SrcReg = getRegForValue(LHS);
+ if (!SrcReg)
+ return false;
+ bool SrcIsKill = hasTrivialKill(LHS);
+
+ if (BW == 64 && !Is64Bit)
+ SrcReg = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill,
+ AArch64::sub_32);
+
+ if ((BW < 32) && !IsBitTest)
+ SrcReg = emitIntExt(VT, SrcReg, MVT::i32, /*IsZExt=*/true);
+
+ // Emit the combined compare and branch instruction.
+ SrcReg = constrainOperandRegClass(II, SrcReg, II.getNumDefs());
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
+ .addReg(SrcReg, getKillRegState(SrcIsKill));
+ if (IsBitTest)
+ MIB.addImm(TestBit);
+ MIB.addMBB(TBB);
+
+ // Obtain the branch weight and add the TrueBB to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+ fastEmitBranch(FBB, DbgLoc);
+
+ return true;
+}
+
+bool AArch64FastISel::selectBranch(const Instruction *I) {
const BranchInst *BI = cast<BranchInst>(I);
+ if (BI->isUnconditional()) {
+ MachineBasicBlock *MSucc = FuncInfo.MBBMap[BI->getSuccessor(0)];
+ fastEmitBranch(MSucc, BI->getDebugLoc());
+ return true;
+ }
+
MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+ AArch64CC::CondCode CC = AArch64CC::NE;
if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
- if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
- // We may not handle every CC for now.
- AArch64CC::CondCode CC = getCompareCC(CI->getPredicate());
- if (CC == AArch64CC::AL)
- return false;
+ if (CI->hasOneUse() && isValueAvailable(CI)) {
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_FALSE:
+ fastEmitBranch(FBB, DbgLoc);
+ return true;
+ case CmpInst::FCMP_TRUE:
+ fastEmitBranch(TBB, DbgLoc);
+ return true;
+ }
+
+ // Try to emit a combined compare-and-branch first.
+ if (emitCompareAndBranch(BI))
+ return true;
+
+ // Try to take advantage of fallthrough opportunities.
+ if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+ std::swap(TBB, FBB);
+ Predicate = CmpInst::getInversePredicate(Predicate);
+ }
// Emit the cmp.
- if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+ if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
return false;
+ // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch
+ // instruction.
+ CC = getCompareCC(Predicate);
+ AArch64CC::CondCode ExtraCC = AArch64CC::AL;
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_UEQ:
+ ExtraCC = AArch64CC::EQ;
+ CC = AArch64CC::VS;
+ break;
+ case CmpInst::FCMP_ONE:
+ ExtraCC = AArch64CC::MI;
+ CC = AArch64CC::GT;
+ break;
+ }
+ assert((CC != AArch64CC::AL) && "Unexpected condition code.");
+
+ // Emit the extra branch for FCMP_UEQ and FCMP_ONE.
+ if (ExtraCC != AArch64CC::AL) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+ .addImm(ExtraCC)
+ .addMBB(TBB);
+ }
+
// Emit the branch.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
.addImm(CC)
.addMBB(TBB);
- FuncInfo.MBB->addSuccessor(TBB);
- FastEmitBranch(FBB, DbgLoc);
+ // Obtain the branch weight and add the TrueBB to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+
+ fastEmitBranch(FBB, DbgLoc);
return true;
}
} else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
MVT SrcVT;
- if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
- (isLoadStoreTypeLegal(TI->getOperand(0)->getType(), SrcVT))) {
+ if (TI->hasOneUse() && isValueAvailable(TI) &&
+ isTypeSupported(TI->getOperand(0)->getType(), SrcVT)) {
unsigned CondReg = getRegForValue(TI->getOperand(0));
- if (CondReg == 0)
+ if (!CondReg)
return false;
+ bool CondIsKill = hasTrivialKill(TI->getOperand(0));
// Issue an extract_subreg to get the lower 32-bits.
- if (SrcVT == MVT::i64)
- CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true,
+ if (SrcVT == MVT::i64) {
+ CondReg = fastEmitInst_extractsubreg(MVT::i32, CondReg, CondIsKill,
AArch64::sub_32);
+ CondIsKill = true;
+ }
- MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass);
- unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(AArch64::ANDWri), ANDReg)
- .addReg(CondReg)
- .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(AArch64::SUBSWri))
- .addReg(ANDReg)
- .addReg(ANDReg)
- .addImm(0)
- .addImm(0);
+ unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondIsKill, 1);
+ assert(ANDReg && "Unexpected AND instruction emission failure.");
+ emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0);
- unsigned CC = AArch64CC::NE;
if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
std::swap(TBB, FBB);
CC = AArch64CC::EQ;
@@ -816,23 +2339,57 @@ bool AArch64FastISel::SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
.addImm(CC)
.addMBB(TBB);
- FuncInfo.MBB->addSuccessor(TBB);
- FastEmitBranch(FBB, DbgLoc);
+
+ // Obtain the branch weight and add the TrueBB to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+
+ fastEmitBranch(FBB, DbgLoc);
return true;
}
- } else if (const ConstantInt *CI =
- dyn_cast<ConstantInt>(BI->getCondition())) {
+ } else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) {
uint64_t Imm = CI->getZExtValue();
MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B))
.addMBB(Target);
- FuncInfo.MBB->addSuccessor(Target);
+
+ // Obtain the branch weight and add the target to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ Target->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(Target, BranchWeight);
+ return true;
+ } else if (foldXALUIntrinsic(CC, I, BI->getCondition())) {
+ // Fake-request the condition; otherwise the intrinsic might be completely
+ // optimized away.
+ unsigned CondReg = getRegForValue(BI->getCondition());
+ if (!CondReg)
+ return false;
+
+ // Emit the branch.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
+ .addImm(CC)
+ .addMBB(TBB);
+
+ // Obtain the branch weight and add the TrueBB to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+
+ fastEmitBranch(FBB, DbgLoc);
return true;
}
unsigned CondReg = getRegForValue(BI->getCondition());
if (CondReg == 0)
return false;
+ bool CondRegIsKill = hasTrivialKill(BI->getCondition());
// We've been divorced from our compare! Our block was split, and
// now our compare lives in a predecessor block. We mustn't
@@ -841,13 +2398,8 @@ bool AArch64FastISel::SelectBranch(const Instruction *I) {
// Regardless, the compare has been done in the predecessor block,
// and it left a value for us in a virtual register. Ergo, we test
// the one-bit value left in the virtual register.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri),
- AArch64::WZR)
- .addReg(CondReg)
- .addImm(0)
- .addImm(0);
+ emitICmp_ri(MVT::i32, CondReg, CondRegIsKill, 0);
- unsigned CC = AArch64CC::NE;
if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
std::swap(TBB, FBB);
CC = AArch64CC::EQ;
@@ -856,20 +2408,28 @@ bool AArch64FastISel::SelectBranch(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
.addImm(CC)
.addMBB(TBB);
- FuncInfo.MBB->addSuccessor(TBB);
- FastEmitBranch(FBB, DbgLoc);
+
+ // Obtain the branch weight and add the TrueBB to the successor list.
+ uint32_t BranchWeight = 0;
+ if (FuncInfo.BPI)
+ BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
+ TBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TBB, BranchWeight);
+
+ fastEmitBranch(FBB, DbgLoc);
return true;
}
-bool AArch64FastISel::SelectIndirectBr(const Instruction *I) {
+bool AArch64FastISel::selectIndirectBr(const Instruction *I) {
const IndirectBrInst *BI = cast<IndirectBrInst>(I);
unsigned AddrReg = getRegForValue(BI->getOperand(0));
if (AddrReg == 0)
return false;
// Emit the indirect branch.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BR))
- .addReg(AddrReg);
+ const MCInstrDesc &II = TII.get(AArch64::BR);
+ AddrReg = constrainOperandRegClass(II, AddrReg, II.getNumDefs());
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(AddrReg);
// Make sure the CFG is up-to-date.
for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i)
@@ -878,211 +2438,271 @@ bool AArch64FastISel::SelectIndirectBr(const Instruction *I) {
return true;
}
-bool AArch64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) {
- Type *Ty = Src1Value->getType();
- EVT SrcEVT = TLI.getValueType(Ty, true);
- if (!SrcEVT.isSimple())
- return false;
- MVT SrcVT = SrcEVT.getSimpleVT();
-
- // Check to see if the 2nd operand is a constant that we can encode directly
- // in the compare.
- uint64_t Imm;
- bool UseImm = false;
- bool isNegativeImm = false;
- if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) {
- if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
- SrcVT == MVT::i8 || SrcVT == MVT::i1) {
- const APInt &CIVal = ConstInt->getValue();
-
- Imm = (isZExt) ? CIVal.getZExtValue() : CIVal.getSExtValue();
- if (CIVal.isNegative()) {
- isNegativeImm = true;
- Imm = -Imm;
- }
- // FIXME: We can handle more immediates using shifts.
- UseImm = ((Imm & 0xfff) == Imm);
- }
- } else if (const ConstantFP *ConstFP = dyn_cast<ConstantFP>(Src2Value)) {
- if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
- if (ConstFP->isZero() && !ConstFP->isNegative())
- UseImm = true;
- }
+bool AArch64FastISel::selectCmp(const Instruction *I) {
+ const CmpInst *CI = cast<CmpInst>(I);
- unsigned ZReg;
- unsigned CmpOpc;
- bool isICmp = true;
- bool needsExt = false;
- switch (SrcVT.SimpleTy) {
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
+ unsigned ResultReg = 0;
+ switch (Predicate) {
default:
- return false;
- case MVT::i1:
- case MVT::i8:
- case MVT::i16:
- needsExt = true;
- // Intentional fall-through.
- case MVT::i32:
- ZReg = AArch64::WZR;
- if (UseImm)
- CmpOpc = isNegativeImm ? AArch64::ADDSWri : AArch64::SUBSWri;
- else
- CmpOpc = AArch64::SUBSWrr;
break;
- case MVT::i64:
- ZReg = AArch64::XZR;
- if (UseImm)
- CmpOpc = isNegativeImm ? AArch64::ADDSXri : AArch64::SUBSXri;
- else
- CmpOpc = AArch64::SUBSXrr;
- break;
- case MVT::f32:
- isICmp = false;
- CmpOpc = UseImm ? AArch64::FCMPSri : AArch64::FCMPSrr;
+ case CmpInst::FCMP_FALSE:
+ ResultReg = createResultReg(&AArch64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(AArch64::WZR, getKillRegState(true));
break;
- case MVT::f64:
- isICmp = false;
- CmpOpc = UseImm ? AArch64::FCMPDri : AArch64::FCMPDrr;
+ case CmpInst::FCMP_TRUE:
+ ResultReg = fastEmit_i(MVT::i32, MVT::i32, ISD::Constant, 1);
break;
}
- unsigned SrcReg1 = getRegForValue(Src1Value);
- if (SrcReg1 == 0)
- return false;
-
- unsigned SrcReg2;
- if (!UseImm) {
- SrcReg2 = getRegForValue(Src2Value);
- if (SrcReg2 == 0)
- return false;
+ if (ResultReg) {
+ updateValueMap(I, ResultReg);
+ return true;
}
- // We have i1, i8, or i16, we need to either zero extend or sign extend.
- if (needsExt) {
- SrcReg1 = EmitIntExt(SrcVT, SrcReg1, MVT::i32, isZExt);
- if (SrcReg1 == 0)
- return false;
- if (!UseImm) {
- SrcReg2 = EmitIntExt(SrcVT, SrcReg2, MVT::i32, isZExt);
- if (SrcReg2 == 0)
- return false;
- }
- }
+ // Emit the cmp.
+ if (!emitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+ return false;
- if (isICmp) {
- if (UseImm)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
- .addReg(ZReg)
- .addReg(SrcReg1)
- .addImm(Imm)
- .addImm(0);
- else
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
- .addReg(ZReg)
- .addReg(SrcReg1)
- .addReg(SrcReg2);
- } else {
- if (UseImm)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
- .addReg(SrcReg1);
- else
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CmpOpc))
- .addReg(SrcReg1)
- .addReg(SrcReg2);
- }
- return true;
-}
+ ResultReg = createResultReg(&AArch64::GPR32RegClass);
-bool AArch64FastISel::SelectCmp(const Instruction *I) {
- const CmpInst *CI = cast<CmpInst>(I);
+ // FCMP_UEQ and FCMP_ONE cannot be checked with a single instruction. These
+  // condition codes are inverted because they are used by CSINC.
+ static unsigned CondCodeTable[2][2] = {
+ { AArch64CC::NE, AArch64CC::VC },
+ { AArch64CC::PL, AArch64CC::LE }
+ };
+ unsigned *CondCodes = nullptr;
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_UEQ:
+ CondCodes = &CondCodeTable[0][0];
+ break;
+ case CmpInst::FCMP_ONE:
+ CondCodes = &CondCodeTable[1][0];
+ break;
+ }
- // We may not handle every CC for now.
- AArch64CC::CondCode CC = getCompareCC(CI->getPredicate());
- if (CC == AArch64CC::AL)
- return false;
+ if (CondCodes) {
+ unsigned TmpReg1 = createResultReg(&AArch64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
+ TmpReg1)
+ .addReg(AArch64::WZR, getKillRegState(true))
+ .addReg(AArch64::WZR, getKillRegState(true))
+ .addImm(CondCodes[0]);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
+ ResultReg)
+ .addReg(TmpReg1, getKillRegState(true))
+ .addReg(AArch64::WZR, getKillRegState(true))
+ .addImm(CondCodes[1]);
- // Emit the cmp.
- if (!EmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
- return false;
+ updateValueMap(I, ResultReg);
+ return true;
+ }
// Now set a register based on the comparison.
+ AArch64CC::CondCode CC = getCompareCC(Predicate);
+ assert((CC != AArch64CC::AL) && "Unexpected condition code.");
AArch64CC::CondCode invertedCC = getInvertedCondCode(CC);
- unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
ResultReg)
- .addReg(AArch64::WZR)
- .addReg(AArch64::WZR)
+ .addReg(AArch64::WZR, getKillRegState(true))
+ .addReg(AArch64::WZR, getKillRegState(true))
.addImm(invertedCC);
- UpdateValueMap(I, ResultReg);
+ updateValueMap(I, ResultReg);
return true;
}
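// Editorial sketch (annotation, not part of this patch): for
//   %c = fcmp ueq float %a, %b
// the two-CSINC path above emits roughly
//   fcmp  s0, s1
//   csinc w8, wzr, wzr, ne   ; w8 = (a == b) ? 1 : 0
//   csinc w0, w8,  wzr, vc   ; w0 = unordered ? 1 : w8
// building "unordered or equal" from the inverted NE/VC codes in the table.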
-bool AArch64FastISel::SelectSelect(const Instruction *I) {
- const SelectInst *SI = cast<SelectInst>(I);
-
- EVT DestEVT = TLI.getValueType(SI->getType(), true);
- if (!DestEVT.isSimple())
+/// \brief Optimize selects of i1 if one of the operands has a 'true' or 'false'
+/// value.
+bool AArch64FastISel::optimizeSelect(const SelectInst *SI) {
+ if (!SI->getType()->isIntegerTy(1))
return false;
- MVT DestVT = DestEVT.getSimpleVT();
- if (DestVT != MVT::i32 && DestVT != MVT::i64 && DestVT != MVT::f32 &&
- DestVT != MVT::f64)
- return false;
+ const Value *Src1Val, *Src2Val;
+ unsigned Opc = 0;
+ bool NeedExtraOp = false;
+ if (auto *CI = dyn_cast<ConstantInt>(SI->getTrueValue())) {
+ if (CI->isOne()) {
+ Src1Val = SI->getCondition();
+ Src2Val = SI->getFalseValue();
+ Opc = AArch64::ORRWrr;
+ } else {
+ assert(CI->isZero());
+ Src1Val = SI->getFalseValue();
+ Src2Val = SI->getCondition();
+ Opc = AArch64::BICWrr;
+ }
+ } else if (auto *CI = dyn_cast<ConstantInt>(SI->getFalseValue())) {
+ if (CI->isOne()) {
+ Src1Val = SI->getCondition();
+ Src2Val = SI->getTrueValue();
+ Opc = AArch64::ORRWrr;
+ NeedExtraOp = true;
+ } else {
+ assert(CI->isZero());
+ Src1Val = SI->getCondition();
+ Src2Val = SI->getTrueValue();
+ Opc = AArch64::ANDWrr;
+ }
+ }
- unsigned CondReg = getRegForValue(SI->getCondition());
- if (CondReg == 0)
- return false;
- unsigned TrueReg = getRegForValue(SI->getTrueValue());
- if (TrueReg == 0)
+ if (!Opc)
return false;
- unsigned FalseReg = getRegForValue(SI->getFalseValue());
- if (FalseReg == 0)
+
+ unsigned Src1Reg = getRegForValue(Src1Val);
+ if (!Src1Reg)
return false;
+ bool Src1IsKill = hasTrivialKill(Src1Val);
+ unsigned Src2Reg = getRegForValue(Src2Val);
+ if (!Src2Reg)
+ return false;
+ bool Src2IsKill = hasTrivialKill(Src2Val);
- MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass);
- unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
- ANDReg)
- .addReg(CondReg)
- .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+ if (NeedExtraOp) {
+ Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, Src1IsKill, 1);
+ Src1IsKill = true;
+ }
+ unsigned ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32spRegClass, Src1Reg,
+ Src1IsKill, Src2Reg, Src2IsKill);
+ updateValueMap(SI, ResultReg);
+ return true;
+}
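// Editorial sketch (annotation, not part of this patch): the four i1
// patterns above reduce a select to one or two logic ops, e.g.
//   select i1 %c, i1 true,  i1 %b  ->  orr w0, wc, wb
//   select i1 %c, i1 false, i1 %b  ->  bic w0, wb, wc
//   select i1 %c, i1 %a, i1 true   ->  eor w8, wc, #1; orr w0, w8, wa
//   select i1 %c, i1 %a, i1 false  ->  and w0, wc, wa
// (wc/wa/wb stand for the condition and operand registers.)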
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri))
- .addReg(ANDReg)
- .addReg(ANDReg)
- .addImm(0)
- .addImm(0);
+bool AArch64FastISel::selectSelect(const Instruction *I) {
+ assert(isa<SelectInst>(I) && "Expected a select instruction.");
+ MVT VT;
+ if (!isTypeSupported(I->getType(), VT))
+ return false;
- unsigned SelectOpc;
- switch (DestVT.SimpleTy) {
+ unsigned Opc;
+ const TargetRegisterClass *RC;
+ switch (VT.SimpleTy) {
default:
return false;
+ case MVT::i1:
+ case MVT::i8:
+ case MVT::i16:
case MVT::i32:
- SelectOpc = AArch64::CSELWr;
+ Opc = AArch64::CSELWr;
+ RC = &AArch64::GPR32RegClass;
break;
case MVT::i64:
- SelectOpc = AArch64::CSELXr;
+ Opc = AArch64::CSELXr;
+ RC = &AArch64::GPR64RegClass;
break;
case MVT::f32:
- SelectOpc = AArch64::FCSELSrrr;
+ Opc = AArch64::FCSELSrrr;
+ RC = &AArch64::FPR32RegClass;
break;
case MVT::f64:
- SelectOpc = AArch64::FCSELDrrr;
+ Opc = AArch64::FCSELDrrr;
+ RC = &AArch64::FPR64RegClass;
break;
}
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SelectOpc),
- ResultReg)
- .addReg(TrueReg)
- .addReg(FalseReg)
- .addImm(AArch64CC::NE);
+ const SelectInst *SI = cast<SelectInst>(I);
+ const Value *Cond = SI->getCondition();
+ AArch64CC::CondCode CC = AArch64CC::NE;
+ AArch64CC::CondCode ExtraCC = AArch64CC::AL;
- UpdateValueMap(I, ResultReg);
+ if (optimizeSelect(SI))
+ return true;
+
+  // Try to pick up the flags, so we don't have to emit another compare.
+ if (foldXALUIntrinsic(CC, I, Cond)) {
+    // Fake-request the condition to force emission of the XALU intrinsic.
+ unsigned CondReg = getRegForValue(Cond);
+ if (!CondReg)
+ return false;
+ } else if (isa<CmpInst>(Cond) && cast<CmpInst>(Cond)->hasOneUse() &&
+ isValueAvailable(Cond)) {
+ const auto *Cmp = cast<CmpInst>(Cond);
+ // Try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = optimizeCmpPredicate(Cmp);
+ const Value *FoldSelect = nullptr;
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_FALSE:
+ FoldSelect = SI->getFalseValue();
+ break;
+ case CmpInst::FCMP_TRUE:
+ FoldSelect = SI->getTrueValue();
+ break;
+ }
+
+ if (FoldSelect) {
+ unsigned SrcReg = getRegForValue(FoldSelect);
+ if (!SrcReg)
+ return false;
+ unsigned UseReg = lookUpRegForValue(SI);
+ if (UseReg)
+ MRI.clearKillFlags(UseReg);
+
+ updateValueMap(I, SrcReg);
+ return true;
+ }
+
+ // Emit the cmp.
+ if (!emitCmp(Cmp->getOperand(0), Cmp->getOperand(1), Cmp->isUnsigned()))
+ return false;
+
+ // FCMP_UEQ and FCMP_ONE cannot be checked with a single select instruction.
+ CC = getCompareCC(Predicate);
+ switch (Predicate) {
+ default:
+ break;
+ case CmpInst::FCMP_UEQ:
+ ExtraCC = AArch64CC::EQ;
+ CC = AArch64CC::VS;
+ break;
+ case CmpInst::FCMP_ONE:
+ ExtraCC = AArch64CC::MI;
+ CC = AArch64CC::GT;
+ break;
+ }
+ assert((CC != AArch64CC::AL) && "Unexpected condition code.");
+ } else {
+ unsigned CondReg = getRegForValue(Cond);
+ if (!CondReg)
+ return false;
+ bool CondIsKill = hasTrivialKill(Cond);
+
+ // Emit a TST instruction (ANDS wzr, reg, #imm).
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDSWri),
+ AArch64::WZR)
+ .addReg(CondReg, getKillRegState(CondIsKill))
+ .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+ }
+
+ unsigned Src1Reg = getRegForValue(SI->getTrueValue());
+ bool Src1IsKill = hasTrivialKill(SI->getTrueValue());
+
+ unsigned Src2Reg = getRegForValue(SI->getFalseValue());
+ bool Src2IsKill = hasTrivialKill(SI->getFalseValue());
+
+ if (!Src1Reg || !Src2Reg)
+ return false;
+
+ if (ExtraCC != AArch64CC::AL) {
+ Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg,
+ Src2IsKill, ExtraCC);
+ Src2IsKill = true;
+ }
+ unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src1IsKill, Src2Reg,
+ Src2IsKill, CC);
+ updateValueMap(I, ResultReg);
return true;
}
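// Editorial sketch (annotation, not part of this patch): in the generic
// case a select becomes a flag-setting test plus CSEL, e.g.
//   %r = select i1 %c, i32 %a, i32 %b
// emits roughly
//   ands wzr, wc, #1       ; TST wc, #1
//   csel w0, wa, wb, ne
// while FCMP_UEQ/FCMP_ONE chain a second CSEL through ExtraCC first.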
-bool AArch64FastISel::SelectFPExt(const Instruction *I) {
+bool AArch64FastISel::selectFPExt(const Instruction *I) {
Value *V = I->getOperand(0);
if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy())
return false;
@@ -1094,11 +2714,11 @@ bool AArch64FastISel::SelectFPExt(const Instruction *I) {
unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr),
ResultReg).addReg(Op);
- UpdateValueMap(I, ResultReg);
+ updateValueMap(I, ResultReg);
return true;
}
-bool AArch64FastISel::SelectFPTrunc(const Instruction *I) {
+bool AArch64FastISel::selectFPTrunc(const Instruction *I) {
Value *V = I->getOperand(0);
if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy())
return false;
@@ -1110,12 +2730,12 @@ bool AArch64FastISel::SelectFPTrunc(const Instruction *I) {
unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr),
ResultReg).addReg(Op);
- UpdateValueMap(I, ResultReg);
+ updateValueMap(I, ResultReg);
return true;
}
// FPToUI and FPToSI
-bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) {
+bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) {
MVT DestVT;
if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
return false;
@@ -1144,11 +2764,11 @@ bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) {
DestVT == MVT::i32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(SrcReg);
- UpdateValueMap(I, ResultReg);
+ updateValueMap(I, ResultReg);
return true;
}
-bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
+bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) {
MVT DestVT;
if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
return false;
@@ -1156,22 +2776,21 @@ bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
"Unexpected value type.");
unsigned SrcReg = getRegForValue(I->getOperand(0));
- if (SrcReg == 0)
+ if (!SrcReg)
return false;
+ bool SrcIsKill = hasTrivialKill(I->getOperand(0));
EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true);
// Handle sign-extension.
if (SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) {
SrcReg =
- EmitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed);
- if (SrcReg == 0)
+ emitIntExt(SrcVT.getSimpleVT(), SrcReg, MVT::i32, /*isZExt*/ !Signed);
+ if (!SrcReg)
return false;
+ SrcIsKill = true;
}
- MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? &AArch64::GPR64RegClass
- : &AArch64::GPR32RegClass);
-
unsigned Opc;
if (SrcVT == MVT::i64) {
if (Signed)
@@ -1185,21 +2804,128 @@ bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri;
}
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addReg(SrcReg);
- UpdateValueMap(I, ResultReg);
+ unsigned ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg,
+ SrcIsKill);
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::fastLowerArguments() {
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ const Function *F = FuncInfo.Fn;
+ if (F->isVarArg())
+ return false;
+
+ CallingConv::ID CC = F->getCallingConv();
+ if (CC != CallingConv::C)
+ return false;
+
+ // Only handle simple cases of up to 8 GPR and FPR each.
+ unsigned GPRCnt = 0;
+ unsigned FPRCnt = 0;
+ unsigned Idx = 0;
+ for (auto const &Arg : F->args()) {
+ // The first argument is at index 1.
+ ++Idx;
+ if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+ return false;
+
+ Type *ArgTy = Arg.getType();
+ if (ArgTy->isStructTy() || ArgTy->isArrayTy())
+ return false;
+
+ EVT ArgVT = TLI.getValueType(ArgTy);
+ if (!ArgVT.isSimple())
+ return false;
+
+ MVT VT = ArgVT.getSimpleVT().SimpleTy;
+ if (VT.isFloatingPoint() && !Subtarget->hasFPARMv8())
+ return false;
+
+ if (VT.isVector() &&
+ (!Subtarget->hasNEON() || !Subtarget->isLittleEndian()))
+ return false;
+
+ if (VT >= MVT::i1 && VT <= MVT::i64)
+ ++GPRCnt;
+ else if ((VT >= MVT::f16 && VT <= MVT::f64) || VT.is64BitVector() ||
+ VT.is128BitVector())
+ ++FPRCnt;
+ else
+ return false;
+
+ if (GPRCnt > 8 || FPRCnt > 8)
+ return false;
+ }
+
+ static const MCPhysReg Registers[6][8] = {
+ { AArch64::W0, AArch64::W1, AArch64::W2, AArch64::W3, AArch64::W4,
+ AArch64::W5, AArch64::W6, AArch64::W7 },
+ { AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4,
+ AArch64::X5, AArch64::X6, AArch64::X7 },
+ { AArch64::H0, AArch64::H1, AArch64::H2, AArch64::H3, AArch64::H4,
+ AArch64::H5, AArch64::H6, AArch64::H7 },
+ { AArch64::S0, AArch64::S1, AArch64::S2, AArch64::S3, AArch64::S4,
+ AArch64::S5, AArch64::S6, AArch64::S7 },
+ { AArch64::D0, AArch64::D1, AArch64::D2, AArch64::D3, AArch64::D4,
+ AArch64::D5, AArch64::D6, AArch64::D7 },
+ { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4,
+ AArch64::Q5, AArch64::Q6, AArch64::Q7 }
+ };
+
+ unsigned GPRIdx = 0;
+ unsigned FPRIdx = 0;
+ for (auto const &Arg : F->args()) {
+ MVT VT = TLI.getSimpleValueType(Arg.getType());
+ unsigned SrcReg;
+ const TargetRegisterClass *RC;
+ if (VT >= MVT::i1 && VT <= MVT::i32) {
+ SrcReg = Registers[0][GPRIdx++];
+ RC = &AArch64::GPR32RegClass;
+ VT = MVT::i32;
+ } else if (VT == MVT::i64) {
+ SrcReg = Registers[1][GPRIdx++];
+ RC = &AArch64::GPR64RegClass;
+ } else if (VT == MVT::f16) {
+ SrcReg = Registers[2][FPRIdx++];
+ RC = &AArch64::FPR16RegClass;
+ } else if (VT == MVT::f32) {
+ SrcReg = Registers[3][FPRIdx++];
+ RC = &AArch64::FPR32RegClass;
+ } else if ((VT == MVT::f64) || VT.is64BitVector()) {
+ SrcReg = Registers[4][FPRIdx++];
+ RC = &AArch64::FPR64RegClass;
+ } else if (VT.is128BitVector()) {
+ SrcReg = Registers[5][FPRIdx++];
+ RC = &AArch64::FPR128RegClass;
+ } else
+ llvm_unreachable("Unexpected value type.");
+
+ unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+ // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+ // Without this, EmitLiveInCopies may eliminate the livein if its only
+ // use is a bitcast (which isn't turned into an instruction).
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(DstReg, getKillRegState(true));
+ updateValueMap(&Arg, ResultReg);
+ }
return true;
}
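// Editorial sketch (annotation, not part of this patch): for a function
// such as
//   define i32 @f(i32 %a, i64 %b, double %d)
// the loop above picks %a -> W0, %b -> X1 (one shared GPR index) and
// %d -> D0, registers each as a live-in, and copies it into a fresh
// virtual register before updating the value map.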
-bool AArch64FastISel::ProcessCallArgs(
- SmallVectorImpl<Value *> &Args, SmallVectorImpl<unsigned> &ArgRegs,
- SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
- SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC,
- unsigned &NumBytes) {
+bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
+ SmallVectorImpl<MVT> &OutVTs,
+ unsigned &NumBytes) {
+ CallingConv::ID CC = CLI.CallConv;
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context);
- CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC));
+ CCState CCInfo(CC, false, *FuncInfo.MF, ArgLocs, *Context);
+ CCInfo.AnalyzeCallOperands(OutVTs, CLI.OutFlags, CCAssignFnForCall(CC));
// Get a count of how many bytes are to be pushed on the stack.
NumBytes = CCInfo.getNextStackOffset();
@@ -1207,13 +2933,17 @@ bool AArch64FastISel::ProcessCallArgs(
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes);
+ .addImm(NumBytes);
// Process the args.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
- unsigned Arg = ArgRegs[VA.getValNo()];
- MVT ArgVT = ArgVTs[VA.getValNo()];
+ const Value *ArgVal = CLI.OutVals[VA.getValNo()];
+ MVT ArgVT = OutVTs[VA.getValNo()];
+
+ unsigned ArgReg = getRegForValue(ArgVal);
+ if (!ArgReg)
+ return false;
// Handle arg promotion: SExt, ZExt, AExt.
switch (VA.getLocInfo()) {
@@ -1222,8 +2952,8 @@ bool AArch64FastISel::ProcessCallArgs(
case CCValAssign::SExt: {
MVT DestVT = VA.getLocVT();
MVT SrcVT = ArgVT;
- Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ false);
- if (Arg == 0)
+ ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/false);
+ if (!ArgReg)
return false;
break;
}
@@ -1232,8 +2962,8 @@ bool AArch64FastISel::ProcessCallArgs(
case CCValAssign::ZExt: {
MVT DestVT = VA.getLocVT();
MVT SrcVT = ArgVT;
- Arg = EmitIntExt(SrcVT, Arg, DestVT, /*isZExt*/ true);
- if (Arg == 0)
+ ArgReg = emitIntExt(SrcVT, ArgReg, DestVT, /*isZExt=*/true);
+ if (!ArgReg)
return false;
break;
}
@@ -1244,14 +2974,18 @@ bool AArch64FastISel::ProcessCallArgs(
// Now copy/store arg to correct locations.
if (VA.isRegLoc() && !VA.needsCustom()) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(Arg);
- RegArgs.push_back(VA.getLocReg());
+ TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
+ CLI.OutRegs.push_back(VA.getLocReg());
} else if (VA.needsCustom()) {
// FIXME: Handle custom args.
return false;
} else {
assert(VA.isMemLoc() && "Assuming store on stack.");
+ // Don't emit stores for undef values.
+ if (isa<UndefValue>(ArgVal))
+ continue;
+
// Need to store on the stack.
unsigned ArgSize = (ArgVT.getSizeInBits() + 7) / 8;
@@ -1264,26 +2998,31 @@ bool AArch64FastISel::ProcessCallArgs(
Addr.setReg(AArch64::SP);
Addr.setOffset(VA.getLocMemOffset() + BEAlign);
- if (!EmitStore(ArgVT, Arg, Addr))
+ unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+ MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
+ MachinePointerInfo::getStack(Addr.getOffset()),
+ MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
+
+ if (!emitStore(ArgVT, ArgReg, Addr, MMO))
return false;
}
}
return true;
}
-bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
- const Instruction *I, CallingConv::ID CC,
- unsigned &NumBytes) {
+bool AArch64FastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
+ unsigned NumBytes) {
+ CallingConv::ID CC = CLI.CallConv;
+
// Issue CALLSEQ_END
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
- .addImm(NumBytes)
- .addImm(0);
+ .addImm(NumBytes).addImm(0);
// Now the return value.
if (RetVT != MVT::isVoid) {
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CC, false, *FuncInfo.MF, TM, RVLocs, *Context);
+ CCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context);
CCInfo.AnalyzeCallResult(RetVT, CCAssignFnForCall(CC));
// Only handle a single return value.
@@ -1294,147 +3033,147 @@ bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
MVT CopyVT = RVLocs[0].getValVT();
unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY),
- ResultReg).addReg(RVLocs[0].getLocReg());
- UsedRegs.push_back(RVLocs[0].getLocReg());
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(RVLocs[0].getLocReg());
+ CLI.InRegs.push_back(RVLocs[0].getLocReg());
- // Finally update the result.
- UpdateValueMap(I, ResultReg);
+ CLI.ResultReg = ResultReg;
+ CLI.NumResultRegs = 1;
}
return true;
}
-bool AArch64FastISel::SelectCall(const Instruction *I,
- const char *IntrMemName = nullptr) {
- const CallInst *CI = cast<CallInst>(I);
- const Value *Callee = CI->getCalledValue();
+bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
+ CallingConv::ID CC = CLI.CallConv;
+ bool IsTailCall = CLI.IsTailCall;
+ bool IsVarArg = CLI.IsVarArg;
+ const Value *Callee = CLI.Callee;
+ const char *SymName = CLI.SymName;
- // Don't handle inline asm or intrinsics.
- if (isa<InlineAsm>(Callee))
+ if (!Callee && !SymName)
return false;
- // Only handle global variable Callees.
- const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
- if (!GV)
+ // Allow SelectionDAG isel to handle tail calls.
+ if (IsTailCall)
return false;
- // Check the calling convention.
- ImmutableCallSite CS(CI);
- CallingConv::ID CC = CS.getCallingConv();
+ CodeModel::Model CM = TM.getCodeModel();
+  // Only support the small and large code models.
+ if (CM != CodeModel::Small && CM != CodeModel::Large)
+ return false;
+
+ // FIXME: Add large code model support for ELF.
+ if (CM == CodeModel::Large && !Subtarget->isTargetMachO())
+ return false;
// Let SDISel handle vararg functions.
- PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
- FunctionType *FTy = cast<FunctionType>(PT->getElementType());
- if (FTy->isVarArg())
+ if (IsVarArg)
return false;
- // Handle *simple* calls for now.
+ // FIXME: Only handle *simple* calls for now.
MVT RetVT;
- Type *RetTy = I->getType();
- if (RetTy->isVoidTy())
+ if (CLI.RetTy->isVoidTy())
RetVT = MVT::isVoid;
- else if (!isTypeLegal(RetTy, RetVT))
+ else if (!isTypeLegal(CLI.RetTy, RetVT))
return false;
- // Set up the argument vectors.
- SmallVector<Value *, 8> Args;
- SmallVector<unsigned, 8> ArgRegs;
- SmallVector<MVT, 8> ArgVTs;
- SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
- Args.reserve(CS.arg_size());
- ArgRegs.reserve(CS.arg_size());
- ArgVTs.reserve(CS.arg_size());
- ArgFlags.reserve(CS.arg_size());
-
- for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
- i != e; ++i) {
- // If we're lowering a memory intrinsic instead of a regular call, skip the
- // last two arguments, which shouldn't be passed to the underlying function.
- if (IntrMemName && e - i <= 2)
- break;
-
- unsigned Arg = getRegForValue(*i);
- if (Arg == 0)
+ for (auto Flag : CLI.OutFlags)
+ if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal())
return false;
- ISD::ArgFlagsTy Flags;
- unsigned AttrInd = i - CS.arg_begin() + 1;
- if (CS.paramHasAttr(AttrInd, Attribute::SExt))
- Flags.setSExt();
- if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
- Flags.setZExt();
-
- // FIXME: Only handle *easy* calls for now.
- if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
- CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
- CS.paramHasAttr(AttrInd, Attribute::Nest) ||
- CS.paramHasAttr(AttrInd, Attribute::ByVal))
- return false;
+ // Set up the argument vectors.
+ SmallVector<MVT, 16> OutVTs;
+ OutVTs.reserve(CLI.OutVals.size());
- MVT ArgVT;
- Type *ArgTy = (*i)->getType();
- if (!isTypeLegal(ArgTy, ArgVT) &&
- !(ArgVT == MVT::i1 || ArgVT == MVT::i8 || ArgVT == MVT::i16))
+ for (auto *Val : CLI.OutVals) {
+ MVT VT;
+ if (!isTypeLegal(Val->getType(), VT) &&
+ !(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16))
return false;
// We don't handle vector parameters yet.
- if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64)
+ if (VT.isVector() || VT.getSizeInBits() > 64)
return false;
- unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
- Flags.setOrigAlign(OriginalAlignment);
-
- Args.push_back(*i);
- ArgRegs.push_back(Arg);
- ArgVTs.push_back(ArgVT);
- ArgFlags.push_back(Flags);
+ OutVTs.push_back(VT);
}
+ Address Addr;
+ if (Callee && !computeCallAddress(Callee, Addr))
+ return false;
+
// Handle the arguments now that we've gotten them.
- SmallVector<unsigned, 4> RegArgs;
unsigned NumBytes;
- if (!ProcessCallArgs(Args, ArgRegs, ArgVTs, ArgFlags, RegArgs, CC, NumBytes))
+ if (!processCallArgs(CLI, OutVTs, NumBytes))
return false;
// Issue the call.
MachineInstrBuilder MIB;
- MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BL));
- if (!IntrMemName)
- MIB.addGlobalAddress(GV, 0, 0);
- else
- MIB.addExternalSymbol(IntrMemName, 0);
+ if (CM == CodeModel::Small) {
+ const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II);
+ if (SymName)
+ MIB.addExternalSymbol(SymName, 0);
+ else if (Addr.getGlobalValue())
+ MIB.addGlobalAddress(Addr.getGlobalValue(), 0, 0);
+ else if (Addr.getReg()) {
+ unsigned Reg = constrainOperandRegClass(II, Addr.getReg(), 0);
+ MIB.addReg(Reg);
+ } else
+ return false;
+ } else {
+ unsigned CallReg = 0;
+ if (SymName) {
+ unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+ ADRPReg)
+ .addExternalSymbol(SymName, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+
+ CallReg = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
+ CallReg)
+ .addReg(ADRPReg)
+ .addExternalSymbol(SymName, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC);
+ } else if (Addr.getGlobalValue())
+ CallReg = materializeGV(Addr.getGlobalValue());
+ else if (Addr.getReg())
+ CallReg = Addr.getReg();
+
+ if (!CallReg)
+ return false;
+
+ const MCInstrDesc &II = TII.get(AArch64::BLR);
+ CallReg = constrainOperandRegClass(II, CallReg, 0);
+ MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(CallReg);
+ }
// Add implicit physical register uses to the call.
- for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
- MIB.addReg(RegArgs[i], RegState::Implicit);
+ for (auto Reg : CLI.OutRegs)
+ MIB.addReg(Reg, RegState::Implicit);
// Add a register mask with the call-preserved registers.
// Proper defs for return values will be added by setPhysRegsDeadExcept().
- MIB.addRegMask(TRI.getCallPreservedMask(CS.getCallingConv()));
-
- // Finish off the call including any return values.
- SmallVector<unsigned, 4> UsedRegs;
- if (!FinishCall(RetVT, UsedRegs, I, CC, NumBytes))
- return false;
+ MIB.addRegMask(TRI.getCallPreservedMask(CC));
- // Set all unused physreg defs as dead.
- static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+ CLI.Call = MIB;
- return true;
+ // Finish off the call including any return values.
+ return finishCall(CLI, RetVT, NumBytes);
}
-bool AArch64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) {
+bool AArch64FastISel::isMemCpySmall(uint64_t Len, unsigned Alignment) {
if (Alignment)
return Len / Alignment <= 4;
else
return Len < 32;
}
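// Editorial sketch (annotation, not part of this patch): under this
// heuristic a 16-byte copy with 8-byte alignment is inlined (16/8 = 2 <= 4
// accesses), while an unaligned 40-byte copy (40 >= 32) stays a library call.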
-bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src,
+bool AArch64FastISel::tryEmitSmallMemCpy(Address Dest, Address Src,
uint64_t Len, unsigned Alignment) {
   // Make sure we don't bloat code by inlining very large memcpys.
- if (!IsMemCpySmall(Len, Alignment))
+ if (!isMemCpySmall(Len, Alignment))
return false;
int64_t UnscaledOffset = 0;
@@ -1464,14 +3203,11 @@ bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src,
}
}
- bool RV;
- unsigned ResultReg;
- RV = EmitLoad(VT, ResultReg, Src);
- if (!RV)
+ unsigned ResultReg = emitLoad(VT, VT, Src);
+ if (!ResultReg)
return false;
- RV = EmitStore(VT, ResultReg, Dest);
- if (!RV)
+ if (!emitStore(VT, ResultReg, Dest))
return false;
int64_t Size = VT.getSizeInBits() / 8;
@@ -1486,73 +3222,430 @@ bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src,
return true;
}
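// Editorial sketch (annotation, not part of this patch): an inlined 16-byte
// copy with 8-byte alignment therefore becomes two i64 load/store pairs,
// roughly
//   ldr x8, [x1]     ; str x8, [x0]
//   ldr x8, [x1, #8] ; str x8, [x0, #8]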
-bool AArch64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
- // FIXME: Handle more intrinsics.
- switch (I.getIntrinsicID()) {
+/// \brief Check if it is possible to fold the condition from the XALU intrinsic
+/// into the user. The condition code will only be updated on success.
+bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC,
+ const Instruction *I,
+ const Value *Cond) {
+ if (!isa<ExtractValueInst>(Cond))
+ return false;
+
+ const auto *EV = cast<ExtractValueInst>(Cond);
+ if (!isa<IntrinsicInst>(EV->getAggregateOperand()))
+ return false;
+
+ const auto *II = cast<IntrinsicInst>(EV->getAggregateOperand());
+ MVT RetVT;
+ const Function *Callee = II->getCalledFunction();
+ Type *RetTy =
+ cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U);
+ if (!isTypeLegal(RetTy, RetVT))
+ return false;
+
+ if (RetVT != MVT::i32 && RetVT != MVT::i64)
+ return false;
+
+ const Value *LHS = II->getArgOperand(0);
+ const Value *RHS = II->getArgOperand(1);
+
+ // Canonicalize immediate to the RHS.
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+ isCommutativeIntrinsic(II))
+ std::swap(LHS, RHS);
+
+ // Simplify multiplies.
+ unsigned IID = II->getIntrinsicID();
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::smul_with_overflow:
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 2)
+ IID = Intrinsic::sadd_with_overflow;
+ break;
+ case Intrinsic::umul_with_overflow:
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 2)
+ IID = Intrinsic::uadd_with_overflow;
+ break;
+ }
+
+ AArch64CC::CondCode TmpCC;
+ switch (IID) {
default:
return false;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ TmpCC = AArch64CC::VS;
+ break;
+ case Intrinsic::uadd_with_overflow:
+ TmpCC = AArch64CC::HS;
+ break;
+ case Intrinsic::usub_with_overflow:
+ TmpCC = AArch64CC::LO;
+ break;
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ TmpCC = AArch64CC::NE;
+ break;
+ }
+
+ // Check if both instructions are in the same basic block.
+ if (!isValueAvailable(II))
+ return false;
+
+  // Make sure nothing is in the way between the intrinsic and its user.
+ BasicBlock::const_iterator Start = I;
+ BasicBlock::const_iterator End = II;
+ for (auto Itr = std::prev(Start); Itr != End; --Itr) {
+ // We only expect extractvalue instructions between the intrinsic and the
+ // instruction to be selected.
+ if (!isa<ExtractValueInst>(Itr))
+ return false;
+
+ // Check that the extractvalue operand comes from the intrinsic.
+ const auto *EVI = cast<ExtractValueInst>(Itr);
+ if (EVI->getAggregateOperand() != II)
+ return false;
+ }
+
+ CC = TmpCC;
+ return true;
+}
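// Editorial sketch (annotation, not part of this patch): this fold lets a
// sequence such as
//   %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
//   %o = extractvalue { i32, i1 } %s, 1
//   br i1 %o, label %ovf, label %cont
// reuse the flags of the ADDS that implements the intrinsic, so the branch
// is a single "b.vs" instead of materializing and testing %o.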
+
+bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
+ // FIXME: Handle more intrinsics.
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::frameaddress: {
+ MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
+ MFI->setFrameAddressIsTaken(true);
+
+ const AArch64RegisterInfo *RegInfo =
+ static_cast<const AArch64RegisterInfo *>(
+ TM.getSubtargetImpl()->getRegisterInfo());
+ unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
+ unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), SrcReg).addReg(FramePtr);
+    // Recursively load the frame address:
+ // ldr x0, [fp]
+ // ldr x0, [x0]
+ // ldr x0, [x0]
+ // ...
+ unsigned DestReg;
+ unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
+ while (Depth--) {
+ DestReg = fastEmitInst_ri(AArch64::LDRXui, &AArch64::GPR64RegClass,
+ SrcReg, /*IsKill=*/true, 0);
+ assert(DestReg && "Unexpected LDR instruction emission failure.");
+ SrcReg = DestReg;
+ }
+
+ updateValueMap(II, SrcReg);
+ return true;
+ }
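// Editorial sketch (annotation, not part of this patch): a depth-2 request,
// "call i8* @llvm.frameaddress(i32 2)", thus becomes roughly
//   mov x8, x29      ; copy of the frame pointer
//   ldr x8, [x8]     ; walk up one frame
//   ldr x8, [x8]     ; depth 2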
case Intrinsic::memcpy:
case Intrinsic::memmove: {
- const MemTransferInst &MTI = cast<MemTransferInst>(I);
+ const auto *MTI = cast<MemTransferInst>(II);
// Don't handle volatile.
- if (MTI.isVolatile())
+ if (MTI->isVolatile())
return false;
// Disable inlining for memmove before calls to ComputeAddress. Otherwise,
// we would emit dead code because we don't currently handle memmoves.
- bool isMemCpy = (I.getIntrinsicID() == Intrinsic::memcpy);
- if (isa<ConstantInt>(MTI.getLength()) && isMemCpy) {
+ bool IsMemCpy = (II->getIntrinsicID() == Intrinsic::memcpy);
+ if (isa<ConstantInt>(MTI->getLength()) && IsMemCpy) {
      // Small memcpys are common enough that we want to do them without a call
// if possible.
- uint64_t Len = cast<ConstantInt>(MTI.getLength())->getZExtValue();
- unsigned Alignment = MTI.getAlignment();
- if (IsMemCpySmall(Len, Alignment)) {
+ uint64_t Len = cast<ConstantInt>(MTI->getLength())->getZExtValue();
+ unsigned Alignment = MTI->getAlignment();
+ if (isMemCpySmall(Len, Alignment)) {
Address Dest, Src;
- if (!ComputeAddress(MTI.getRawDest(), Dest) ||
- !ComputeAddress(MTI.getRawSource(), Src))
+ if (!computeAddress(MTI->getRawDest(), Dest) ||
+ !computeAddress(MTI->getRawSource(), Src))
return false;
- if (TryEmitSmallMemCpy(Dest, Src, Len, Alignment))
+ if (tryEmitSmallMemCpy(Dest, Src, Len, Alignment))
return true;
}
}
- if (!MTI.getLength()->getType()->isIntegerTy(64))
+ if (!MTI->getLength()->getType()->isIntegerTy(64))
return false;
- if (MTI.getSourceAddressSpace() > 255 || MTI.getDestAddressSpace() > 255)
+ if (MTI->getSourceAddressSpace() > 255 || MTI->getDestAddressSpace() > 255)
// Fast instruction selection doesn't support the special
// address spaces.
return false;
- const char *IntrMemName = isa<MemCpyInst>(I) ? "memcpy" : "memmove";
- return SelectCall(&I, IntrMemName);
+ const char *IntrMemName = isa<MemCpyInst>(II) ? "memcpy" : "memmove";
+ return lowerCallTo(II, IntrMemName, II->getNumArgOperands() - 2);
}
case Intrinsic::memset: {
- const MemSetInst &MSI = cast<MemSetInst>(I);
+ const MemSetInst *MSI = cast<MemSetInst>(II);
// Don't handle volatile.
- if (MSI.isVolatile())
+ if (MSI->isVolatile())
return false;
- if (!MSI.getLength()->getType()->isIntegerTy(64))
+ if (!MSI->getLength()->getType()->isIntegerTy(64))
return false;
- if (MSI.getDestAddressSpace() > 255)
+ if (MSI->getDestAddressSpace() > 255)
// Fast instruction selection doesn't support the special
// address spaces.
return false;
- return SelectCall(&I, "memset");
+ return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
+ }
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ case Intrinsic::pow: {
+ MVT RetVT;
+ if (!isTypeLegal(II->getType(), RetVT))
+ return false;
+
+ if (RetVT != MVT::f32 && RetVT != MVT::f64)
+ return false;
+
+ static const RTLIB::Libcall LibCallTable[3][2] = {
+ { RTLIB::SIN_F32, RTLIB::SIN_F64 },
+ { RTLIB::COS_F32, RTLIB::COS_F64 },
+ { RTLIB::POW_F32, RTLIB::POW_F64 }
+ };
+ RTLIB::Libcall LC;
+ bool Is64Bit = RetVT == MVT::f64;
+ switch (II->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Unexpected intrinsic.");
+ case Intrinsic::sin:
+ LC = LibCallTable[0][Is64Bit];
+ break;
+ case Intrinsic::cos:
+ LC = LibCallTable[1][Is64Bit];
+ break;
+ case Intrinsic::pow:
+ LC = LibCallTable[2][Is64Bit];
+ break;
+ }
+
+ ArgListTy Args;
+ Args.reserve(II->getNumArgOperands());
+
+ // Populate the argument list.
+ for (auto &Arg : II->arg_operands()) {
+ ArgListEntry Entry;
+ Entry.Val = Arg;
+ Entry.Ty = Arg->getType();
+ Args.push_back(Entry);
+ }
+
+ CallLoweringInfo CLI;
+ CLI.setCallee(TLI.getLibcallCallingConv(LC), II->getType(),
+ TLI.getLibcallName(LC), std::move(Args));
+ if (!lowerCallTo(CLI))
+ return false;
+ updateValueMap(II, CLI.ResultReg);
+ return true;
+ }
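// Editorial sketch (annotation, not part of this patch): there is no
// fast-isel pattern for FSIN/FCOS/FPOW, so e.g. @llvm.sin.f64 is lowered as
// an ordinary call to the "sin" routine named by TLI.getLibcallName(LC),
// reusing the generic lowerCallTo machinery above.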
+ case Intrinsic::fabs: {
+ MVT VT;
+ if (!isTypeLegal(II->getType(), VT))
+ return false;
+
+ unsigned Opc;
+ switch (VT.SimpleTy) {
+ default:
+ return false;
+ case MVT::f32:
+ Opc = AArch64::FABSSr;
+ break;
+ case MVT::f64:
+ Opc = AArch64::FABSDr;
+ break;
+ }
+ unsigned SrcReg = getRegForValue(II->getOperand(0));
+ if (!SrcReg)
+ return false;
+ bool SrcRegIsKill = hasTrivialKill(II->getOperand(0));
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+ .addReg(SrcReg, getKillRegState(SrcRegIsKill));
+ updateValueMap(II, ResultReg);
+ return true;
}
case Intrinsic::trap: {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
.addImm(1);
return true;
}
+ case Intrinsic::sqrt: {
+ Type *RetTy = II->getCalledFunction()->getReturnType();
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ unsigned Op0Reg = getRegForValue(II->getOperand(0));
+ if (!Op0Reg)
+ return false;
+ bool Op0IsKill = hasTrivialKill(II->getOperand(0));
+
+ unsigned ResultReg = fastEmit_r(VT, VT, ISD::FSQRT, Op0Reg, Op0IsKill);
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow: {
+    // This implements the basic lowering of the XALU-with-overflow intrinsics.
+ const Function *Callee = II->getCalledFunction();
+ auto *Ty = cast<StructType>(Callee->getReturnType());
+ Type *RetTy = Ty->getTypeAtIndex(0U);
+
+ MVT VT;
+ if (!isTypeLegal(RetTy, VT))
+ return false;
+
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return false;
+
+ const Value *LHS = II->getArgOperand(0);
+ const Value *RHS = II->getArgOperand(1);
+ // Canonicalize immediate to the RHS.
+ if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
+ isCommutativeIntrinsic(II))
+ std::swap(LHS, RHS);
+
+ // Simplify multiplies.
+ unsigned IID = II->getIntrinsicID();
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::smul_with_overflow:
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 2) {
+ IID = Intrinsic::sadd_with_overflow;
+ RHS = LHS;
+ }
+ break;
+ case Intrinsic::umul_with_overflow:
+ if (const auto *C = dyn_cast<ConstantInt>(RHS))
+ if (C->getValue() == 2) {
+ IID = Intrinsic::uadd_with_overflow;
+ RHS = LHS;
+ }
+ break;
+ }
+
+ unsigned ResultReg1 = 0, ResultReg2 = 0, MulReg = 0;
+ AArch64CC::CondCode CC = AArch64CC::Invalid;
+ switch (IID) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::sadd_with_overflow:
+ ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true);
+ CC = AArch64CC::VS;
+ break;
+ case Intrinsic::uadd_with_overflow:
+ ResultReg1 = emitAdd(VT, LHS, RHS, /*SetFlags=*/true);
+ CC = AArch64CC::HS;
+ break;
+ case Intrinsic::ssub_with_overflow:
+ ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true);
+ CC = AArch64CC::VS;
+ break;
+ case Intrinsic::usub_with_overflow:
+ ResultReg1 = emitSub(VT, LHS, RHS, /*SetFlags=*/true);
+ CC = AArch64CC::LO;
+ break;
+ case Intrinsic::smul_with_overflow: {
+ CC = AArch64CC::NE;
+ unsigned LHSReg = getRegForValue(LHS);
+ if (!LHSReg)
+ return false;
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned RHSReg = getRegForValue(RHS);
+ if (!RHSReg)
+ return false;
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ if (VT == MVT::i32) {
+ MulReg = emitSMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+ unsigned ShiftReg = emitLSR_ri(MVT::i64, MVT::i64, MulReg,
+ /*IsKill=*/false, 32);
+ MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
+ AArch64::sub_32);
+ ShiftReg = fastEmitInst_extractsubreg(VT, ShiftReg, /*IsKill=*/true,
+ AArch64::sub_32);
+ emitSubs_rs(VT, ShiftReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
+ AArch64_AM::ASR, 31, /*WantResult=*/false);
+ } else {
+ assert(VT == MVT::i64 && "Unexpected value type.");
+ MulReg = emitMul_rr(VT, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+ unsigned SMULHReg = fastEmit_rr(VT, VT, ISD::MULHS, LHSReg, LHSIsKill,
+ RHSReg, RHSIsKill);
+ emitSubs_rs(VT, SMULHReg, /*IsKill=*/true, MulReg, /*IsKill=*/false,
+ AArch64_AM::ASR, 63, /*WantResult=*/false);
+ }
+ break;
+ }
+ case Intrinsic::umul_with_overflow: {
+ CC = AArch64CC::NE;
+ unsigned LHSReg = getRegForValue(LHS);
+ if (!LHSReg)
+ return false;
+ bool LHSIsKill = hasTrivialKill(LHS);
+
+ unsigned RHSReg = getRegForValue(RHS);
+ if (!RHSReg)
+ return false;
+ bool RHSIsKill = hasTrivialKill(RHS);
+
+ if (VT == MVT::i32) {
+ MulReg = emitUMULL_rr(MVT::i64, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+ emitSubs_rs(MVT::i64, AArch64::XZR, /*IsKill=*/true, MulReg,
+ /*IsKill=*/false, AArch64_AM::LSR, 32,
+ /*WantResult=*/false);
+ MulReg = fastEmitInst_extractsubreg(VT, MulReg, /*IsKill=*/true,
+ AArch64::sub_32);
+ } else {
+ assert(VT == MVT::i64 && "Unexpected value type.");
+ MulReg = emitMul_rr(VT, LHSReg, LHSIsKill, RHSReg, RHSIsKill);
+ unsigned UMULHReg = fastEmit_rr(VT, VT, ISD::MULHU, LHSReg, LHSIsKill,
+ RHSReg, RHSIsKill);
+ emitSubs_rr(VT, AArch64::XZR, /*IsKill=*/true, UMULHReg,
+ /*IsKill=*/false, /*WantResult=*/false);
+ }
+ break;
+ }
+ }
+
+ if (MulReg) {
+ ResultReg1 = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg1).addReg(MulReg);
+ }
+
+ ResultReg2 = fastEmitInst_rri(AArch64::CSINCWr, &AArch64::GPR32RegClass,
+ AArch64::WZR, /*IsKill=*/true, AArch64::WZR,
+ /*IsKill=*/true, getInvertedCondCode(CC));
+ (void)ResultReg2;
+ assert((ResultReg1 + 1) == ResultReg2 &&
+ "Nonconsecutive result registers.");
+ updateValueMap(II, ResultReg1, 2);
+ return true;
+ }
}
return false;
}
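// Editorial sketch (annotation, not part of this patch): for
//   {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
// the 32-bit path above emits roughly
//   umull x8, w0, w1            ; 64-bit product
//   cmp   xzr, x8, lsr #32      ; NE iff the high half is non-zero
//   mov   w9, w8                ; low half = the i32 result value
//   csinc w10, wzr, wzr, eq     ; overflow flag for the struct's i1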
-bool AArch64FastISel::SelectRet(const Instruction *I) {
+bool AArch64FastISel::selectRet(const Instruction *I) {
const ReturnInst *Ret = cast<ReturnInst>(I);
const Function &F = *I->getParent()->getParent();
@@ -1572,8 +3665,7 @@ bool AArch64FastISel::SelectRet(const Instruction *I) {
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
- CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
- I->getContext());
+ CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
CCInfo.AnalyzeReturn(Outs, RetCC);
@@ -1586,11 +3678,14 @@ bool AArch64FastISel::SelectRet(const Instruction *I) {
const Value *RV = Ret->getOperand(0);
// Don't bother handling odd stuff for now.
- if (VA.getLocInfo() != CCValAssign::Full)
+ if ((VA.getLocInfo() != CCValAssign::Full) &&
+ (VA.getLocInfo() != CCValAssign::BCvt))
return false;
+
// Only handle register returns for now.
if (!VA.isRegLoc())
return false;
+
unsigned Reg = getRegForValue(RV);
if (Reg == 0)
return false;
@@ -1606,12 +3701,14 @@ bool AArch64FastISel::SelectRet(const Instruction *I) {
return false;
// Vectors (of > 1 lane) in big endian need tricky handling.
- if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1)
+ if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1 &&
+ !Subtarget->isLittleEndian())
return false;
MVT RVVT = RVEVT.getSimpleVT();
if (RVVT == MVT::f128)
return false;
+
MVT DestVT = VA.getValVT();
// Special handling for extended integers.
if (RVVT != DestVT) {
@@ -1621,8 +3718,8 @@ bool AArch64FastISel::SelectRet(const Instruction *I) {
if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
return false;
- bool isZExt = Outs[0].Flags.isZExt();
- SrcReg = EmitIntExt(RVVT, SrcReg, DestVT, isZExt);
+ bool IsZExt = Outs[0].Flags.isZExt();
+ SrcReg = emitIntExt(RVVT, SrcReg, DestVT, IsZExt);
if (SrcReg == 0)
return false;
}
@@ -1642,7 +3739,7 @@ bool AArch64FastISel::SelectRet(const Instruction *I) {
return true;
}
-bool AArch64FastISel::SelectTrunc(const Instruction *I) {
+bool AArch64FastISel::selectTrunc(const Instruction *I) {
Type *DestTy = I->getType();
Value *Op = I->getOperand(0);
Type *SrcTy = Op->getType();
@@ -1667,10 +3764,14 @@ bool AArch64FastISel::SelectTrunc(const Instruction *I) {
unsigned SrcReg = getRegForValue(Op);
if (!SrcReg)
return false;
+ bool SrcIsKill = hasTrivialKill(Op);
// If we're truncating from i64 to a smaller non-legal type then generate an
- // AND. Otherwise, we know the high bits are undefined and a truncate doesn't
- // generate any code.
+  // AND. Otherwise, we know the high bits are undefined and a truncate only
+  // generates a COPY. We cannot also mark the source register as the result
+  // register, because this can incorrectly transfer the kill flag onto the
+  // source register.
+ unsigned ResultReg;
if (SrcVT == MVT::i64) {
uint64_t Mask = 0;
switch (DestVT.SimpleTy) {
@@ -1688,23 +3789,23 @@ bool AArch64FastISel::SelectTrunc(const Instruction *I) {
break;
}
// Issue an extract_subreg to get the lower 32-bits.
- unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true,
+ unsigned Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg, SrcIsKill,
AArch64::sub_32);
- MRI.constrainRegClass(Reg32, &AArch64::GPR32RegClass);
// Create the AND instruction which performs the actual truncation.
- unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
- ANDReg)
- .addReg(Reg32)
- .addImm(AArch64_AM::encodeLogicalImmediate(Mask, 32));
- SrcReg = ANDReg;
+ ResultReg = emitAnd_ri(MVT::i32, Reg32, /*IsKill=*/true, Mask);
+ assert(ResultReg && "Unexpected AND instruction emission failure.");
+ } else {
+ ResultReg = createResultReg(&AArch64::GPR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(SrcReg, getKillRegState(SrcIsKill));
}
- UpdateValueMap(I, SrcReg);
+ updateValueMap(I, ResultReg);
return true;
}
-unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
+unsigned AArch64FastISel::emiti1Ext(unsigned SrcReg, MVT DestVT, bool IsZExt) {
assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 ||
DestVT == MVT::i64) &&
"Unexpected value type.");
@@ -1712,14 +3813,9 @@ unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
if (DestVT == MVT::i8 || DestVT == MVT::i16)
DestVT = MVT::i32;
- if (isZExt) {
- MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
- unsigned ResultReg = createResultReg(&AArch64::GPR32spRegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
- ResultReg)
- .addReg(SrcReg)
- .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
-
+ if (IsZExt) {
+ unsigned ResultReg = emitAnd_ri(MVT::i32, SrcReg, /*TODO:IsKill=*/false, 1);
+ assert(ResultReg && "Unexpected AND instruction emission failure.");
if (DestVT == MVT::i64) {
      // We're zero-extending i1 to i64. The ANDWri Wd, Ws, #1 implicitly
      // clears the upper 32 bits. Emit a SUBREG_TO_REG to extend from Wd to Xd.
@@ -1737,18 +3833,389 @@ unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
// FIXME: We're SExt i1 to i64.
return 0;
}
- unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SBFMWri),
- ResultReg)
- .addReg(SrcReg)
+ return fastEmitInst_rii(AArch64::SBFMWri, &AArch64::GPR32RegClass, SrcReg,
+ /*TODO:IsKill=*/false, 0, 0);
+ }
+}
+
+unsigned AArch64FastISel::emitMul_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill) {
+ unsigned Opc, ZReg;
+ switch (RetVT.SimpleTy) {
+ default: return 0;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ RetVT = MVT::i32;
+ Opc = AArch64::MADDWrrr; ZReg = AArch64::WZR; break;
+ case MVT::i64:
+ Opc = AArch64::MADDXrrr; ZReg = AArch64::XZR; break;
+ }
+
+ const TargetRegisterClass *RC =
+ (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ return fastEmitInst_rrr(Opc, RC, Op0, Op0IsKill, Op1, Op1IsKill,
+                          ZReg, /*IsKill=*/true);
+}
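// Editorial sketch (annotation, not part of this patch): there is no bare
// MUL instruction at this level; a multiply is MADD with a zero addend, so
// a 32-bit multiply yields "madd w0, wa, wb, wzr" (the "mul w0, wa, wb"
// alias).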
+
+unsigned AArch64FastISel::emitSMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill) {
+ if (RetVT != MVT::i64)
+ return 0;
+
+ return fastEmitInst_rrr(AArch64::SMADDLrrr, &AArch64::GPR64RegClass,
+ Op0, Op0IsKill, Op1, Op1IsKill,
+ AArch64::XZR, /*IsKill=*/true);
+}
+
+unsigned AArch64FastISel::emitUMULL_rr(MVT RetVT, unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill) {
+ if (RetVT != MVT::i64)
+ return 0;
+
+ return fastEmitInst_rrr(AArch64::UMADDLrrr, &AArch64::GPR64RegClass,
+ Op0, Op0IsKill, Op1, Op1IsKill,
+ AArch64::XZR, /*IsKill=*/true);
+}
+
+unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill) {
+ unsigned Opc = 0;
+ bool NeedTrunc = false;
+ uint64_t Mask = 0;
+ switch (RetVT.SimpleTy) {
+ default: return 0;
+ case MVT::i8: Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xff; break;
+ case MVT::i16: Opc = AArch64::LSLVWr; NeedTrunc = true; Mask = 0xffff; break;
+ case MVT::i32: Opc = AArch64::LSLVWr; break;
+ case MVT::i64: Opc = AArch64::LSLVXr; break;
+ }
+
+ const TargetRegisterClass *RC =
+ (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ if (NeedTrunc) {
+ Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
+ Op1IsKill = true;
+ }
+ unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
+ Op1IsKill);
+ if (NeedTrunc)
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+ bool Op0IsKill, uint64_t Shift,
+ bool IsZExt) {
+ assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+ "Unexpected source/return type pair.");
+ assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+ SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+ "Unexpected source value type.");
+ assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+ RetVT == MVT::i64) && "Unexpected return value type.");
+
+ bool Is64Bit = (RetVT == MVT::i64);
+ unsigned RegSize = Is64Bit ? 64 : 32;
+ unsigned DstBits = RetVT.getSizeInBits();
+ unsigned SrcBits = SrcVT.getSizeInBits();
+ const TargetRegisterClass *RC =
+ Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ // Just emit a copy for "zero" shifts.
+ if (Shift == 0) {
+ if (RetVT == SrcVT) {
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill));
+ return ResultReg;
+ } else
+ return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+ }
+
+ // Don't deal with undefined shifts.
+ if (Shift >= DstBits)
+ return 0;
+
+ // For immediate shifts we can fold the zero-/sign-extension into the shift.
+ // {S|U}BFM Wd, Wn, #r, #s
+ // Wd<32+s-r,32-r> = Wn<s:0> when r > s
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = shl i16 %1, 4
+ // Wd<32+7-28,32-28> = Wn<7:0> <- clamp s to 7
+ // 0b1111_1111_1111_1111__1111_1010_1010_0000 sext
+ // 0b0000_0000_0000_0000__0000_0101_0101_0000 sext | zext
+ // 0b0000_0000_0000_0000__0000_1010_1010_0000 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = shl i16 %1, 8
+ // Wd<32+7-24,32-24> = Wn<7:0>
+ // 0b1111_1111_1111_1111__1010_1010_0000_0000 sext
+ // 0b0000_0000_0000_0000__0101_0101_0000_0000 sext | zext
+ // 0b0000_0000_0000_0000__1010_1010_0000_0000 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = shl i16 %1, 12
+ // Wd<32+3-20,32-20> = Wn<3:0>
+ // 0b1111_1111_1111_1111__1010_0000_0000_0000 sext
+ // 0b0000_0000_0000_0000__0101_0000_0000_0000 sext | zext
+ // 0b0000_0000_0000_0000__1010_0000_0000_0000 zext
+
+ unsigned ImmR = RegSize - Shift;
+ // Limit the width to the length of the source type.
+ unsigned ImmS = std::min<unsigned>(SrcBits - 1, DstBits - 1 - Shift);
+ static const unsigned OpcTable[2][2] = {
+ {AArch64::SBFMWri, AArch64::SBFMXri},
+ {AArch64::UBFMWri, AArch64::UBFMXri}
+ };
+ unsigned Opc = OpcTable[IsZExt][Is64Bit];
+ if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+ unsigned TmpReg = MRI.createVirtualRegister(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), TmpReg)
.addImm(0)
- .addImm(0);
- return ResultReg;
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addImm(AArch64::sub_32);
+ Op0 = TmpReg;
+ Op0IsKill = true;
}
+ return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
}
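// Editorial sketch (annotation, not part of this patch): plugging numbers
// into the formulas above, "%2 = shl i16 %1, 4" with %1 = zext i8 gives
// ImmR = 32 - 4 = 28 and ImmS = min(7, 11) = 7, i.e.
//   ubfm w0, w1, #28, #7     ; same as "ubfiz w0, w1, #4, #8"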
-unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
- bool isZExt) {
+unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill) {
+ unsigned Opc = 0;
+ bool NeedTrunc = false;
+ uint64_t Mask = 0;
+ switch (RetVT.SimpleTy) {
+ default: return 0;
+ case MVT::i8: Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xff; break;
+ case MVT::i16: Opc = AArch64::LSRVWr; NeedTrunc = true; Mask = 0xffff; break;
+ case MVT::i32: Opc = AArch64::LSRVWr; break;
+ case MVT::i64: Opc = AArch64::LSRVXr; break;
+ }
+
+ const TargetRegisterClass *RC =
+ (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ if (NeedTrunc) {
+ Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Op0IsKill, Mask);
+ Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
+ Op0IsKill = Op1IsKill = true;
+ }
+ unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
+ Op1IsKill);
+ if (NeedTrunc)
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+ bool Op0IsKill, uint64_t Shift,
+ bool IsZExt) {
+ assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+ "Unexpected source/return type pair.");
+ assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+ SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+ "Unexpected source value type.");
+ assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+ RetVT == MVT::i64) && "Unexpected return value type.");
+
+ bool Is64Bit = (RetVT == MVT::i64);
+ unsigned RegSize = Is64Bit ? 64 : 32;
+ unsigned DstBits = RetVT.getSizeInBits();
+ unsigned SrcBits = SrcVT.getSizeInBits();
+ const TargetRegisterClass *RC =
+ Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ // Just emit a copy for "zero" shifts.
+ if (Shift == 0) {
+ if (RetVT == SrcVT) {
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill));
+ return ResultReg;
+ } else
+ return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+ }
+
+ // Don't deal with undefined shifts.
+ if (Shift >= DstBits)
+ return 0;
+
+ // For immediate shifts we can fold the zero-/sign-extension into the shift.
+ // {S|U}BFM Wd, Wn, #r, #s
+ // Wd<s-r:0> = Wn<s:r> when r <= s
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = lshr i16 %1, 4
+ // Wd<7-4:0> = Wn<7:4>
+ // 0b0000_0000_0000_0000__0000_1111_1111_1010 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext
+ // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = lshr i16 %1, 8
+ // Wd<7-7,0> = Wn<7:7>
+ // 0b0000_0000_0000_0000__0000_0000_1111_1111 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = lshr i16 %1, 12
+ // Wd<7-7,0> = Wn<7:7> <- clamp r to 7
+ // 0b0000_0000_0000_0000__0000_0000_0000_1111 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+ if (Shift >= SrcBits && IsZExt)
+ return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT);
+
+ // It is not possible to fold a sign-extend into the LShr instruction. In this
+ // case emit a sign-extend.
+ if (!IsZExt) {
+ Op0 = emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+ if (!Op0)
+ return 0;
+ Op0IsKill = true;
+ SrcVT = RetVT;
+ SrcBits = SrcVT.getSizeInBits();
+ IsZExt = true;
+ }
+
+ unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift);
+ unsigned ImmS = SrcBits - 1;
+ static const unsigned OpcTable[2][2] = {
+ {AArch64::SBFMWri, AArch64::SBFMXri},
+ {AArch64::UBFMWri, AArch64::UBFMXri}
+ };
+ unsigned Opc = OpcTable[IsZExt][Is64Bit];
+ if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+ unsigned TmpReg = MRI.createVirtualRegister(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), TmpReg)
+ .addImm(0)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addImm(AArch64::sub_32);
+ Op0 = TmpReg;
+ Op0IsKill = true;
+ }
+ return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+}
+
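For reference, the UBFM extract form the comment block relies on behaves like "take bits <ImmS:ImmR> of the source, place them at bit 0, and zero the rest". A minimal standalone sketch (illustrative only, not part of the imported patch) that reproduces the zext rows above:

```cpp
#include <cassert>
#include <cstdint>

// Model of the 32-bit UBFM extract form (ImmR <= ImmS): the result holds
// Wn<ImmS:ImmR> in its low bits and zeroes everywhere else.
static uint32_t ubfm32(uint32_t Wn, unsigned ImmR, unsigned ImmS) {
  assert(ImmR <= ImmS && ImmS < 32 && "only the extract form is modeled");
  unsigned Width = ImmS - ImmR + 1;
  uint32_t Mask = (Width == 32) ? ~0u : ((1u << Width) - 1);
  return (Wn >> ImmR) & Mask;
}

int main() {
  // %1 = zext i8 0b1010_1010 to i16; %2 = lshr i16 %1, 4
  // ImmR = min(SrcBits - 1, Shift) = 4, ImmS = SrcBits - 1 = 7.
  assert(ubfm32(0b1010'1010, 4, 7) == 0b1010);
  // Shift == 8 clamps ImmR to 7, so only bit 7 survives.
  assert(ubfm32(0b1010'1010, 7, 7) == 0b1);
  // Shift >= SrcBits with a zero-extended source is handled earlier by
  // materializing the constant 0 instead of emitting a UBFM.
  return 0;
}
```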
+unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg, bool Op0IsKill,
+ unsigned Op1Reg, bool Op1IsKill) {
+ unsigned Opc = 0;
+ bool NeedTrunc = false;
+ uint64_t Mask = 0;
+ switch (RetVT.SimpleTy) {
+ default: return 0;
+ case MVT::i8: Opc = AArch64::ASRVWr; NeedTrunc = true; Mask = 0xff; break;
+ case MVT::i16: Opc = AArch64::ASRVWr; NeedTrunc = true; Mask = 0xffff; break;
+ case MVT::i32: Opc = AArch64::ASRVWr; break;
+ case MVT::i64: Opc = AArch64::ASRVXr; break;
+ }
+
+ const TargetRegisterClass *RC =
+ (RetVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ if (NeedTrunc) {
+ Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*IsZExt=*/false);
+ Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Op1IsKill, Mask);
+ Op0IsKill = Op1IsKill = true;
+ }
+ unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op0IsKill, Op1Reg,
+ Op1IsKill);
+ if (NeedTrunc)
+ ResultReg = emitAnd_ri(MVT::i32, ResultReg, /*IsKill=*/true, Mask);
+ return ResultReg;
+}
+
+unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
+ bool Op0IsKill, uint64_t Shift,
+ bool IsZExt) {
+ assert(RetVT.SimpleTy >= SrcVT.SimpleTy &&
+ "Unexpected source/return type pair.");
+ assert((SrcVT == MVT::i1 || SrcVT == MVT::i8 || SrcVT == MVT::i16 ||
+ SrcVT == MVT::i32 || SrcVT == MVT::i64) &&
+ "Unexpected source value type.");
+ assert((RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32 ||
+ RetVT == MVT::i64) && "Unexpected return value type.");
+
+ bool Is64Bit = (RetVT == MVT::i64);
+ unsigned RegSize = Is64Bit ? 64 : 32;
+ unsigned DstBits = RetVT.getSizeInBits();
+ unsigned SrcBits = SrcVT.getSizeInBits();
+ const TargetRegisterClass *RC =
+ Is64Bit ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+
+ // Just emit a copy for "zero" shifts.
+ if (Shift == 0) {
+ if (RetVT == SrcVT) {
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill));
+ return ResultReg;
+ } else
+ return emitIntExt(SrcVT, Op0, RetVT, IsZExt);
+ }
+
+ // Don't deal with undefined shifts.
+ if (Shift >= DstBits)
+ return 0;
+
+ // For immediate shifts we can fold the zero-/sign-extension into the shift.
+ // {S|U}BFM Wd, Wn, #r, #s
+ // Wd<s-r:0> = Wn<s:r> when r <= s
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = ashr i16 %1, 4
+ // Wd<7-4:0> = Wn<7:4>
+ // 0b1111_1111_1111_1111__1111_1111_1111_1010 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0101 sext | zext
+ // 0b0000_0000_0000_0000__0000_0000_0000_1010 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = ashr i16 %1, 8
+ // Wd<7-7:0> = Wn<7:7>
+ // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext | zext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+ // %1 = {s|z}ext i8 {0b1010_1010|0b0101_0101} to i16
+ // %2 = ashr i16 %1, 12
+ // Wd<7-7:0> = Wn<7:7> <- clamp r to 7
+ // 0b1111_1111_1111_1111__1111_1111_1111_1111 sext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 sext | zext
+ // 0b0000_0000_0000_0000__0000_0000_0000_0000 zext
+
+ if (Shift >= SrcBits && IsZExt)
+ return materializeInt(ConstantInt::get(*Context, APInt(RegSize, 0)), RetVT);
+
+ unsigned ImmR = std::min<unsigned>(SrcBits - 1, Shift);
+ unsigned ImmS = SrcBits - 1;
+ static const unsigned OpcTable[2][2] = {
+ {AArch64::SBFMWri, AArch64::SBFMXri},
+ {AArch64::UBFMWri, AArch64::UBFMXri}
+ };
+ unsigned Opc = OpcTable[IsZExt][Is64Bit];
+ if (SrcVT.SimpleTy <= MVT::i32 && RetVT == MVT::i64) {
+ unsigned TmpReg = MRI.createVirtualRegister(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), TmpReg)
+ .addImm(0)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addImm(AArch64::sub_32);
+ Op0 = TmpReg;
+ Op0IsKill = true;
+ }
+ return fastEmitInst_rii(Opc, RC, Op0, Op0IsKill, ImmR, ImmS);
+}
+
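The sign-extending counterpart works the same way, except the extracted field is replicated from its top bit. A companion sketch (same caveats as above) reproducing the sext rows of the comment block:

```cpp
#include <cassert>
#include <cstdint>

// Model of the 32-bit SBFM extract form: Wn<ImmS:ImmR> lands at bit 0 and
// the result is sign-extended from bit (ImmS - ImmR).
static uint32_t sbfm32(uint32_t Wn, unsigned ImmR, unsigned ImmS) {
  assert(ImmR <= ImmS && ImmS < 32);
  unsigned Width = ImmS - ImmR + 1;
  uint32_t Mask = (Width == 32) ? ~0u : ((1u << Width) - 1);
  uint32_t Field = (Wn >> ImmR) & Mask;
  uint32_t SignBit = 1u << (Width - 1);
  return (Field ^ SignBit) - SignBit; // sign-extend Width bits to 32 bits
}

int main() {
  // %1 = sext i8 0b1010_1010 to i16; %2 = ashr i16 %1, 4
  // SBFM with ImmR = 4, ImmS = 7 extracts bits <7:4> and sign-extends.
  assert(sbfm32(0b1010'1010, 4, 7) == 0xFFFFFFFA);
  // Shift >= 8 clamps ImmR to 7: every result bit is a copy of bit 7.
  assert(sbfm32(0b1010'1010, 7, 7) == 0xFFFFFFFF);
  return 0;
}
```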
+unsigned AArch64FastISel::emitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+ bool IsZExt) {
assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?");
// FastISel does not have plumbing to deal with extensions where the SrcVT or
@@ -1768,24 +4235,24 @@ unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
default:
return 0;
case MVT::i1:
- return Emiti1Ext(SrcReg, DestVT, isZExt);
+ return emiti1Ext(SrcReg, DestVT, IsZExt);
case MVT::i8:
if (DestVT == MVT::i64)
- Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+ Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
else
- Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+ Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
Imm = 7;
break;
case MVT::i16:
if (DestVT == MVT::i64)
- Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+ Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
else
- Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
+ Opc = IsZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
Imm = 15;
break;
case MVT::i32:
assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?");
- Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
+ Opc = IsZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
Imm = 31;
break;
}
@@ -1803,45 +4270,167 @@ unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
SrcReg = Src64;
}
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addReg(SrcReg)
- .addImm(0)
- .addImm(Imm);
+ const TargetRegisterClass *RC =
+ (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ return fastEmitInst_rii(Opc, RC, SrcReg, /*TODO:IsKill=*/false, 0, Imm);
+}
- return ResultReg;
+static bool isZExtLoad(const MachineInstr *LI) {
+ switch (LI->getOpcode()) {
+ default:
+ return false;
+ case AArch64::LDURBBi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURWi:
+ case AArch64::LDRBBui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRWui:
+ case AArch64::LDRBBroX:
+ case AArch64::LDRHHroX:
+ case AArch64::LDRWroX:
+ case AArch64::LDRBBroW:
+ case AArch64::LDRHHroW:
+ case AArch64::LDRWroW:
+ return true;
+ }
}
-bool AArch64FastISel::SelectIntExt(const Instruction *I) {
- // On ARM, in general, integer casts don't involve legal types; this code
- // handles promotable integers. The high bits for a type smaller than
- // the register size are assumed to be undefined.
- Type *DestTy = I->getType();
- Value *Src = I->getOperand(0);
- Type *SrcTy = Src->getType();
+static bool isSExtLoad(const MachineInstr *LI) {
+ switch (LI->getOpcode()) {
+ default:
+ return false;
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSHWi:
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSWi:
+ case AArch64::LDRSBWui:
+ case AArch64::LDRSHWui:
+ case AArch64::LDRSBXui:
+ case AArch64::LDRSHXui:
+ case AArch64::LDRSWui:
+ case AArch64::LDRSBWroX:
+ case AArch64::LDRSHWroX:
+ case AArch64::LDRSBXroX:
+ case AArch64::LDRSHXroX:
+ case AArch64::LDRSWroX:
+ case AArch64::LDRSBWroW:
+ case AArch64::LDRSHWroW:
+ case AArch64::LDRSBXroW:
+ case AArch64::LDRSHXroW:
+ case AArch64::LDRSWroW:
+ return true;
+ }
+}
- bool isZExt = isa<ZExtInst>(I);
- unsigned SrcReg = getRegForValue(Src);
- if (!SrcReg)
+bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT,
+ MVT SrcVT) {
+ const auto *LI = dyn_cast<LoadInst>(I->getOperand(0));
+ if (!LI || !LI->hasOneUse())
return false;
- EVT SrcEVT = TLI.getValueType(SrcTy, true);
- EVT DestEVT = TLI.getValueType(DestTy, true);
- if (!SrcEVT.isSimple())
+ // Check if the load instruction has already been selected.
+ unsigned Reg = lookUpRegForValue(LI);
+ if (!Reg)
return false;
- if (!DestEVT.isSimple())
+
+ MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+ if (!MI)
return false;
- MVT SrcVT = SrcEVT.getSimpleVT();
- MVT DestVT = DestEVT.getSimpleVT();
- unsigned ResultReg = EmitIntExt(SrcVT, SrcReg, DestVT, isZExt);
- if (ResultReg == 0)
+ // Check if the correct load instruction has been emitted - SelectionDAG might
+ // have emitted a zero-extending load, but we need a sign-extending load.
+ bool IsZExt = isa<ZExtInst>(I);
+ const auto *LoadMI = MI;
+ if (LoadMI->getOpcode() == TargetOpcode::COPY &&
+ LoadMI->getOperand(1).getSubReg() == AArch64::sub_32) {
+ unsigned LoadReg = MI->getOperand(1).getReg();
+ LoadMI = MRI.getUniqueVRegDef(LoadReg);
+ assert(LoadMI && "Expected valid instruction");
+ }
+ if (!(IsZExt && isZExtLoad(LoadMI)) && !(!IsZExt && isSExtLoad(LoadMI)))
+ return false;
+
+ // Nothing to be done.
+ if (RetVT != MVT::i64 || SrcVT > MVT::i32) {
+ updateValueMap(I, Reg);
+ return true;
+ }
+
+ if (IsZExt) {
+ unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), Reg64)
+ .addImm(0)
+ .addReg(Reg, getKillRegState(true))
+ .addImm(AArch64::sub_32);
+ Reg = Reg64;
+ } else {
+ assert((MI->getOpcode() == TargetOpcode::COPY &&
+ MI->getOperand(1).getSubReg() == AArch64::sub_32) &&
+ "Expected copy instruction");
+ Reg = MI->getOperand(1).getReg();
+ MI->eraseFromParent();
+ }
+ updateValueMap(I, Reg);
+ return true;
+}
+
+bool AArch64FastISel::selectIntExt(const Instruction *I) {
+ assert((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
+ "Unexpected integer extend instruction.");
+ MVT RetVT;
+ MVT SrcVT;
+ if (!isTypeSupported(I->getType(), RetVT))
+ return false;
+
+ if (!isTypeSupported(I->getOperand(0)->getType(), SrcVT))
+ return false;
+
+ // Try to optimize already sign-/zero-extended values from load instructions.
+ if (optimizeIntExtLoad(I, RetVT, SrcVT))
+ return true;
+
+ unsigned SrcReg = getRegForValue(I->getOperand(0));
+ if (!SrcReg)
+ return false;
+ bool SrcIsKill = hasTrivialKill(I->getOperand(0));
+
+ // Try to optimize already sign-/zero-extended values from function arguments.
+ bool IsZExt = isa<ZExtInst>(I);
+ if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0))) {
+ if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) {
+ if (RetVT == MVT::i64 && SrcVT != MVT::i64) {
+ unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(AArch64::SUBREG_TO_REG), ResultReg)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(SrcIsKill))
+ .addImm(AArch64::sub_32);
+ SrcReg = ResultReg;
+ }
+ // Conservatively clear all kill flags from all uses, because we are
+ // replacing a sign-/zero-extend instruction at IR level with a nop at MI
+ // level. The result of the instruction at IR level might have been
+ // trivially dead, which is no longer true.
+ unsigned UseReg = lookUpRegForValue(I);
+ if (UseReg)
+ MRI.clearKillFlags(UseReg);
+
+ updateValueMap(I, SrcReg);
+ return true;
+ }
+ }
+
+ unsigned ResultReg = emitIntExt(SrcVT, SrcReg, RetVT, IsZExt);
+ if (!ResultReg)
return false;
- UpdateValueMap(I, ResultReg);
+
+ updateValueMap(I, ResultReg);
return true;
}
-bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) {
+bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) {
EVT DestEVT = TLI.getValueType(I->getType(), true);
if (!DestEVT.isSimple())
return false;
@@ -1851,144 +4440,529 @@ bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) {
return false;
unsigned DivOpc;
- bool is64bit = (DestVT == MVT::i64);
+ bool Is64bit = (DestVT == MVT::i64);
switch (ISDOpcode) {
default:
return false;
case ISD::SREM:
- DivOpc = is64bit ? AArch64::SDIVXr : AArch64::SDIVWr;
+ DivOpc = Is64bit ? AArch64::SDIVXr : AArch64::SDIVWr;
break;
case ISD::UREM:
- DivOpc = is64bit ? AArch64::UDIVXr : AArch64::UDIVWr;
+ DivOpc = Is64bit ? AArch64::UDIVXr : AArch64::UDIVWr;
break;
}
- unsigned MSubOpc = is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr;
+ unsigned MSubOpc = Is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr;
unsigned Src0Reg = getRegForValue(I->getOperand(0));
if (!Src0Reg)
return false;
+ bool Src0IsKill = hasTrivialKill(I->getOperand(0));
unsigned Src1Reg = getRegForValue(I->getOperand(1));
if (!Src1Reg)
return false;
+ bool Src1IsKill = hasTrivialKill(I->getOperand(1));
- unsigned QuotReg = createResultReg(TLI.getRegClassFor(DestVT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), QuotReg)
- .addReg(Src0Reg)
- .addReg(Src1Reg);
+ const TargetRegisterClass *RC =
+ (DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
+ unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, /*IsKill=*/false,
+ Src1Reg, /*IsKill=*/false);
+ assert(QuotReg && "Unexpected DIV instruction emission failure.");
// The remainder is computed as numerator - (quotient * denominator) using the
// MSUB instruction.
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg)
- .addReg(QuotReg)
- .addReg(Src1Reg)
- .addReg(Src0Reg);
- UpdateValueMap(I, ResultReg);
+ unsigned ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, /*IsKill=*/true,
+ Src1Reg, Src1IsKill, Src0Reg,
+ Src0IsKill);
+ updateValueMap(I, ResultReg);
return true;
}
-bool AArch64FastISel::SelectMul(const Instruction *I) {
- EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true);
- if (!SrcEVT.isSimple())
+bool AArch64FastISel::selectMul(const Instruction *I) {
+ MVT VT;
+ if (!isTypeSupported(I->getType(), VT, /*IsVectorAllowed=*/true))
return false;
- MVT SrcVT = SrcEVT.getSimpleVT();
- // Must be simple value type. Don't handle vectors.
- if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16 &&
- SrcVT != MVT::i8)
+ if (VT.isVector())
+ return selectBinaryOp(I, ISD::MUL);
+
+ const Value *Src0 = I->getOperand(0);
+ const Value *Src1 = I->getOperand(1);
+ if (const auto *C = dyn_cast<ConstantInt>(Src0))
+ if (C->getValue().isPowerOf2())
+ std::swap(Src0, Src1);
+
+ // Try to simplify to a shift instruction.
+ if (const auto *C = dyn_cast<ConstantInt>(Src1))
+ if (C->getValue().isPowerOf2()) {
+ uint64_t ShiftVal = C->getValue().logBase2();
+ MVT SrcVT = VT;
+ bool IsZExt = true;
+ if (const auto *ZExt = dyn_cast<ZExtInst>(Src0)) {
+ if (!isIntExtFree(ZExt)) {
+ MVT VT;
+ if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), VT)) {
+ SrcVT = VT;
+ IsZExt = true;
+ Src0 = ZExt->getOperand(0);
+ }
+ }
+ } else if (const auto *SExt = dyn_cast<SExtInst>(Src0)) {
+ if (!isIntExtFree(SExt)) {
+ MVT VT;
+ if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), VT)) {
+ SrcVT = VT;
+ IsZExt = false;
+ Src0 = SExt->getOperand(0);
+ }
+ }
+ }
+
+ unsigned Src0Reg = getRegForValue(Src0);
+ if (!Src0Reg)
+ return false;
+ bool Src0IsKill = hasTrivialKill(Src0);
+
+ unsigned ResultReg =
+ emitLSL_ri(VT, SrcVT, Src0Reg, Src0IsKill, ShiftVal, IsZExt);
+
+ if (ResultReg) {
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ unsigned Src0Reg = getRegForValue(I->getOperand(0));
+ if (!Src0Reg)
+ return false;
+ bool Src0IsKill = hasTrivialKill(I->getOperand(0));
+
+ unsigned Src1Reg = getRegForValue(I->getOperand(1));
+ if (!Src1Reg)
+ return false;
+ bool Src1IsKill = hasTrivialKill(I->getOperand(1));
+
+ unsigned ResultReg = emitMul_rr(VT, Src0Reg, Src0IsKill, Src1Reg, Src1IsKill);
+
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::selectShift(const Instruction *I) {
+ MVT RetVT;
+ if (!isTypeSupported(I->getType(), RetVT, /*IsVectorAllowed=*/true))
+ return false;
+
+ if (RetVT.isVector())
+ return selectOperator(I, I->getOpcode());
+
+ if (const auto *C = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ unsigned ResultReg = 0;
+ uint64_t ShiftVal = C->getZExtValue();
+ MVT SrcVT = RetVT;
+ bool IsZExt = I->getOpcode() != Instruction::AShr;
+ const Value *Op0 = I->getOperand(0);
+ if (const auto *ZExt = dyn_cast<ZExtInst>(Op0)) {
+ if (!isIntExtFree(ZExt)) {
+ MVT TmpVT;
+ if (isValueAvailable(ZExt) && isTypeSupported(ZExt->getSrcTy(), TmpVT)) {
+ SrcVT = TmpVT;
+ IsZExt = true;
+ Op0 = ZExt->getOperand(0);
+ }
+ }
+ } else if (const auto *SExt = dyn_cast<SExtInst>(Op0)) {
+ if (!isIntExtFree(SExt)) {
+ MVT TmpVT;
+ if (isValueAvailable(SExt) && isTypeSupported(SExt->getSrcTy(), TmpVT)) {
+ SrcVT = TmpVT;
+ IsZExt = false;
+ Op0 = SExt->getOperand(0);
+ }
+ }
+ }
+
+ unsigned Op0Reg = getRegForValue(Op0);
+ if (!Op0Reg)
+ return false;
+ bool Op0IsKill = hasTrivialKill(Op0);
+
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected instruction.");
+ case Instruction::Shl:
+ ResultReg = emitLSL_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+ break;
+ case Instruction::AShr:
+ ResultReg = emitASR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+ break;
+ case Instruction::LShr:
+ ResultReg = emitLSR_ri(RetVT, SrcVT, Op0Reg, Op0IsKill, ShiftVal, IsZExt);
+ break;
+ }
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (!Op0Reg)
+ return false;
+ bool Op0IsKill = hasTrivialKill(I->getOperand(0));
+
+ unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ if (!Op1Reg)
+ return false;
+ bool Op1IsKill = hasTrivialKill(I->getOperand(1));
+
+ unsigned ResultReg = 0;
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected instruction.");
+ case Instruction::Shl:
+ ResultReg = emitLSL_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+ break;
+ case Instruction::AShr:
+ ResultReg = emitASR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+ break;
+ case Instruction::LShr:
+ ResultReg = emitLSR_rr(RetVT, Op0Reg, Op0IsKill, Op1Reg, Op1IsKill);
+ break;
+ }
+
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::selectBitCast(const Instruction *I) {
+ MVT RetVT, SrcVT;
+
+ if (!isTypeLegal(I->getOperand(0)->getType(), SrcVT))
+ return false;
+ if (!isTypeLegal(I->getType(), RetVT))
return false;
unsigned Opc;
- unsigned ZReg;
- switch (SrcVT.SimpleTy) {
+ if (RetVT == MVT::f32 && SrcVT == MVT::i32)
+ Opc = AArch64::FMOVWSr;
+ else if (RetVT == MVT::f64 && SrcVT == MVT::i64)
+ Opc = AArch64::FMOVXDr;
+ else if (RetVT == MVT::i32 && SrcVT == MVT::f32)
+ Opc = AArch64::FMOVSWr;
+ else if (RetVT == MVT::i64 && SrcVT == MVT::f64)
+ Opc = AArch64::FMOVDXr;
+ else
+ return false;
+
+ const TargetRegisterClass *RC = nullptr;
+ switch (RetVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected value type.");
+ case MVT::i32: RC = &AArch64::GPR32RegClass; break;
+ case MVT::i64: RC = &AArch64::GPR64RegClass; break;
+ case MVT::f32: RC = &AArch64::FPR32RegClass; break;
+ case MVT::f64: RC = &AArch64::FPR64RegClass; break;
+ }
+ unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ if (!Op0Reg)
+ return false;
+ bool Op0IsKill = hasTrivialKill(I->getOperand(0));
+ unsigned ResultReg = fastEmitInst_r(Opc, RC, Op0Reg, Op0IsKill);
+
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::selectFRem(const Instruction *I) {
+ MVT RetVT;
+ if (!isTypeLegal(I->getType(), RetVT))
+ return false;
+
+ RTLIB::Libcall LC;
+ switch (RetVT.SimpleTy) {
default:
return false;
- case MVT::i8:
- case MVT::i16:
- case MVT::i32:
- ZReg = AArch64::WZR;
- Opc = AArch64::MADDWrrr;
- SrcVT = MVT::i32;
+ case MVT::f32:
+ LC = RTLIB::REM_F32;
break;
- case MVT::i64:
- ZReg = AArch64::XZR;
- Opc = AArch64::MADDXrrr;
+ case MVT::f64:
+ LC = RTLIB::REM_F64;
break;
}
+ ArgListTy Args;
+ Args.reserve(I->getNumOperands());
+
+ // Populate the argument list.
+ for (auto &Arg : I->operands()) {
+ ArgListEntry Entry;
+ Entry.Val = Arg;
+ Entry.Ty = Arg->getType();
+ Args.push_back(Entry);
+ }
+
+ CallLoweringInfo CLI;
+ CLI.setCallee(TLI.getLibcallCallingConv(LC), I->getType(),
+ TLI.getLibcallName(LC), std::move(Args));
+ if (!lowerCallTo(CLI))
+ return false;
+ updateValueMap(I, CLI.ResultReg);
+ return true;
+}
+
+bool AArch64FastISel::selectSDiv(const Instruction *I) {
+ MVT VT;
+ if (!isTypeLegal(I->getType(), VT))
+ return false;
+
+ if (!isa<ConstantInt>(I->getOperand(1)))
+ return selectBinaryOp(I, ISD::SDIV);
+
+ const APInt &C = cast<ConstantInt>(I->getOperand(1))->getValue();
+ if ((VT != MVT::i32 && VT != MVT::i64) || !C ||
+ !(C.isPowerOf2() || (-C).isPowerOf2()))
+ return selectBinaryOp(I, ISD::SDIV);
+
+ unsigned Lg2 = C.countTrailingZeros();
unsigned Src0Reg = getRegForValue(I->getOperand(0));
if (!Src0Reg)
return false;
+ bool Src0IsKill = hasTrivialKill(I->getOperand(0));
- unsigned Src1Reg = getRegForValue(I->getOperand(1));
- if (!Src1Reg)
+ if (cast<BinaryOperator>(I)->isExact()) {
+ unsigned ResultReg = emitASR_ri(VT, VT, Src0Reg, Src0IsKill, Lg2);
+ if (!ResultReg)
+ return false;
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ int64_t Pow2MinusOne = (1ULL << Lg2) - 1;
+ unsigned AddReg = emitAdd_ri_(VT, Src0Reg, /*IsKill=*/false, Pow2MinusOne);
+ if (!AddReg)
return false;
- // Create the base instruction, then add the operands.
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(SrcVT));
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
- .addReg(Src0Reg)
- .addReg(Src1Reg)
- .addReg(ZReg);
- UpdateValueMap(I, ResultReg);
+ // (Src0 < 0) ? Pow2 - 1 : 0;
+ if (!emitICmp_ri(VT, Src0Reg, /*IsKill=*/false, 0))
+ return false;
+
+ unsigned SelectOpc;
+ const TargetRegisterClass *RC;
+ if (VT == MVT::i64) {
+ SelectOpc = AArch64::CSELXr;
+ RC = &AArch64::GPR64RegClass;
+ } else {
+ SelectOpc = AArch64::CSELWr;
+ RC = &AArch64::GPR32RegClass;
+ }
+ unsigned SelectReg =
+ fastEmitInst_rri(SelectOpc, RC, AddReg, /*IsKill=*/true, Src0Reg,
+ Src0IsKill, AArch64CC::LT);
+ if (!SelectReg)
+ return false;
+
+ // Divide by Pow2 --> ashr. If we're dividing by a negative value we must also
+ // negate the result.
+ unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
+ unsigned ResultReg;
+ if (C.isNegative())
+ ResultReg = emitAddSub_rs(/*UseAdd=*/false, VT, ZeroReg, /*IsKill=*/true,
+ SelectReg, /*IsKill=*/true, AArch64_AM::ASR, Lg2);
+ else
+ ResultReg = emitASR_ri(VT, VT, SelectReg, /*IsKill=*/true, Lg2);
+
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
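The sequence emitted here is the classic fix-up for signed division by a power of two: a plain ASR rounds toward negative infinity, so negative dividends are first biased by Pow2 - 1 (the CSEL on Src0 < 0), and a negative divisor additionally negates the quotient (in the code above, the negate and the final ASR are fused into one SUB with an ASR-shifted operand). A minimal sketch of that arithmetic (illustrative, not part of the patch; it assumes `>>` on a negative int is an arithmetic shift, as it is on AArch64):

```cpp
#include <cassert>
#include <cstdint>

// Truncating signed division by (NegDivisor ? -1 : 1) * 2^Lg2.
static int32_t sdivPow2(int32_t X, unsigned Lg2, bool NegDivisor) {
  int32_t Biased = (X < 0) ? X + ((1 << Lg2) - 1) : X; // CSEL on X < 0
  int32_t Quot = Biased >> Lg2;                        // ASR by Lg2
  return NegDivisor ? -Quot : Quot;                    // negate if divisor < 0
}

int main() {
  assert(sdivPow2(7, 2, false) == 7 / 4);   //  1
  assert(sdivPow2(-7, 2, false) == -7 / 4); // -1 (a bare ASR would give -2)
  assert(sdivPow2(-8, 2, false) == -8 / 4); // -2
  assert(sdivPow2(7, 2, true) == 7 / -4);   // -1
  return 0;
}
```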
+/// This is mostly a copy of the existing FastISel getRegForGEPIndex code. We
+/// have to duplicate it for AArch64, because otherwise we would fail during the
+/// sign-extend emission.
+std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) {
+ unsigned IdxN = getRegForValue(Idx);
+ if (IdxN == 0)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return std::pair<unsigned, bool>(0, false);
+
+ bool IdxNIsKill = hasTrivialKill(Idx);
+
+ // If the index is smaller or larger than intptr_t, truncate or extend it.
+ MVT PtrVT = TLI.getPointerTy();
+ EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
+ if (IdxVT.bitsLT(PtrVT)) {
+ IdxN = emitIntExt(IdxVT.getSimpleVT(), IdxN, PtrVT, /*IsZExt=*/false);
+ IdxNIsKill = true;
+ } else if (IdxVT.bitsGT(PtrVT))
+ llvm_unreachable("AArch64 FastISel doesn't support types larger than i64");
+ return std::pair<unsigned, bool>(IdxN, IdxNIsKill);
+}
+
+/// This is mostly a copy of the existing FastISel GEP code, but we have to
+/// duplicate it for AArch64, because otherwise we would bail out even for
+/// simple cases. This is because the standard fastEmit functions don't cover
+/// MUL at all and ADD is lowered very inefficiently.
+bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {
+ unsigned N = getRegForValue(I->getOperand(0));
+ if (!N)
+ return false;
+ bool NIsKill = hasTrivialKill(I->getOperand(0));
+
+ // Keep a running tab of the total offset to coalesce multiple N = N + Offset
+ // into a single N = N + TotalOffset.
+ uint64_t TotalOffs = 0;
+ Type *Ty = I->getOperand(0)->getType();
+ MVT VT = TLI.getPointerTy();
+ for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) {
+ const Value *Idx = *OI;
+ if (auto *StTy = dyn_cast<StructType>(Ty)) {
+ unsigned Field = cast<ConstantInt>(Idx)->getZExtValue();
+ // N = N + Offset
+ if (Field)
+ TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);
+ Ty = StTy->getElementType(Field);
+ } else {
+ Ty = cast<SequentialType>(Ty)->getElementType();
+ // If this is a constant subscript, handle it quickly.
+ if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
+ if (CI->isZero())
+ continue;
+ // N = N + Offset
+ TotalOffs +=
+ DL.getTypeAllocSize(Ty) * cast<ConstantInt>(CI)->getSExtValue();
+ continue;
+ }
+ if (TotalOffs) {
+ N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
+ if (!N)
+ return false;
+ NIsKill = true;
+ TotalOffs = 0;
+ }
+
+ // N = N + Idx * ElementSize;
+ uint64_t ElementSize = DL.getTypeAllocSize(Ty);
+ std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx);
+ unsigned IdxN = Pair.first;
+ bool IdxNIsKill = Pair.second;
+ if (!IdxN)
+ return false;
+
+ if (ElementSize != 1) {
+ unsigned C = fastEmit_i(VT, VT, ISD::Constant, ElementSize);
+ if (!C)
+ return false;
+ IdxN = emitMul_rr(VT, IdxN, IdxNIsKill, C, true);
+ if (!IdxN)
+ return false;
+ IdxNIsKill = true;
+ }
+ N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill);
+ if (!N)
+ return false;
+ }
+ }
+ if (TotalOffs) {
+ N = emitAdd_ri_(VT, N, NIsKill, TotalOffs);
+ if (!N)
+ return false;
+ }
+ updateValueMap(I, N);
return true;
}
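The running TotalOffs means any run of constant indices collapses into a single emitAdd_ri_ rather than one add per index. A small worked example of the offset arithmetic (hypothetical struct layout, chosen to match a typical AArch64 DataLayout; not from the patch):

```cpp
#include <cassert>
#include <cstdint>

// getelementptr {i32, [4 x i64]}* %p, i64 1, i32 1, i64 2
// Layout: i32 at offset 0, 4 bytes of padding, [4 x i64] at offset 8;
// total struct size 40 bytes.
int main() {
  const uint64_t StructSize = 40;
  uint64_t TotalOffs = 0;
  TotalOffs += 1 * StructSize; // outer index: N + Idx * ElementSize
  TotalOffs += 8;              // field 1: StructLayout element offset
  TotalOffs += 2 * 8;          // constant array index: 2 * sizeof(i64)
  assert(TotalOffs == 64);     // folded into a single N = N + 64
  return 0;
}
```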
-bool AArch64FastISel::TargetSelectInstruction(const Instruction *I) {
+bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
switch (I->getOpcode()) {
default:
break;
- case Instruction::Load:
- return SelectLoad(I);
- case Instruction::Store:
- return SelectStore(I);
+ case Instruction::Add:
+ case Instruction::Sub:
+ return selectAddSub(I);
+ case Instruction::Mul:
+ return selectMul(I);
+ case Instruction::SDiv:
+ return selectSDiv(I);
+ case Instruction::SRem:
+ if (!selectBinaryOp(I, ISD::SREM))
+ return selectRem(I, ISD::SREM);
+ return true;
+ case Instruction::URem:
+ if (!selectBinaryOp(I, ISD::UREM))
+ return selectRem(I, ISD::UREM);
+ return true;
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ return selectShift(I);
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return selectLogicalOp(I);
case Instruction::Br:
- return SelectBranch(I);
+ return selectBranch(I);
case Instruction::IndirectBr:
- return SelectIndirectBr(I);
- case Instruction::FCmp:
- case Instruction::ICmp:
- return SelectCmp(I);
- case Instruction::Select:
- return SelectSelect(I);
- case Instruction::FPExt:
- return SelectFPExt(I);
- case Instruction::FPTrunc:
- return SelectFPTrunc(I);
+ return selectIndirectBr(I);
+ case Instruction::BitCast:
+ if (!FastISel::selectBitCast(I))
+ return selectBitCast(I);
+ return true;
case Instruction::FPToSI:
- return SelectFPToInt(I, /*Signed=*/true);
+ if (!selectCast(I, ISD::FP_TO_SINT))
+ return selectFPToInt(I, /*Signed=*/true);
+ return true;
case Instruction::FPToUI:
- return SelectFPToInt(I, /*Signed=*/false);
+ return selectFPToInt(I, /*Signed=*/false);
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ return selectIntExt(I);
+ case Instruction::Trunc:
+ if (!selectCast(I, ISD::TRUNCATE))
+ return selectTrunc(I);
+ return true;
+ case Instruction::FPExt:
+ return selectFPExt(I);
+ case Instruction::FPTrunc:
+ return selectFPTrunc(I);
case Instruction::SIToFP:
- return SelectIntToFP(I, /*Signed=*/true);
+ if (!selectCast(I, ISD::SINT_TO_FP))
+ return selectIntToFP(I, /*Signed=*/true);
+ return true;
case Instruction::UIToFP:
- return SelectIntToFP(I, /*Signed=*/false);
- case Instruction::SRem:
- return SelectRem(I, ISD::SREM);
- case Instruction::URem:
- return SelectRem(I, ISD::UREM);
- case Instruction::Call:
- if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
- return SelectIntrinsicCall(*II);
- return SelectCall(I);
+ return selectIntToFP(I, /*Signed=*/false);
+ case Instruction::Load:
+ return selectLoad(I);
+ case Instruction::Store:
+ return selectStore(I);
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ return selectCmp(I);
+ case Instruction::Select:
+ return selectSelect(I);
case Instruction::Ret:
- return SelectRet(I);
- case Instruction::Trunc:
- return SelectTrunc(I);
- case Instruction::ZExt:
- case Instruction::SExt:
- return SelectIntExt(I);
- case Instruction::Mul:
- // FIXME: This really should be handled by the target-independent selector.
- return SelectMul(I);
+ return selectRet(I);
+ case Instruction::FRem:
+ return selectFRem(I);
+ case Instruction::GetElementPtr:
+ return selectGetElementPtr(I);
}
- return false;
+
+ // Fall back to target-independent instruction selection.
+ return selectOperator(I, I->getOpcode());
// Silence warnings.
(void)&CC_AArch64_DarwinPCS_VarArg;
}
namespace llvm {
-llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &funcInfo,
- const TargetLibraryInfo *libInfo) {
- return new AArch64FastISel(funcInfo, libInfo);
+llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo) {
+ return new AArch64FastISel(FuncInfo, LibInfo);
}
}
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 9c33717..66aa216 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -17,16 +17,16 @@
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -86,13 +86,14 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
#ifndef NDEBUG
- const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
assert(!RegInfo->needsStackRealignment(MF) &&
"No stack realignment on AArch64!");
#endif
return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
- MFI->isFrameAddressTaken());
+ MFI->isFrameAddressTaken() || MFI->hasStackMap() ||
+ MFI->hasPatchPoint());
}
/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
@@ -109,13 +110,13 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
const AArch64InstrInfo *TII =
- static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+ static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
DebugLoc DL = I->getDebugLoc();
int Opc = I->getOpcode();
bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
if (!TFI->hasReservedCallFrame(MF)) {
unsigned Align = getStackAlignment();
@@ -131,7 +132,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr(
// FIXME: in-function stack adjustment for calls is limited to 24-bits
// because there's no guaranteed temporary register available.
//
- // ADD/SUB (immediate) has only LSL #0 and LSL #12 avaiable.
+ // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
// 1) For offset <= 12-bit, we use LSL #0
// 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
// LSL #0, and the other uses LSL #12.
@@ -158,7 +159,7 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
MachineFrameInfo *MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
- const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
DebugLoc DL = MBB.findDebugLoc(MBBI);
// Add callee saved registers to move list.
@@ -166,7 +167,7 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
if (CSI.empty())
return;
- const DataLayout *TD = MF.getTarget().getDataLayout();
+ const DataLayout *TD = MF.getSubtarget().getDataLayout();
bool HasFP = hasFP(MF);
// Calculate amount of bytes used for return address storing.
@@ -195,7 +196,8 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
nullptr, DwarfReg, Offset - TotalSkipped));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
}
}
@@ -205,8 +207,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const Function *Fn = MF.getFunction();
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getTarget().getRegisterInfo());
- const TargetInstrInfo *TII = MF.getTarget().getInstrInfo();
+ MF.getSubtarget().getRegisterInfo());
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
@@ -233,7 +235,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
unsigned CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
} else if (NumBytes) {
++NumRedZoneFunctions;
}
@@ -300,7 +303,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
TII->copyPhysReg(MBB, MBBI, DL, AArch64::X19, AArch64::SP, false);
if (needsFrameMoves) {
- const DataLayout *TD = MF.getTarget().getDataLayout();
+ const DataLayout *TD = MF.getSubtarget().getDataLayout();
const int StackGrowth = -TD->getPointerSize(0);
unsigned FramePtr = RegInfo->getFrameRegister(MF);
@@ -376,26 +379,30 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
unsigned CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
// Record the location of the stored LR
unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true);
CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createOffset(nullptr, LR, StackGrowth));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
// Record the location of the stored FP
CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
} else {
// Encode the stack size of the leaf function.
unsigned CFIIndex = MMI.addFrameInst(
MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize()));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
}
// Now emit the moves for whatever callee saved regs we have.
@@ -435,9 +442,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
MachineFrameInfo *MFI = MF.getFrameInfo();
const AArch64InstrInfo *TII =
- static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+ static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getTarget().getRegisterInfo());
+ MF.getSubtarget().getRegisterInfo());
DebugLoc DL = MBBI->getDebugLoc();
unsigned RetOpcode = MBBI->getOpcode();
@@ -548,7 +555,7 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
bool PreferFP) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getTarget().getRegisterInfo());
+ MF.getSubtarget().getRegisterInfo());
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
int FPOffset = MFI->getObjectOffset(FI) + 16;
int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
@@ -617,7 +624,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
- const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
unsigned Count = CSI.size();
DebugLoc DL;
assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
@@ -693,7 +700,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
- const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
unsigned Count = CSI.size();
DebugLoc DL;
assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
@@ -761,7 +768,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(
MachineFunction &MF, RegScavenger *RS) const {
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
- MF.getTarget().getRegisterInfo());
+ MF.getSubtarget().getRegisterInfo());
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
MachineRegisterInfo *MRI = &MF.getRegInfo();
SmallVector<unsigned, 4> UnspilledCSGPRs;
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 7686e6f..df3875f 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64_FRAMELOWERING_H
-#define AArch64_FRAMELOWERING_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
#include "llvm/Target/TargetFrameLowering.h"
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 3f49fab..bb2e1e2 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -303,7 +303,7 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
/// \brief Determine whether it is worth folding V into an extended register.
bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
- // it hurts if the a value is used at least twice, unless we are optimizing
+ // it hurts if the value is used at least twice, unless we are optimizing
// for code size.
if (ForCodeSize || V.hasOneUse())
return true;
@@ -569,6 +569,27 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
return isWorthFolding(N);
}
+/// If there's a use of this ADDlow that's not itself a load/store then we'll
+/// need to create a real ADD instruction from it anyway and there's no point in
+/// folding it into the mem op. Theoretically, it shouldn't matter, but there's
+/// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding
+/// leads to duplicated ADRP instructions.
+static bool isWorthFoldingADDlow(SDValue N) {
+ for (auto Use : N->uses()) {
+ if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE &&
+ Use->getOpcode() != ISD::ATOMIC_LOAD &&
+ Use->getOpcode() != ISD::ATOMIC_STORE)
+ return false;
+
+ // ldar and stlr have much more restrictive addressing modes (just a
+ // register).
+ if (cast<MemSDNode>(Use)->getOrdering() > Monotonic)
+ return false;
+ }
+
+ return true;
+}
+
/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
/// immediate" address. The "Size" argument is the size in bytes of the memory
/// reference, which determines the scale.
@@ -582,7 +603,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
return true;
}
- if (N.getOpcode() == AArch64ISD::ADDlow) {
+ if (N.getOpcode() == AArch64ISD::ADDlow && isWorthFoldingADDlow(N)) {
GlobalAddressSDNode *GAN =
dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
Base = N.getOperand(0);
@@ -594,7 +615,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
unsigned Alignment = GV->getAlignment();
const DataLayout *DL = TLI->getDataLayout();
Type *Ty = GV->getType()->getElementType();
- if (Alignment == 0 && Ty->isSized() && !Subtarget->isTargetDarwin())
+ if (Alignment == 0 && Ty->isSized())
Alignment = DL->getABITypeAlignment(Ty);
if (Alignment >= Size)
@@ -777,6 +798,21 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
return false;
}
+// Check if the given immediate is preferred by ADD. Return true if the
+// immediate can be encoded in an ADD, or if it can be encoded in an
+// "ADD LSL #12" but cannot be encoded by a single MOVZ.
+static bool isPreferredADD(int64_t ImmOff) {
+ // Constant in [0x0, 0xfff] can be encoded in ADD.
+ if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
+ return true;
+ // Check if it can be encoded in an "ADD LSL #12".
+ if ((ImmOff & 0xffffffffff000fffLL) == 0x0LL)
+ // As a single MOVZ is faster than an "ADD LSL #12", ignore such constants.
+ return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
+ (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
+ return false;
+}
+
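Spelled out, the three masks test "fits in 12 bits", "fits in 12 bits shifted left by 12", and "is not also reachable with one MOVZ" (a 16-bit immediate at LSL #0 or LSL #16). An equivalent restatement with named masks and a few check values (illustrative, not from the patch):

```cpp
#include <cassert>
#include <cstdint>

static bool preferADD(int64_t ImmOff) {
  uint64_t V = static_cast<uint64_t>(ImmOff);
  if ((V & ~0xfffULL) == 0)        // fits ADD #imm12
    return true;
  if ((V & ~0xfff000ULL) == 0)     // fits ADD #imm12, LSL #12 ...
    return (V & ~0xffffULL) != 0 &&      // ... and no MOVZ #imm16, LSL #0
           (V & ~0xffff0000ULL) != 0;    // ... and no MOVZ #imm16, LSL #16
  return false;
}

int main() {
  assert(preferADD(0xfff));     // a single ADD
  assert(preferADD(0x11000));   // ADD LSL #12; no single MOVZ covers it
  assert(!preferADD(0xf000));   // MOVZ #0xf000 is cheaper
  assert(!preferADD(0xff0000)); // MOVZ #0xff, LSL #16 is cheaper
  return 0;
}
```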
bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
SDValue &Base, SDValue &Offset,
SDValue &SignExtend,
@@ -786,11 +822,6 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
SDValue LHS = N.getOperand(0);
SDValue RHS = N.getOperand(1);
- // We don't want to match immediate adds here, because they are better lowered
- // to the register-immediate addressing modes.
- if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
- return false;
-
// Check if this particular node is reused in any non-memory related
// operation. If yes, do not try to fold this node into the address
// computation, since the computation will be kept.
@@ -800,6 +831,36 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
return false;
}
+ // Watch out if RHS is a wide immediate: it cannot be selected into the
+ // [BaseReg+Imm] addressing mode, and it may not be encodable in ADD/SUB
+ // either. Instead it will use the [BaseReg + 0] address mode and generate
+ // instructions like:
+ // MOV X0, WideImmediate
+ // ADD X1, BaseReg, X0
+ // LDR X2, [X1, 0]
+ // In such situations, using the [BaseReg, XReg] addressing mode saves one
+ // ADD/SUB:
+ // MOV X0, WideImmediate
+ // LDR X2, [BaseReg, X0]
+ if (isa<ConstantSDNode>(RHS)) {
+ int64_t ImmOff = (int64_t)dyn_cast<ConstantSDNode>(RHS)->getZExtValue();
+ unsigned Scale = Log2_32(Size);
+ // Skip if the immediate can be selected by the load/store addressing mode.
+ // Also skip if the immediate can be encoded by a single ADD (SUB is also
+ // checked by using -ImmOff).
+ if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) ||
+ isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
+ return false;
+
+ SDLoc DL(N.getNode());
+ SDValue Ops[] = { RHS };
+ SDNode *MOVI =
+ CurDAG->getMachineNode(AArch64::MOVi64imm, DL, MVT::i64, Ops);
+ SDValue MOVIV = SDValue(MOVI, 0);
+ // This ADD of two X registers will be selected into the [Reg+Reg] mode.
+ N = CurDAG->getNode(ISD::ADD, DL, MVT::i64, LHS, MOVIV);
+ }
+
// Remember if it is worth folding N when it produces extended register.
bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
@@ -1381,20 +1442,21 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
return true;
}
-static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
- unsigned &LSB, unsigned &MSB) {
- // We are looking for the following pattern which basically extracts a single
- // bit from the source value and places it in the LSB of the destination
- // value, all other bits of the destination value or set to zero:
+static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc,
+ SDValue &Opd0, unsigned &LSB,
+ unsigned &MSB) {
+ // We are looking for the following pattern, which basically extracts several
+ // contiguous bits from the source value and places them at the LSB of the
+ // destination value; all other bits of the destination value are set to zero:
//
// Value2 = AND Value, MaskImm
// SRL Value2, ShiftImm
//
- // with MaskImm >> ShiftImm == 1.
+ // where MaskImm >> ShiftImm is a contiguous mask that determines the bit width.
//
// This gets selected into a single UBFM:
//
- // UBFM Value, ShiftImm, ShiftImm
+ // UBFM Value, ShiftImm, BitWide + Srl_imm - 1
//
if (N->getOpcode() != ISD::SRL)
@@ -1410,15 +1472,16 @@ static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
if (!isIntImmediate(N->getOperand(1), Srl_imm))
return false;
- // Check whether we really have a one bit extract here.
- if (And_mask >> Srl_imm == 0x1) {
+ // Check whether we really have a contiguous several-bit extract here.
+ unsigned BitWide = 64 - CountLeadingOnes_64(~(And_mask >> Srl_imm));
+ if (BitWide && isMask_64(And_mask >> Srl_imm)) {
if (N->getValueType(0) == MVT::i32)
Opc = AArch64::UBFMWri;
else
Opc = AArch64::UBFMXri;
- LSB = MSB = Srl_imm;
-
+ LSB = Srl_imm;
+ MSB = BitWide + Srl_imm - 1;
return true;
}
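In other words: when And_mask >> Srl_imm is a contiguous low mask of BitWide bits, the AND + SRL pair is exactly UBFM Value, Srl_imm, Srl_imm + BitWide - 1. A plain restatement of the width computation with two check values (illustrative, not from the patch):

```cpp
#include <cassert>
#include <cstdint>

// Returns true and sets [LSB, MSB] if (Value & AndMask) >> SrlImm is a
// contiguous bitfield extract, i.e. AndMask >> SrlImm == 2^BitWide - 1.
static bool matchSeveralBitsExtract(uint64_t AndMask, unsigned SrlImm,
                                    unsigned &LSB, unsigned &MSB) {
  uint64_t M = AndMask >> SrlImm;
  if (M == 0 || (M & (M + 1)) != 0) // must be a nonempty low mask
    return false;
  unsigned BitWide = 0;             // same as 64 - CountLeadingOnes_64(~M)
  while (BitWide < 64 && (M >> BitWide))
    ++BitWide;
  LSB = SrlImm;
  MSB = SrlImm + BitWide - 1;
  return true;
}

int main() {
  unsigned LSB, MSB;
  // (x & 0xff0) >> 4 extracts bits <11:4>: UBFM x, 4, 11.
  assert(matchSeveralBitsExtract(0xff0, 4, LSB, MSB) && LSB == 4 && MSB == 11);
  // (x & 0x5) >> 1 is not contiguous after the shift, so no single UBFM.
  assert(!matchSeveralBitsExtract(0x5, 1, LSB, MSB));
  return 0;
}
```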
@@ -1439,8 +1502,8 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
assert((VT == MVT::i32 || VT == MVT::i64) &&
"Type checking must have been done before calling this function");
- // Check for AND + SRL doing a one bit extract.
- if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
+ // Check for AND + SRL doing a several-bit extract.
+ if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
return true;
// we're looking for a shift of a shift
@@ -2116,7 +2179,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case 32:
SubReg = AArch64::ssub;
break;
- case 16: // FALLTHROUGH
+ case 16:
+ SubReg = AArch64::hsub;
+ break;
case 8:
llvm_unreachable("unexpected zext-requiring extract element!");
}
@@ -2204,9 +2269,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
@@ -2222,9 +2287,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
@@ -2240,9 +2305,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
@@ -2258,9 +2323,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
@@ -2276,9 +2341,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
@@ -2294,9 +2359,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
@@ -2312,9 +2377,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
@@ -2330,9 +2395,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
@@ -2348,9 +2413,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
@@ -2364,7 +2429,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_neon_ld2lane:
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectLoadLane(Node, 2, AArch64::LD2i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectLoadLane(Node, 2, AArch64::LD2i16);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2376,7 +2442,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_neon_ld3lane:
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectLoadLane(Node, 3, AArch64::LD3i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectLoadLane(Node, 3, AArch64::LD3i16);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2388,7 +2455,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_neon_ld4lane:
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectLoadLane(Node, 4, AArch64::LD4i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectLoadLane(Node, 4, AArch64::LD4i16);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2448,9 +2516,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectStore(Node, 2, AArch64::ST1Twov8b);
else if (VT == MVT::v16i8)
return SelectStore(Node, 2, AArch64::ST1Twov16b);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectStore(Node, 2, AArch64::ST1Twov4h);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectStore(Node, 2, AArch64::ST1Twov8h);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectStore(Node, 2, AArch64::ST1Twov2s);
@@ -2467,9 +2535,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectStore(Node, 3, AArch64::ST1Threev8b);
else if (VT == MVT::v16i8)
return SelectStore(Node, 3, AArch64::ST1Threev16b);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectStore(Node, 3, AArch64::ST1Threev4h);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectStore(Node, 3, AArch64::ST1Threev8h);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectStore(Node, 3, AArch64::ST1Threev2s);
@@ -2486,9 +2554,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectStore(Node, 4, AArch64::ST1Fourv8b);
else if (VT == MVT::v16i8)
return SelectStore(Node, 4, AArch64::ST1Fourv16b);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectStore(Node, 4, AArch64::ST1Fourv4h);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectStore(Node, 4, AArch64::ST1Fourv8h);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectStore(Node, 4, AArch64::ST1Fourv2s);
@@ -2505,9 +2573,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectStore(Node, 2, AArch64::ST2Twov8b);
else if (VT == MVT::v16i8)
return SelectStore(Node, 2, AArch64::ST2Twov16b);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectStore(Node, 2, AArch64::ST2Twov4h);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectStore(Node, 2, AArch64::ST2Twov8h);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectStore(Node, 2, AArch64::ST2Twov2s);
@@ -2524,9 +2592,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectStore(Node, 3, AArch64::ST3Threev8b);
else if (VT == MVT::v16i8)
return SelectStore(Node, 3, AArch64::ST3Threev16b);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectStore(Node, 3, AArch64::ST3Threev4h);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectStore(Node, 3, AArch64::ST3Threev8h);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectStore(Node, 3, AArch64::ST3Threev2s);
@@ -2543,9 +2611,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectStore(Node, 4, AArch64::ST4Fourv8b);
else if (VT == MVT::v16i8)
return SelectStore(Node, 4, AArch64::ST4Fourv16b);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectStore(Node, 4, AArch64::ST4Fourv4h);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectStore(Node, 4, AArch64::ST4Fourv8h);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectStore(Node, 4, AArch64::ST4Fourv2s);
@@ -2560,7 +2628,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_neon_st2lane: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectStoreLane(Node, 2, AArch64::ST2i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectStoreLane(Node, 2, AArch64::ST2i16);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2573,7 +2642,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_neon_st3lane: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectStoreLane(Node, 3, AArch64::ST3i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectStoreLane(Node, 3, AArch64::ST3i16);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2586,7 +2656,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::aarch64_neon_st4lane: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectStoreLane(Node, 4, AArch64::ST4i8);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectStoreLane(Node, 4, AArch64::ST4i16);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2603,9 +2674,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
@@ -2622,9 +2693,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
@@ -2641,9 +2712,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
@@ -2660,9 +2731,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
@@ -2679,9 +2750,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
@@ -2698,9 +2769,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
@@ -2717,9 +2788,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
@@ -2736,9 +2807,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
@@ -2755,9 +2826,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
@@ -2774,9 +2845,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
else if (VT == MVT::v16i8)
return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
@@ -2791,7 +2862,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case AArch64ISD::LD1LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2804,7 +2876,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case AArch64ISD::LD2LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2817,7 +2890,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case AArch64ISD::LD3LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2830,7 +2904,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
case AArch64ISD::LD4LANEpost: {
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2846,9 +2921,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
else if (VT == MVT::v16i8)
return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
@@ -2866,9 +2941,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
else if (VT == MVT::v16i8)
return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
@@ -2886,9 +2961,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
else if (VT == MVT::v16i8)
return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
@@ -2906,9 +2981,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
else if (VT == MVT::v16i8)
return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
@@ -2926,9 +3001,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
else if (VT == MVT::v16i8)
return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
@@ -2946,9 +3021,9 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
else if (VT == MVT::v16i8)
return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
- else if (VT == MVT::v4i16)
+ else if (VT == MVT::v4i16 || VT == MVT::v4f16)
return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
- else if (VT == MVT::v8i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v8f16)
return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
else if (VT == MVT::v2i32 || VT == MVT::v2f32)
return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
@@ -2964,7 +3039,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2978,7 +3054,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
@@ -2992,7 +3069,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
VT = Node->getOperand(1).getValueType();
if (VT == MVT::v16i8 || VT == MVT::v8i8)
return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
- else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+ else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8f16)
return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
VT == MVT::v2f32)
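Note: the several hundred lines of AArch64ISelDAGToDAG.cpp changes above are one mechanical edit repeated across every structured load/store selector: v4f16 and v8f16 reuse the same .4h/.8h instruction forms as v4i16 and v8i16. A minimal sketch of what this enables, assuming a toolchain whose arm_neon.h provides the float16 intrinsics:

    #include <arm_neon.h>

    // With the selector changes above, a half-precision de-interleaving
    // load can be matched to LD2Twov4h just like its integer twin.
    float16x4x2_t load_pair(const float16_t *p) {
      return vld2_f16(p);   // expected: ld2 { v0.4h, v1.4h }, [x0]
    }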
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index f2004ea..0d44f99 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12,9 +12,10 @@
//===----------------------------------------------------------------------===//
#include "AArch64ISelLowering.h"
+#include "AArch64CallingConvention.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64Subtarget.h"
-#include "AArch64MachineFunctionInfo.h"
#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
@@ -38,10 +39,12 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+namespace {
enum AlignMode {
StrictAlign,
NoStrictAlign
};
+}
static cl::opt<AlignMode>
Align(cl::desc("Load/store alignment support"),
@@ -64,18 +67,9 @@ EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
cl::desc("Allow AArch64 SLI/SRI formation"),
cl::init(false));
-//===----------------------------------------------------------------------===//
-// AArch64 Lowering public interface.
-//===----------------------------------------------------------------------===//
-static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
- if (TT.isOSBinFormatMachO())
- return new AArch64_MachoTargetObjectFile();
-
- return new AArch64_ELFTargetObjectFile();
-}
-AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
- : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) {
+AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
+ : TargetLowering(TM) {
Subtarget = &TM.getSubtarget<AArch64Subtarget>();
// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
@@ -106,6 +100,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
addDRTypeForNEON(MVT::v2i32);
addDRTypeForNEON(MVT::v1i64);
addDRTypeForNEON(MVT::v1f64);
+ addDRTypeForNEON(MVT::v4f16);
addQRTypeForNEON(MVT::v4f32);
addQRTypeForNEON(MVT::v2f64);
@@ -113,6 +108,7 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
addQRTypeForNEON(MVT::v8i16);
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
+ addQRTypeForNEON(MVT::v8f16);
}
// Compute derived properties from the register classes
@@ -278,6 +274,94 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+ // f16 is storage-only, so we promote operations to f32 if we know this is
+ // valid, and ignore them otherwise. The operations not mentioned here will
+ // fail to select, but this is not a major problem as no source language
+ // should be emitting native f16 operations yet.
+ setOperationAction(ISD::FADD, MVT::f16, Promote);
+ setOperationAction(ISD::FDIV, MVT::f16, Promote);
+ setOperationAction(ISD::FMUL, MVT::f16, Promote);
+ setOperationAction(ISD::FSUB, MVT::f16, Promote);
+
+ // v4f16 is also a storage-only type, so promote it to v4f32 when that is
+ // known to be safe.
+ setOperationAction(ISD::FADD, MVT::v4f16, Promote);
+ setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
+ setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
+ setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
+ AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
+ AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
+
+ // Expand all other v4f16 operations.
+ // FIXME: We could generate better code by promoting some operations to
+ // a pair of v4f32s
+ setOperationAction(ISD::FABS, MVT::v4f16, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
+ setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
+ setOperationAction(ISD::FMA, MVT::v4f16, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
+ setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
+ setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
+ setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
+ setOperationAction(ISD::FREM, MVT::v4f16, Expand);
+ setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
+ setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
+ setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
+ setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
+ setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
+ setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
+ setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
+ setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
+ setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
+
+ // v8f16 is also a storage-only type, so expand it.
+ setOperationAction(ISD::FABS, MVT::v8f16, Expand);
+ setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+ setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
+ setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
+ setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
+ setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
+ setOperationAction(ISD::FMA, MVT::v8f16, Expand);
+ setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+ setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
+ setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
+ setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
+ setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
+ setOperationAction(ISD::FREM, MVT::v8f16, Expand);
+ setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
+ setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+ setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
+ setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
+ setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
+ setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
+ setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
+ setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
+ setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
+ setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
+ setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
+
// AArch64 has implementations of a lot of rounding-like FP operations.
static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
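Note: the f16/v4f16/v8f16 actions above boil down to one rule: half precision is a storage format, so any arithmetic that is kept must round-trip through f32. A hedged sketch of the resulting code shape for a scalar add (exact instructions depend on the subtarget):

    // Promote means: widen both operands, operate in single precision,
    // narrow the result back to half.
    __fp16 add_halves(__fp16 a, __fp16 b) {
      return a + b;  // fcvt s0, h0 ; fcvt s1, h1 ; fadd s0, s0, s1 ; fcvt h0, s0
    }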
@@ -303,13 +387,24 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
}
+ // Make floating-point constants legal for the large code model, so they don't
+ // become loads from the constant pool.
+ if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+ }
+
// AArch64 does not have floating-point extending loads, i1 sign-extending
// load, floating-point truncating stores, or v2i32->v2i16 truncating store.
- setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
- setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
- setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+ for (MVT VT : MVT::fp_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
+ }
+ for (MVT VT : MVT::integer_valuetypes())
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
+
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
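Note: marking EXTLOAD as Expand for every FP type pair means an extending load is always split into a plain load followed by a convert; a small illustration of the assumed codegen:

    // No single fp-extending load instruction exists, so this becomes
    // two instructions: a half load and a widening convert.
    float widen(const __fp16 *p) {
      return *p;   // ldr h0, [x0] ; fcvt s0, h0
    }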
@@ -439,30 +534,31 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
// AArch64 doesn't have MUL.2d:
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+ // Custom handling for some quad-vector types to detect MULL.
+ setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+ setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
// directly.
- for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
-
- setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
- Expand);
-
- setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
- setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
- setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
- setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
-
- setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
-
- for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
- InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
- setTruncStoreAction((MVT::SimpleValueType)VT,
- (MVT::SimpleValueType)InnerVT, Expand);
- setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
- setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
- setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+ for (MVT VT : MVT::vector_valuetypes()) {
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+
+ setOperationAction(ISD::MULHS, VT, Expand);
+ setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+ setOperationAction(ISD::BSWAP, VT, Expand);
+
+ for (MVT InnerVT : MVT::vector_valuetypes()) {
+ setTruncStoreAction(VT, InnerVT, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
+ }
}
// AArch64 has implementations of a lot of rounding-like FP operations.
@@ -477,16 +573,20 @@ AArch64TargetLowering::AArch64TargetLowering(TargetMachine &TM)
setOperationAction(ISD::FROUND, Ty, Legal);
}
}
+
+ // Prefer likely predicted branches to selects on out-of-order cores.
+ if (Subtarget->isCortexA57())
+ PredictableSelectIsExpensive = true;
}
void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
- if (VT == MVT::v2f32) {
+ if (VT == MVT::v2f32 || VT == MVT::v4f16) {
setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
- } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+ } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
@@ -523,7 +623,8 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
- setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
+ for (MVT InnerVT : MVT::all_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);
// CNT supports only B element sizes.
if (VT != MVT::v8i8 && VT != MVT::v16i8)
@@ -727,6 +828,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
+ case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
@@ -756,6 +858,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
+ case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
+ case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
}
}
@@ -774,7 +878,8 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
// EndBB:
// Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetInstrInfo *TII =
+ getTargetMachine().getSubtargetImpl()->getInstrInfo();
MachineFunction *MF = MBB->getParent();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
DebugLoc DL = MI->getDebugLoc();
@@ -1020,6 +1125,8 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+ SDValue Cmp;
+ AArch64CC::CondCode AArch64CC;
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
EVT VT = RHS.getValueType();
uint64_t C = RHSC->getZExtValue();
@@ -1051,9 +1158,9 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
break;
case ISD::SETLE:
case ISD::SETGT:
- if ((VT == MVT::i32 && C != 0x7fffffff &&
+ if ((VT == MVT::i32 && C != INT32_MAX &&
isLegalArithImmed((uint32_t)(C + 1))) ||
- (VT == MVT::i64 && C != 0x7ffffffffffffffULL &&
+ (VT == MVT::i64 && C != INT64_MAX &&
isLegalArithImmed(C + 1ULL))) {
CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
@@ -1062,9 +1169,9 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
break;
case ISD::SETULE:
case ISD::SETUGT:
- if ((VT == MVT::i32 && C != 0xffffffff &&
+ if ((VT == MVT::i32 && C != UINT32_MAX &&
isLegalArithImmed((uint32_t)(C + 1))) ||
- (VT == MVT::i64 && C != 0xfffffffffffffffULL &&
+ (VT == MVT::i64 && C != UINT64_MAX &&
isLegalArithImmed(C + 1ULL))) {
CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
@@ -1074,9 +1181,45 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
}
}
}
-
- SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
- AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+ // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
+ // For the i8 operand, the largest immediate is 255, so this can be easily
+ // encoded in the compare instruction. For the i16 operand, however, the
+ // largest immediate cannot be encoded in the compare.
+ // Therefore, use a sign extending load and cmn to avoid materializing the -1
+ // constant. For example,
+ // movz w1, #65535
+ // ldrh w0, [x0, #0]
+ // cmp w0, w1
+ // >
+ // ldrsh w0, [x0, #0]
+ // cmn w0, #1
+ // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
+ // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure
+ // both the LHS and RHS are truly zero extended and to make sure the
+ // transformation is profitable.
+ if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
+ if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) &&
+ isa<LoadSDNode>(LHS)) {
+ if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
+ cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
+ LHS.getNode()->hasNUsesOfValue(1, 0)) {
+ int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
+ if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
+ SDValue SExt =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
+ DAG.getValueType(MVT::i16));
+ Cmp = emitComparison(SExt,
+ DAG.getConstant(ValueofRHS, RHS.getValueType()),
+ CC, dl, DAG);
+ AArch64CC = changeIntCCToAArch64CC(CC);
+ AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
+ return Cmp;
+ }
+ }
+ }
+ }
+ Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+ AArch64CC = changeIntCCToAArch64CC(CC);
AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
return Cmp;
}
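Note: the asm pair in the comment above corresponds to source like the following; a hedged example, since the rewrite only fires when the zero-extending i16 load has a single use and the negated constant is a legal arithmetic immediate:

    // 0xffff cannot be encoded as a cmp immediate, but after
    // sign-extending the load it is a compare against -1, i.e. cmn #1.
    int is_all_ones(const unsigned short *p) {
      return *p == 0xffff;   // ldrsh w8, [x0] ; cmn w8, #1 ; cset w0, eq
    }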
@@ -1333,8 +1476,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- // The data thing is not used.
- // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+ unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
bool IsStream = !Locality;
// When the locality number is set
@@ -1349,6 +1491,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
// built the mask value encoding the expected behavior.
unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
+ (!IsData << 3) | // IsDataCache bit
(Locality << 1) | // Cache level bits
(unsigned)IsStream; // Stream bit
return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
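Note: the locality-to-cache-level remapping happens just above this hunk; once the fields are known, the operand packs as shown. A worked sketch of the encoding, with field values assumed from the surrounding code:

    // Mirrors the bit layout built above: load/store bit, data vs.
    // instruction cache, cache level, streaming hint.
    unsigned prfOp(unsigned IsWrite, unsigned IsData, unsigned Level,
                   unsigned IsStream) {
      return (IsWrite << 4) | (!IsData << 3) | (Level << 1) | IsStream;
    }
    // prfOp(0, 1, 0, 0) == 0  -> PLDL1KEEP
    // prfOp(0, 1, 0, 1) == 1  -> PLDL1STRM
    // prfOp(1, 1, 0, 0) == 16 -> PSTL1KEEP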
@@ -1400,7 +1543,10 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
if (VT.getSizeInBits() > InVT.getSizeInBits()) {
SDLoc dl(Op);
- SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+ MVT ExtVT =
+ MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
+ VT.getVectorNumElements());
+ SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
}
@@ -1505,7 +1651,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
(ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
- StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+ StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
.setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0);
@@ -1529,6 +1675,197 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
0);
}
+static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+ if (OrigVT.getSizeInBits() >= 64)
+ return OrigVT;
+
+ assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+ MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+ switch (OrigSimpleTy) {
+ default: llvm_unreachable("Unexpected Vector Type");
+ case MVT::v2i8:
+ case MVT::v2i16:
+ return MVT::v2i32;
+ case MVT::v4i8:
+ return MVT::v4i16;
+ }
+}
+
+static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
+ const EVT &OrigTy,
+ const EVT &ExtTy,
+ unsigned ExtOpcode) {
+ // The vector originally had a size of OrigTy. It was then extended to ExtTy.
+ // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
+ // 64-bits we need to insert a new extension so that it will be 64-bits.
+ assert(ExtTy.is128BitVector() && "Unexpected extension size");
+ if (OrigTy.getSizeInBits() >= 64)
+ return N;
+
+ // Must extend size to at least 64 bits to be used as an operand for VMULL.
+ EVT NewVT = getExtensionTo64Bits(OrigTy);
+
+ return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
+}
+
+static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
+ bool isSigned) {
+ EVT VT = N->getValueType(0);
+
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDNode *Elt = N->getOperand(i).getNode();
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ unsigned HalfSize = EltSize / 2;
+ if (isSigned) {
+ if (!isIntN(HalfSize, C->getSExtValue()))
+ return false;
+ } else {
+ if (!isUIntN(HalfSize, C->getZExtValue()))
+ return false;
+ }
+ continue;
+ }
+ return false;
+ }
+
+ return true;
+}
+
+static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
+ if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
+ return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
+ N->getOperand(0)->getValueType(0),
+ N->getValueType(0),
+ N->getOpcode());
+
+ assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
+ EVT VT = N->getValueType(0);
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2;
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT TruncVT = MVT::getIntegerVT(EltSize);
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
+ const APInt &CInt = C->getAPIntValue();
+ // Element types smaller than 32 bits are not legal, so use i32 elements.
+ // The values are implicitly truncated so sext vs. zext doesn't matter.
+ Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
+ MVT::getVectorVT(TruncVT, NumElts), Ops);
+}
+
+static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
+ if (N->getOpcode() == ISD::SIGN_EXTEND)
+ return true;
+ if (isExtendedBUILD_VECTOR(N, DAG, true))
+ return true;
+ return false;
+}
+
+static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
+ if (N->getOpcode() == ISD::ZERO_EXTEND)
+ return true;
+ if (isExtendedBUILD_VECTOR(N, DAG, false))
+ return true;
+ return false;
+}
+
+static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+ SDNode *N0 = N->getOperand(0).getNode();
+ SDNode *N1 = N->getOperand(1).getNode();
+ return N0->hasOneUse() && N1->hasOneUse() &&
+ isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
+ }
+ return false;
+}
+
+static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
+ SDNode *N0 = N->getOperand(0).getNode();
+ SDNode *N1 = N->getOperand(1).getNode();
+ return N0->hasOneUse() && N1->hasOneUse() &&
+ isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
+ }
+ return false;
+}
+
+static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
+ // Multiplications are only custom-lowered for 128-bit vectors so that
+ // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
+ EVT VT = Op.getValueType();
+ assert(VT.is128BitVector() && VT.isInteger() &&
+ "unexpected type for custom-lowering ISD::MUL");
+ SDNode *N0 = Op.getOperand(0).getNode();
+ SDNode *N1 = Op.getOperand(1).getNode();
+ unsigned NewOpc = 0;
+ bool isMLA = false;
+ bool isN0SExt = isSignExtended(N0, DAG);
+ bool isN1SExt = isSignExtended(N1, DAG);
+ if (isN0SExt && isN1SExt)
+ NewOpc = AArch64ISD::SMULL;
+ else {
+ bool isN0ZExt = isZeroExtended(N0, DAG);
+ bool isN1ZExt = isZeroExtended(N1, DAG);
+ if (isN0ZExt && isN1ZExt)
+ NewOpc = AArch64ISD::UMULL;
+ else if (isN1SExt || isN1ZExt) {
+ // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
+ // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
+ if (isN1SExt && isAddSubSExt(N0, DAG)) {
+ NewOpc = AArch64ISD::SMULL;
+ isMLA = true;
+ } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
+ NewOpc = AArch64ISD::UMULL;
+ isMLA = true;
+ } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
+ std::swap(N0, N1);
+ NewOpc = AArch64ISD::UMULL;
+ isMLA = true;
+ }
+ }
+
+ if (!NewOpc) {
+ if (VT == MVT::v2i64)
+ // Fall through to expand this. It is not legal.
+ return SDValue();
+ else
+ // Other vector multiplications are legal.
+ return Op;
+ }
+ }
+
+ // Legalize to a S/UMULL instruction
+ SDLoc DL(Op);
+ SDValue Op0;
+ SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
+ if (!isMLA) {
+ Op0 = skipExtensionForVectorMULL(N0, DAG);
+ assert(Op0.getValueType().is64BitVector() &&
+ Op1.getValueType().is64BitVector() &&
+ "unexpected types for extended operands to VMULL");
+ return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
+ }
+ // Optimizing (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
+ // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
+ // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57.
+ SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
+ SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
+ EVT Op1VT = Op1.getValueType();
+ return DAG.getNode(N0->getOpcode(), DL, VT,
+ DAG.getNode(NewOpc, DL, VT,
+ DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
+ DAG.getNode(NewOpc, DL, VT,
+ DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
+}
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
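Note: LowerMUL's job, in intrinsic form: when both operands of a 128-bit multiply are sign or zero extensions of 64-bit vectors, select the widening multiply directly. A minimal sketch; the intrinsic is simply the canonical way to produce the (sext a) * (sext b) pattern:

    #include <arm_neon.h>

    // Both operands arrive as sign extensions of v4i16, so the custom
    // lowering emits AArch64ISD::SMULL rather than extending both sides
    // and falling back to a (nonexistent) full-width v4i32 mul chain.
    int32x4_t widening_mul(int16x4_t a, int16x4_t b) {
      return vmull_s16(a, b);   // smull v0.4s, v0.4h, v1.4h
    }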
@@ -1629,6 +1966,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFP_TO_INT(Op, DAG);
case ISD::FSINCOS:
return LowerFSINCOS(Op, DAG);
+ case ISD::MUL:
+ return LowerMUL(Op, DAG);
}
}
@@ -1643,8 +1982,7 @@ unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
#include "AArch64GenCallingConv.inc"
-/// Selects the correct CCAssignFn for a the given CallingConvention
-/// value.
+/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) const {
switch (CC) {
@@ -1669,8 +2007,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
// At this point, Ins[].VT may already be promoted to i32. To correctly
// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
@@ -1774,10 +2112,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
} else { // VA.isRegLoc()
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
unsigned ArgOffset = VA.getLocMemOffset();
- unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+ unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
uint32_t BEAlign = 0;
- if (ArgSize < 8 && !Subtarget->isLittleEndian())
+ if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
+ !Ins[i].Flags.isInConsecutiveRegs())
BEAlign = 8 - ArgSize;
int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
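Note: the big-endian adjustment is simple arithmetic: a small argument occupies the high-address end of its 8-byte slot, so the fixed object is shifted by the slack. A hedged helper showing the values involved (hypothetical function, not in the source):

    // On big-endian AArch64 the low-order bytes of a small stack
    // argument sit at the high end of the 8-byte slot.
    unsigned beAlign(unsigned ArgSize) {
      return ArgSize < 8 ? 8 - ArgSize : 0;   // i8 -> 7, i16 -> 6, i32 -> 4
    }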
@@ -1809,7 +2148,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
MachinePointerInfo::getFixedStack(FI),
- MemVT, false, false, false, nullptr);
+ MemVT, false, false, false, 0);
InVals.push_back(ArgValue);
}
@@ -1941,8 +2280,8 @@ SDValue AArch64TargetLowering::LowerCallResult(
: RetCC_AArch64_AAPCS;
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC);
// Copy all of the result registers out of their specified physreg.
@@ -2011,6 +2350,21 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return false;
}
+ // Externally-defined functions with weak linkage should not be
+ // tail-called on AArch64 when the OS does not support dynamic
+ // pre-emption of symbols, as the AAELF spec requires normal calls
+ // to undefined weak functions to be replaced with a NOP or jump to the
+ // next instruction. The behaviour of branch instructions in this
+ // situation (as used for tail calls) is implementation-defined, so we
+ // cannot rely on the linker replacing the tail call with a return.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = G->getGlobal();
+ const Triple TT(getTargetMachine().getTargetTriple());
+ if (GV->hasExternalWeakLinkage() &&
+ (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
+ return false;
+ }
+
// Now we search for cases where we can use a tail call without changing the
// ABI. Sibcall is used in some places (particularly gcc) to refer to this
// concept.
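Note: a concrete case of the weak-linkage hazard above, as a hedged C example: the linker may rewrite a bl to an undefined weak symbol as a NOP, but there is no equally safe rewrite for a tail-call branch.

    // If maybe_hook stays undefined at link time, "bl maybe_hook" can be
    // replaced by a NOP and wrapper still returns normally; a tail call
    // "b maybe_hook" has no such guaranteed fixup under AAELF.
    extern void maybe_hook(void) __attribute__((weak));
    void wrapper(void) { maybe_hook(); }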
@@ -2028,8 +2382,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
@@ -2041,13 +2395,13 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
// results are returned in the same way as what the caller expects.
if (!CCMatch) {
SmallVector<CCValAssign, 16> RVLocs1;
- CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs1, *DAG.getContext());
+ CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
+ *DAG.getContext());
CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
SmallVector<CCValAssign, 16> RVLocs2;
- CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs2, *DAG.getContext());
+ CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
+ *DAG.getContext());
CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
if (RVLocs1.size() != RVLocs2.size())
@@ -2072,8 +2426,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
return true;
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
@@ -2170,8 +2524,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
- getTargetMachine(), ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
if (IsVarArg) {
// Handle fixed and variable vector arguments differently.
@@ -2316,9 +2670,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// common case. It should also work for fundamental types too.
uint32_t BEAlign = 0;
unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
- : VA.getLocVT().getSizeInBits();
+ : VA.getValVT().getSizeInBits();
OpSize = (OpSize + 7) / 8;
- if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
+ if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
+ !Flags.isInConsecutiveRegs()) {
if (OpSize < 8)
BEAlign = 8 - OpSize;
}
@@ -2350,8 +2705,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
SDValue Cpy = DAG.getMemcpy(
Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
- /*isVolatile = */ false,
- /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
+ /*isVol = */ false,
+ /*AlwaysInline = */ false, DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
@@ -2440,7 +2795,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo *TRI =
+ getTargetMachine().getSubtargetImpl()->getRegisterInfo();
const AArch64RegisterInfo *ARI =
static_cast<const AArch64RegisterInfo *>(TRI);
if (IsThisReturn) {
@@ -2494,7 +2850,7 @@ bool AArch64TargetLowering::CanLowerReturn(
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+ CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC);
}
@@ -2508,8 +2864,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
- getTargetMachine(), RVLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC);
// Copy the result values into the output registers.
@@ -2560,7 +2916,8 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy();
SDLoc DL(Op);
- const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+ const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+ const GlobalValue *GV = GN->getGlobal();
unsigned char OpFlags =
Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
@@ -2575,6 +2932,25 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
}
+ if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) {
+ assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+ "use of MO_CONSTPOOL only supported on small model");
+ SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE);
+ SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+ unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
+ SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags);
+ SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr,
+ MachinePointerInfo::getConstantPool(),
+ /*isVolatile=*/ false,
+ /*isNonTemporal=*/ true,
+ /*isInvariant=*/ true, 8);
+ if (GN->getOffset() != 0)
+ return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr,
+ DAG.getConstant(GN->getOffset(), PtrVT));
+ return GlobalAddr;
+ }
+
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
const unsigned char MO_NC = AArch64II::MO_NC;
return DAG.getNode(
@@ -2651,7 +3027,8 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo *TRI =
+ getTargetMachine().getSubtargetImpl()->getRegisterInfo();
const AArch64RegisterInfo *ARI =
static_cast<const AArch64RegisterInfo *>(TRI);
const uint32_t *Mask = ARI->getTLSCallPreservedMask();
@@ -2701,7 +3078,8 @@ SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
- const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterInfo *TRI =
+ getTargetMachine().getSubtargetImpl()->getRegisterInfo();
const AArch64RegisterInfo *ARI =
static_cast<const AArch64RegisterInfo *>(TRI);
const uint32_t *Mask = ARI->getTLSCallPreservedMask();
@@ -2916,11 +3294,6 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
-
- // TBZ only operates on i64's, but the ext should be free.
- if (Test.getValueType() == MVT::i32)
- Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
-
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
}
@@ -2936,18 +3309,29 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
-
- // TBNZ only operates on i64's, but the ext should be free.
- if (Test.getValueType() == MVT::i32)
- Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
-
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
}
return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
+ } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
+ // Don't combine AND since emitComparison converts the AND to an ANDS
+ // (a.k.a. TST) and the test in the test bit and branch instruction
+ // becomes redundant. This would also increase register pressure.
+ uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
+ return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
+ DAG.getConstant(Mask, MVT::i64), Dest);
}
}
+ if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
+ LHS.getOpcode() != ISD::AND) {
+ // Don't combine AND since emitComparison converts the AND to an ANDS
+ // (a.k.a. TST) and the test in the test bit and branch instruction
+ // becomes redundant. This would also increase register pressure.
+ uint64_t Mask = LHS.getValueType().getSizeInBits() - 1;
+ return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
+ DAG.getConstant(Mask, MVT::i64), Dest);
+ }
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
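Note: the new SETLT/SETGT-against-constant cases above turn a sign test into a test-bit branch; a hedged illustration:

    void handle_negative(int x);

    // x < 0 is exactly "top bit set", so instead of cmp w0, #0 followed
    // by b.lt, the compare folds into a tbnz on bit 31.
    void dispatch(int x) {
      if (x < 0)             // tbnz w0, #31, <taken>
        handle_negative(x);
    }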
@@ -3062,6 +3446,9 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
return SDValue();
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
// While there is no integer popcount instruction, it can
// be more efficiently lowered to the following sequence that uses
// AdvSIMD registers/instructions as long as the copies to/from
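Note: the sequence being guarded is only available with NEON; a sketch of the expected lowering for a scalar popcount:

    // Scalar CTPOP is lowered through the vector unit: move to an FP
    // register, count bits per byte, then sum the byte lanes.
    int popcount64(unsigned long long x) {
      return __builtin_popcountll(x);
      // fmov d0, x0 ; cnt v0.8b, v0.8b ; uaddlv h0, v0.8b ; fmov w0, s0
    }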
@@ -4013,8 +4400,10 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint(
return;
case 'J': {
uint64_t NVal = -C->getSExtValue();
- if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal))
+ if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
+ CVal = C->getSExtValue();
break;
+ }
return;
}
// The K and L constraints apply *only* to logical immediates, including
@@ -4138,10 +4527,30 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
- SmallVector<SDValue, 2> SourceVecs;
- SmallVector<unsigned, 2> MinElts;
- SmallVector<unsigned, 2> MaxElts;
+ struct ShuffleSourceInfo {
+ SDValue Vec;
+ unsigned MinElt;
+ unsigned MaxElt;
+
+ // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
+ // be compatible with the shuffle we intend to construct. As a result
+ // ShuffleVec will be some sliding window into the original Vec.
+ SDValue ShuffleVec;
+
+ // Code should guarantee that element i in Vec starts at element "WindowBase
+ // + i * WindowScale in ShuffleVec".
+ int WindowBase;
+ int WindowScale;
+
+ bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
+ ShuffleSourceInfo(SDValue Vec)
+ : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0),
+ WindowScale(1) {}
+ };
+ // First gather all vectors used as an immediate source for this BUILD_VECTOR
+ // node.
+ SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.getOpcode() == ISD::UNDEF)
@@ -4152,133 +4561,155 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
return SDValue();
}
- // Record this extraction against the appropriate vector if possible...
+ // Add this element source to the list if it's not already there.
SDValue SourceVec = V.getOperand(0);
- unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
- bool FoundSource = false;
- for (unsigned j = 0; j < SourceVecs.size(); ++j) {
- if (SourceVecs[j] == SourceVec) {
- if (MinElts[j] > EltNo)
- MinElts[j] = EltNo;
- if (MaxElts[j] < EltNo)
- MaxElts[j] = EltNo;
- FoundSource = true;
- break;
- }
- }
+ auto Source = std::find(Sources.begin(), Sources.end(), SourceVec);
+ if (Source == Sources.end())
+ Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
- // Or record a new source if not...
- if (!FoundSource) {
- SourceVecs.push_back(SourceVec);
- MinElts.push_back(EltNo);
- MaxElts.push_back(EltNo);
- }
+ // Update the minimum and maximum lane number seen.
+ unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+ Source->MinElt = std::min(Source->MinElt, EltNo);
+ Source->MaxElt = std::max(Source->MaxElt, EltNo);
}
// Currently only do something sane when at most two source vectors
- // involved.
- if (SourceVecs.size() > 2)
+ // are involved.
+ if (Sources.size() > 2)
return SDValue();
- SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
- int VEXTOffsets[2] = { 0, 0 };
- int OffsetMultipliers[2] = { 1, 1 };
-
- // This loop extracts the usage patterns of the source vectors
- // and prepares appropriate SDValues for a shuffle if possible.
- for (unsigned i = 0; i < SourceVecs.size(); ++i) {
- unsigned NumSrcElts = SourceVecs[i].getValueType().getVectorNumElements();
- SDValue CurSource = SourceVecs[i];
- if (SourceVecs[i].getValueType().getVectorElementType() !=
- VT.getVectorElementType()) {
- // It may hit this case if SourceVecs[i] is AssertSext/AssertZext.
- // Then bitcast it to the vector which holds asserted element type,
- // and record the multiplier of element width between SourceVecs and
- // Build_vector which is needed to extract the correct lanes later.
- EVT CastVT =
- EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
- SourceVecs[i].getValueSizeInBits() /
- VT.getVectorElementType().getSizeInBits());
-
- CurSource = DAG.getNode(ISD::BITCAST, dl, CastVT, SourceVecs[i]);
- OffsetMultipliers[i] = CastVT.getVectorNumElements() / NumSrcElts;
- NumSrcElts *= OffsetMultipliers[i];
- MaxElts[i] *= OffsetMultipliers[i];
- MinElts[i] *= OffsetMultipliers[i];
+ // Find out the smallest element size among result and two sources, and use
+ // it as element size to build the shuffle_vector.
+ EVT SmallestEltTy = VT.getVectorElementType();
+ for (auto &Source : Sources) {
+ EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
+ if (SrcEltTy.bitsLT(SmallestEltTy)) {
+ SmallestEltTy = SrcEltTy;
}
+ }
+ unsigned ResMultiplier =
+ VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits();
+ NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
+ EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
- if (CurSource.getValueType() == VT) {
- // No VEXT necessary
- ShuffleSrcs[i] = CurSource;
- VEXTOffsets[i] = 0;
+ // If the source vector is too wide or too narrow, we may nevertheless be able
+ // to construct a compatible shuffle either by concatenating it with UNDEF or
+ // extracting a suitable range of elements.
+ for (auto &Src : Sources) {
+ EVT SrcVT = Src.ShuffleVec.getValueType();
+
+ if (SrcVT.getSizeInBits() == VT.getSizeInBits())
continue;
- } else if (NumSrcElts < NumElts) {
+
+ // This stage of the search produces a source with the same element type as
+ // the original, but with a total width matching the BUILD_VECTOR output.
+ EVT EltVT = SrcVT.getVectorElementType();
+ unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
+ EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
+
+ if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
+ assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
      // We can pad out the smaller vector for free by concatenating it with
      // UNDEF, so do that and use the result as a shuffle source.
- ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, CurSource,
- DAG.getUNDEF(CurSource.getValueType()));
+ Src.ShuffleVec =
+ DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
+ DAG.getUNDEF(Src.ShuffleVec.getValueType()));
continue;
}
- // Since only 64-bit and 128-bit vectors are legal on ARM and
- // we've eliminated the other cases...
- assert(NumSrcElts == 2 * NumElts &&
- "unexpected vector sizes in ReconstructShuffle");
+ assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
- if (MaxElts[i] - MinElts[i] >= NumElts) {
+ if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
      // Span too large for a VEXT to cope with.
return SDValue();
}
- if (MinElts[i] >= NumElts) {
+ if (Src.MinElt >= NumSrcElts) {
// The extraction can just take the second half
- VEXTOffsets[i] = NumElts;
- ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
- DAG.getIntPtrConstant(NumElts));
- } else if (MaxElts[i] < NumElts) {
+ Src.ShuffleVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(NumSrcElts, MVT::i64));
+ Src.WindowBase = -NumSrcElts;
+ } else if (Src.MaxElt < NumSrcElts) {
// The extraction can just take the first half
- VEXTOffsets[i] = 0;
- ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
- DAG.getIntPtrConstant(0));
+ Src.ShuffleVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(0, MVT::i64));
} else {
// An actual VEXT is needed
- VEXTOffsets[i] = MinElts[i];
- SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
- DAG.getIntPtrConstant(0));
- SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CurSource,
- DAG.getIntPtrConstant(NumElts));
- unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
- ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
- DAG.getConstant(Imm, MVT::i32));
+ SDValue VEXTSrc1 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(0, MVT::i64));
+ SDValue VEXTSrc2 =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
+ DAG.getConstant(NumSrcElts, MVT::i64));
+ unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
+
+ Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
+ VEXTSrc2, DAG.getConstant(Imm, MVT::i32));
+ Src.WindowBase = -Src.MinElt;
}
}
- SmallVector<int, 8> Mask;
-
- for (unsigned i = 0; i < NumElts; ++i) {
+  // Another possible incompatibility arises from the vector element types. We
+ // can fix this by bitcasting the source vectors to the same type we intend
+ // for the shuffle.
+ for (auto &Src : Sources) {
+ EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
+ if (SrcEltTy == SmallestEltTy)
+ continue;
+ assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
+ Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+ Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
+ Src.WindowBase *= Src.WindowScale;
+ }
+
+ // Final sanity check before we try to actually produce a shuffle.
+ DEBUG(
+ for (auto Src : Sources)
+ assert(Src.ShuffleVec.getValueType() == ShuffleVT);
+ );
+
+  // The stars all align; our next step is to produce the mask for the shuffle.
+ SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
+ int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits();
+ for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
- if (Entry.getOpcode() == ISD::UNDEF) {
- Mask.push_back(-1);
+ if (Entry.getOpcode() == ISD::UNDEF)
continue;
- }
- SDValue ExtractVec = Entry.getOperand(0);
- int ExtractElt =
- cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
- if (ExtractVec == SourceVecs[0]) {
- Mask.push_back(ExtractElt * OffsetMultipliers[0] - VEXTOffsets[0]);
- } else {
- Mask.push_back(ExtractElt * OffsetMultipliers[1] + NumElts -
- VEXTOffsets[1]);
- }
+ auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0));
+ int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
+
+    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
+    // trunc. So only the low std::min(SrcBits, DestBits) bits actually get
+    // defined in this segment.
+ EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
+ int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
+ VT.getVectorElementType().getSizeInBits());
+ int LanesDefined = BitsDefined / BitsPerShuffleLane;
+
+ // This source is expected to fill ResMultiplier lanes of the final shuffle,
+ // starting at the appropriate offset.
+ int *LaneMask = &Mask[i * ResMultiplier];
+
+ int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
+ ExtractBase += NumElts * (Src - Sources.begin());
+ for (int j = 0; j < LanesDefined; ++j)
+ LaneMask[j] = ExtractBase + j;
}
// Final check before we try to produce nonsense...
- if (isShuffleMaskLegal(Mask, VT))
- return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
- &Mask[0]);
+ if (!isShuffleMaskLegal(Mask, ShuffleVT))
+ return SDValue();
- return SDValue();
+ SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
+ for (unsigned i = 0; i < Sources.size(); ++i)
+ ShuffleOps[i] = Sources[i].ShuffleVec;
+
+ SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
+ ShuffleOps[1], &Mask[0]);
+ return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
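A standalone sketch of the lane-mask arithmetic above, simplified to assume every BUILD_VECTOR operand defines the same number of lanes; the struct and function names are hypothetical, not LLVM API. Element EltNo of a source lands at lane EltNo * WindowScale + WindowBase, offset by the shuffle width for each preceding source:

#include <vector>

struct Window { int Base; int Scale; };   // per-source window
struct OpInfo { int SrcIdx; int EltNo; }; // per BUILD_VECTOR operand

static std::vector<int> buildShuffleMask(unsigned ShuffleLanes,
                                         unsigned ResMultiplier,
                                         int LanesDefined,
                                         const std::vector<OpInfo> &Ops,
                                         const std::vector<Window> &Win) {
  std::vector<int> Mask(ShuffleLanes, -1); // -1 marks an undef lane
  for (unsigned i = 0; i < Ops.size(); ++i) {
    int ExtractBase = Ops[i].EltNo * Win[Ops[i].SrcIdx].Scale +
                      Win[Ops[i].SrcIdx].Base +
                      int(ShuffleLanes) * Ops[i].SrcIdx;
    for (int j = 0; j < LanesDefined; ++j)
      Mask[i * ResMultiplier + j] = ExtractBase + j;
  }
  return Mask;
}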
// check if an EXT instruction can handle the shuffle mask when the
@@ -4607,7 +5038,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
VT.getVectorElementType() == MVT::f32)
return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
// vrev <4 x i16> -> REV32
- if (VT.getVectorElementType() == MVT::i16)
+ if (VT.getVectorElementType() == MVT::i16 ||
+ VT.getVectorElementType() == MVT::f16)
return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
// vrev <4 x i8> -> REV16
assert(VT.getVectorElementType() == MVT::i8);
@@ -4727,7 +5159,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
static unsigned getDUPLANEOp(EVT EltType) {
if (EltType == MVT::i8)
return AArch64ISD::DUPLANE8;
- if (EltType == MVT::i16)
+ if (EltType == MVT::i16 || EltType == MVT::f16)
return AArch64ISD::DUPLANE16;
if (EltType == MVT::i32 || EltType == MVT::f32)
return AArch64ISD::DUPLANE32;
@@ -4857,7 +5289,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
EVT ScalarVT = VT.getVectorElementType();
- if (ScalarVT.getSizeInBits() < 32)
+
+ if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
ScalarVT = MVT::i32;
return DAG.getNode(
@@ -4945,7 +5378,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -4954,7 +5387,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -4963,7 +5396,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(16, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -4972,7 +5405,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(24, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -4981,7 +5414,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -4990,7 +5423,7 @@ SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
@@ -5145,7 +5578,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5154,7 +5587,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5163,7 +5596,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(16, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5172,7 +5605,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(24, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5181,7 +5614,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5190,7 +5623,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
@@ -5263,13 +5696,13 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (VT.getSizeInBits() == 128) {
SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
DAG.getConstant(CnstVal, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// Support the V64 version via subregister insertion.
SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
DAG.getConstant(CnstVal, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
@@ -5278,7 +5711,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5287,7 +5720,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5296,7 +5729,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(16, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5305,7 +5738,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(24, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5314,7 +5747,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5323,7 +5756,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
@@ -5332,7 +5765,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(264, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
@@ -5341,7 +5774,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(272, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
@@ -5349,7 +5782,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// The few faces of FMOV...
@@ -5358,7 +5791,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
@@ -5366,7 +5799,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
DAG.getConstant(CnstVal, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// The many faces of MVNI...
@@ -5377,7 +5810,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
@@ -5386,7 +5819,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
@@ -5395,7 +5828,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(16, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
@@ -5404,7 +5837,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(24, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
@@ -5413,7 +5846,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(0, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
@@ -5422,7 +5855,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(8, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
@@ -5431,7 +5864,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(264, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
@@ -5440,7 +5873,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
DAG.getConstant(CnstVal, MVT::i32),
DAG.getConstant(272, MVT::i32));
- return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+ return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
@@ -5616,11 +6049,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
- VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+ VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+ VT == MVT::v8f16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
- VT != MVT::v1i64 && VT != MVT::v2f32)
+ VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
return SDValue();
// For V64 types, we perform insertion by expanding the value
@@ -5649,11 +6083,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
- VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+ VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
+ VT == MVT::v8f16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
- VT != MVT::v1i64 && VT != MVT::v2f32)
+ VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
return SDValue();
// For V64 types, we perform extraction by expanding the value
@@ -6187,7 +6622,7 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
Attribute::NoImplicitFloat) &&
(memOpAlign(SrcAlign, DstAlign, 16) ||
- (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
+ (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
return MVT::f128;
return Size >= 8 ? MVT::i64 : MVT::i32;
@@ -6382,6 +6817,48 @@ static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
return performIntegerAbsCombine(N, DAG);
}
+SDValue
+AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ std::vector<SDNode *> *Created) const {
+ // fold (sdiv X, pow2)
+ EVT VT = N->getValueType(0);
+ if ((VT != MVT::i32 && VT != MVT::i64) ||
+ !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ unsigned Lg2 = Divisor.countTrailingZeros();
+ SDValue Zero = DAG.getConstant(0, VT);
+ SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, VT);
+
+ // Add (N0 < 0) ? Pow2 - 1 : 0;
+ SDValue CCVal;
+ SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
+ SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
+
+ if (Created) {
+ Created->push_back(Cmp.getNode());
+ Created->push_back(Add.getNode());
+ Created->push_back(CSel.getNode());
+ }
+
+ // Divide by pow2.
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, MVT::i64));
+
+ // If we're dividing by a positive value, we're done. Otherwise, we must
+ // negate the result.
+ if (Divisor.isNonNegative())
+ return SRA;
+
+ if (Created)
+ Created->push_back(SRA.getNode());
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), SRA);
+}
+
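A scalar sketch of the sequence BuildSDIVPow2 emits (hypothetical helper, for illustration only): C-style signed division truncates toward zero, which the conditional add of Pow2 - 1 (the CSEL) followed by an arithmetic shift reproduces, with a final negation for negative divisors.

#include <cstdint>

static int64_t sdivPow2(int64_t X, unsigned Lg2, bool NegativeDivisor) {
  int64_t Bias = (int64_t(1) << Lg2) - 1; // Pow2MinusOne
  int64_t Sel = X < 0 ? X + Bias : X;     // CSEL between Add and N0
  int64_t Quot = Sel >> Lg2;              // SRA
  return NegativeDivisor ? -Quot : Quot;  // SUB 0, Quot when divisor < 0
}

For example, sdivPow2(-7, 1, false) == -3, matching -7 / 2 in C.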
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -6459,14 +6936,14 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
return SDValue();
- // Now check that the other operand of the AND is a constant splat. We could
+ // Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
if (BuildVectorSDNode *BV =
dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
- // Bail out if the vector isn't a constant splat.
- if (!BV->getConstantSplatNode())
+ // Bail out if the vector isn't a constant.
+ if (!BV->isConstant())
return SDValue();
// Everything checks out. Build up the new and improved node.
@@ -6486,7 +6963,8 @@ static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
return SDValue();
}
-static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
+ const AArch64Subtarget *Subtarget) {
// First try to optimize away the conversion when it's conditionally from
// a constant. Vectors only.
SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
@@ -6505,7 +6983,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
  // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
SDValue N0 = N->getOperand(0);
- if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+ if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
// Do not change the width of a volatile load.
!cast<LoadSDNode>(N0)->isVolatile()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
@@ -7266,11 +7744,11 @@ static SDValue performExtendCombine(SDNode *N,
// If the vector type isn't a simple VT, it's beyond the scope of what
// we're worried about here. Let legalization do its thing and hope for
// the best.
- if (!ResVT.isSimple())
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src->getValueType(0);
+ if (!ResVT.isSimple() || !SrcVT.isSimple())
return SDValue();
- SDValue Src = N->getOperand(0);
- MVT SrcVT = Src->getValueType(0).getSimpleVT();
// If the source VT is a 64-bit vector, we can play games and get the
// better results we want.
if (SrcVT.getSizeInBits() != 64)
@@ -7294,9 +7772,9 @@ static SDValue performExtendCombine(SDNode *N,
EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
LoVT.getVectorNumElements());
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
- DAG.getIntPtrConstant(0));
+ DAG.getConstant(0, MVT::i64));
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
- DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+ DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64));
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
@@ -7418,9 +7896,9 @@ static SDValue performSTORECombine(SDNode *N,
EVT HalfVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
- DAG.getIntPtrConstant(0));
+ DAG.getConstant(0, MVT::i64));
SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
- DAG.getIntPtrConstant(NumElts));
+ DAG.getConstant(NumElts, MVT::i64));
SDValue BasePtr = S->getBasePtr();
SDValue NewST1 =
DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
@@ -7504,7 +7982,7 @@ static SDValue performPostLD1Combine(SDNode *N,
Ops.push_back(Inc);
EVT Tys[3] = { VT, MVT::i64, MVT::Other };
- SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
+ SDVTList SDTys = DAG.getVTList(Tys);
unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
MemVT,
@@ -7634,7 +8112,7 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
Tys[n] = VecTy;
Tys[n++] = MVT::i64; // Type of write back register
Tys[n] = MVT::Other; // Type of the chain
- SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
+ SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
@@ -7655,10 +8133,272 @@ static SDValue performNEONPostLDSTCombine(SDNode *N,
return SDValue();
}
+// Checks whether the value has the prescribed width and returns information
+// about its extension mode.
+static
+bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
+ ExtType = ISD::NON_EXTLOAD;
+ switch(V.getNode()->getOpcode()) {
+ default:
+ return false;
+ case ISD::LOAD: {
+ LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
+ if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
+ || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
+ ExtType = LoadNode->getExtensionType();
+ return true;
+ }
+ return false;
+ }
+ case ISD::AssertSext: {
+ VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+ if ((TypeNode->getVT() == MVT::i8 && width == 8)
+ || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+ ExtType = ISD::SEXTLOAD;
+ return true;
+ }
+ return false;
+ }
+ case ISD::AssertZext: {
+ VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
+ if ((TypeNode->getVT() == MVT::i8 && width == 8)
+ || (TypeNode->getVT() == MVT::i16 && width == 16)) {
+ ExtType = ISD::ZEXTLOAD;
+ return true;
+ }
+ return false;
+ }
+ case ISD::Constant:
+ case ISD::TargetConstant: {
+ if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
+ 1LL << (width - 1))
+ return true;
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// This function does a whole lot of voodoo to determine if the tests are
+// equivalent without and with a mask. Essentially what happens is that given a
+// DAG resembling:
+//
+// +-------------+ +-------------+ +-------------+ +-------------+
+// | Input | | AddConstant | | CompConstant| | CC |
+// +-------------+ +-------------+ +-------------+ +-------------+
+// | | | |
+// V V | +----------+
+// +-------------+ +----+ | |
+// | ADD | |0xff| | |
+// +-------------+ +----+ | |
+// | | | |
+// V V | |
+// +-------------+ | |
+// | AND | | |
+// +-------------+ | |
+// | | |
+// +-----+ | |
+// | | |
+// V V V
+// +-------------+
+// | CMP |
+// +-------------+
+//
+// The AND node may be safely removed for some combinations of inputs. In
+// particular we need to take into account the extension type of the Input,
+// the exact values of AddConstant, CompConstant, and CC, along with the nominal
+// width of the input (this can work for inputs of any width; the above graph
+// is specific to 8 bits).
+//
+// The specific equations were worked out by generating output tables for each
+// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
+// problem was simplified by working with 4 bit inputs, which means we only
+// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
+// extension (8,15), 8 patterns unique to sign extension (-8,-1), and 8
+// patterns present in both extensions (0,7). For every distinct set of
+// AddConstant and CompConstant bit patterns we can consider the masked and
+// unmasked versions to be equivalent if the result of this function is true
+// for all 16 distinct bit patterns of the current extension type of Input (w0).
+//
+// sub w8, w0, w1
+// and w10, w8, #0x0f
+// cmp w8, w2
+// cset w9, AArch64CC
+// cmp w10, w2
+// cset w11, AArch64CC
+// cmp w9, w11
+// cset w0, eq
+// ret
+//
+// Since the above function shows when the outputs are equivalent, it defines
+// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
+// would be expensive to run during compiles. The equations below were written
+// in a test harness that confirmed they give outputs equivalent to the above
+// function for all inputs, so they can be used to determine whether the
+// removal is legal instead.
+//
+// isEquivalentMaskless() is the test for whether the AND can be removed,
+// factored out of the DAG recognition since the DAG can take several forms.
+
+static
+bool isEquivalentMaskless(unsigned CC, unsigned width,
+ ISD::LoadExtType ExtType, signed AddConstant,
+ signed CompConstant) {
+  // By being careful about our equations and only writing them in terms of
+  // symbolic values and well-known constants (0, 1, -1, MaxUInt), we can
+  // make them generally applicable to all bit widths.
+ signed MaxUInt = (1 << width);
+
+ // For the purposes of these comparisons sign extending the type is
+ // equivalent to zero extending the add and displacing it by half the integer
+ // width. Provided we are careful and make sure our equations are valid over
+ // the whole range we can just adjust the input and avoid writing equations
+ // for sign extended inputs.
+ if (ExtType == ISD::SEXTLOAD)
+ AddConstant -= (1 << (width-1));
+
+ switch(CC) {
+ case AArch64CC::LE:
+ case AArch64CC::GT: {
+ if ((AddConstant == 0) ||
+ (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
+ (AddConstant >= 0 && CompConstant < 0) ||
+ (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
+ return true;
+ } break;
+ case AArch64CC::LT:
+ case AArch64CC::GE: {
+ if ((AddConstant == 0) ||
+ (AddConstant >= 0 && CompConstant <= 0) ||
+ (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
+ return true;
+ } break;
+ case AArch64CC::HI:
+ case AArch64CC::LS: {
+ if ((AddConstant >= 0 && CompConstant < 0) ||
+ (AddConstant <= 0 && CompConstant >= -1 &&
+ CompConstant < AddConstant + MaxUInt))
+ return true;
+ } break;
+ case AArch64CC::PL:
+ case AArch64CC::MI: {
+ if ((AddConstant == 0) ||
+ (AddConstant > 0 && CompConstant <= 0) ||
+ (AddConstant < 0 && CompConstant <= AddConstant))
+ return true;
+ } break;
+ case AArch64CC::LO:
+ case AArch64CC::HS: {
+ if ((AddConstant >= 0 && CompConstant <= 0) ||
+ (AddConstant <= 0 && CompConstant >= 0 &&
+ CompConstant <= AddConstant + MaxUInt))
+ return true;
+ } break;
+ case AArch64CC::EQ:
+ case AArch64CC::NE: {
+ if ((AddConstant > 0 && CompConstant < 0) ||
+ (AddConstant < 0 && CompConstant >= 0 &&
+ CompConstant < AddConstant + MaxUInt) ||
+ (AddConstant >= 0 && CompConstant >= 0 &&
+ CompConstant >= AddConstant) ||
+ (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
+
+ return true;
+ } break;
+ case AArch64CC::VS:
+ case AArch64CC::VC:
+ case AArch64CC::AL:
+ case AArch64CC::NV:
+ return true;
+ case AArch64CC::Invalid:
+ break;
+ }
+
+ return false;
+}
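A compressed sketch of the 4-bit harness the comment above describes (hypothetical code; the real harness ran the AArch64 assembly shown earlier): for one condition, one extension type, and a given AddConstant/CompConstant pair, the AND is removable only if the masked and unmasked comparisons agree for all 16 input patterns.

#include <functional>

// Pred models one AArch64CC comparison applied to full-width values.
static bool agreesForAllInputs(const std::function<bool(int, int)> &Pred,
                               bool SignExtended, int AddConstant,
                               int CompConstant) {
  for (int i = 0; i < 16; ++i) {
    int W0 = SignExtended ? i - 8 : i; // the 16 4-bit input patterns
    bool Unmasked = Pred(W0 + AddConstant, CompConstant);
    bool Masked = Pred((W0 + AddConstant) & 0xf, CompConstant);
    if (Unmasked != Masked)
      return false;
  }
  return true;
}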
+
+static
+SDValue performCONDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG, unsigned CCIndex,
+ unsigned CmpIndex) {
+ unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
+ SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
+ unsigned CondOpcode = SubsNode->getOpcode();
+
+ if (CondOpcode != AArch64ISD::SUBS)
+ return SDValue();
+
+ // There is a SUBS feeding this condition. Is it fed by a mask we can
+ // use?
+
+ SDNode *AndNode = SubsNode->getOperand(0).getNode();
+ unsigned MaskBits = 0;
+
+ if (AndNode->getOpcode() != ISD::AND)
+ return SDValue();
+
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
+ uint32_t CNV = CN->getZExtValue();
+ if (CNV == 255)
+ MaskBits = 8;
+ else if (CNV == 65535)
+ MaskBits = 16;
+ }
+
+ if (!MaskBits)
+ return SDValue();
+
+ SDValue AddValue = AndNode->getOperand(0);
+
+ if (AddValue.getOpcode() != ISD::ADD)
+ return SDValue();
+
+  // The basic DAG structure is correct; grab the inputs and validate them.
+
+ SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
+ SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
+ SDValue SubsInputValue = SubsNode->getOperand(1);
+
+  // The mask is present and all of the values originate from a narrower type,
+  // so let's see whether the mask is superfluous.
+
+ if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
+ !isa<ConstantSDNode>(SubsInputValue.getNode()))
+ return SDValue();
+
+ ISD::LoadExtType ExtType;
+
+ if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
+ !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
+ !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
+ return SDValue();
+
+ if(!isEquivalentMaskless(CC, MaskBits, ExtType,
+ cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
+ cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
+ return SDValue();
+
+  // The AND is not necessary; remove it.
+
+ SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
+ SubsNode->getValueType(1));
+ SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
+
+ SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
+ DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
+
+ return SDValue(N, 0);
+}
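For illustration (hypothetical constants), with w8 fed by an i8 zero-extending load,

  (CSEL t, f, cc, (SUBS (AND (ADD w8, 1), 255), 3))

loses its mask and becomes

  (CSEL t, f, cc, (SUBS (ADD w8, 1), 3))

whenever isEquivalentMaskless(cc, 8, ZEXTLOAD, 1, 3) holds.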
+
// Optimize compare with zero and branch.
static SDValue performBRCONDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
+ SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3);
+ if (NV.getNode())
+ N = NV.getNode();
SDValue Chain = N->getOperand(0);
SDValue Dest = N->getOperand(1);
SDValue CCVal = N->getOperand(2);
@@ -7747,21 +8487,29 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
EVT ResVT = N->getValueType(0);
- if (!N->getOperand(1).getValueType().isVector())
+ if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
return SDValue();
- if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
+  // If NumMaskElts == 0, the compared type is wider than the select result. The
+  // largest real NEON comparison is 64 bits per lane, which means the result is
+  // at most 32 bits and an illegal vector. Just bail out for now.
+ EVT SrcVT = N0.getOperand(0).getValueType();
+
+ // Don't try to do this optimization when the setcc itself has i1 operands.
+ // There are no legal vectors of i1, so this would be pointless.
+ if (SrcVT == MVT::i1)
return SDValue();
- SDLoc DL(N0);
+ int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
+ if (!ResVT.isVector() || NumMaskElts == 0)
+ return SDValue();
- EVT SrcVT = N0.getOperand(0).getValueType();
- SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
- ResVT.getSizeInBits() / SrcVT.getSizeInBits());
+ SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
// First perform a vector comparison, where lane 0 is the one we're interested
// in.
+ SDLoc DL(N0);
SDValue LHS =
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
SDValue RHS =
@@ -7771,8 +8519,8 @@ static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
// Now duplicate the comparison mask we want across all other lanes.
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
- Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
- Mask);
+ Mask = DAG.getNode(ISD::BITCAST, DL,
+ ResVT.changeVectorElementTypeToInteger(), Mask);
return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
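A minimal sketch of the semantics this combine rewrites (assumed four lanes): the scalar setcc is evaluated once, splatted across a mask vector, and then drives a lane-wise select.

#include <array>
#include <cstddef>

template <typename T, std::size_t N>
static std::array<T, N> selectWholeVectors(bool Cmp, const std::array<T, N> &X,
                                           const std::array<T, N> &Y) {
  std::array<T, N> R{};
  for (std::size_t i = 0; i < N; ++i)
    R[i] = Cmp ? X[i] : Y[i]; // Cmp stands in for the splatted mask lane
  return R;
}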
@@ -7792,7 +8540,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performMulCombine(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
- return performIntToFpCombine(N, DAG);
+ return performIntToFpCombine(N, DAG, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
case ISD::INTRINSIC_WO_CHAIN:
@@ -7813,6 +8561,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performSTORECombine(N, DCI, DAG, Subtarget);
case AArch64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);
+ case AArch64ISD::CSEL:
+ return performCONDCombine(N, DCI, DAG, 2, 3);
case AArch64ISD::DUP:
return performPostLD1Combine(N, DCI, false);
case ISD::INSERT_VECTOR_ELT:
@@ -7968,13 +8718,12 @@ bool AArch64TargetLowering::getPostIndexedAddressParts(
static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) {
- if (N->getValueType(0) != MVT::i16)
- return;
-
SDLoc DL(N);
SDValue Op = N->getOperand(0);
- assert(Op.getValueType() == MVT::f16 &&
- "Inconsistent bitcast? Only 16-bit types should be i16 or f16");
+
+ if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
+ return;
+
Op = SDValue(
DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
DAG.getUNDEF(MVT::i32), Op,
@@ -8000,17 +8749,14 @@ void AArch64TargetLowering::ReplaceNodeResults(
}
}
-bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
- // Loads and stores less than 128-bits are already atomic; ones above that
- // are doomed anyway, so defer to the default libcall and blame the OS when
- // things go wrong:
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
- return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128;
- else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
- return LI->getType()->getPrimitiveSizeInBits() == 128;
+bool AArch64TargetLowering::useLoadStackGuardNode() const {
+ return true;
+}
- // For the real atomic operations, we have ldxr/stxr up to 128 bits.
- return Inst->getType()->getPrimitiveSizeInBits() <= 128;
+bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
+ // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+ // reciprocal if there are three or more FDIVs.
+ return NumUsers > 2;
}
TargetLoweringBase::LegalizeTypeAction
@@ -8025,12 +8771,37 @@ AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+// Loads and stores less than 128 bits are already atomic; ones above that
+// are doomed anyway, so defer to the default libcall and blame the OS when
+// things go wrong.
+bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
+ unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
+ return Size == 128;
+}
+
+// Loads and stores less than 128 bits are already atomic; ones above that
+// are doomed anyway, so defer to the default libcall and blame the OS when
+// things go wrong.
+bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
+ unsigned Size = LI->getType()->getPrimitiveSizeInBits();
+ return Size == 128;
+}
+
+// For the real atomic operations, we have ldxr/stxr up to 128 bits.
+bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
+ unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+ return Size <= 128;
+}
+
+bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const {
+ return true;
+}
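Taken together, the three hooks above implement the policy sketched below (hypothetical free function, not an LLVM interface):

enum class AtomicKind { Load, Store, RMW };

// Whether an atomic operation of the given width is expanded to an
// ldxr/stxr loop in IR: RMW operations use exclusives up to 128 bits,
// while plain loads and stores below 128 bits are already atomic.
static bool expandsToLLSC(AtomicKind K, unsigned Bits) {
  if (K == AtomicKind::RMW)
    return Bits <= 128;
  return Bits == 128;
}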
+
Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
- bool IsAcquire =
- Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+ bool IsAcquire = isAtLeastAcquire(Ord);
// Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
// intrinsic must return {i64, i64} and we have to recombine them into a
@@ -8065,8 +8836,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Value *Val, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- bool IsRelease =
- Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+ bool IsRelease = isAtLeastRelease(Ord);
// Since the intrinsics must have legal type, the i128 intrinsics take two
// parameters: "i64, i64". We must marshal Val into the appropriate form
@@ -8093,3 +8863,8 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Val, Stxr->getFunctionType()->getParamType(0)),
Addr);
}
+
+bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
+ Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
+ return Ty->isArrayTy();
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index cb0b9ef..cc25bed 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AArch64_ISELLOWERING_H
-#define LLVM_TARGET_AArch64_ISELLOWERING_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
@@ -162,6 +162,16 @@ enum {
SITOF,
UITOF,
+  /// Natural vector cast. ISD::BITCAST is not natural in the big-endian
+  /// world w.r.t. vectors, which causes additional REV instructions to be
+  /// generated to compensate for the byte-swapping. But sometimes we do
+  /// need to re-interpret the data in SIMD vector registers in big-endian
+  /// mode without emitting such REV instructions.
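+  /// (On little-endian subtargets NVCAST should behave exactly like a
+  /// BITCAST and is expected to fold to a plain register reinterpretation;
+  /// the distinction only matters for big-endian lowering.)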
+ NVCAST,
+
+ SMULL,
+ UMULL,
+
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
LD3post,
@@ -197,10 +207,9 @@ class AArch64TargetLowering : public TargetLowering {
bool RequireStrictAlign;
public:
- explicit AArch64TargetLowering(TargetMachine &TM);
+ explicit AArch64TargetLowering(const TargetMachine &TM);
- /// Selects the correct CCAssignFn for a the given CallingConvention
- /// value.
+ /// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
@@ -212,10 +221,11 @@ public:
MVT getScalarShiftAmountTy(EVT LHSTy) const override;
- /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+ /// allowsMisalignedMemoryAccesses - Returns true if the target allows
  /// unaligned memory accesses of the specified type.
- bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
- bool *Fast = nullptr) const override {
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
+ unsigned Align = 1,
+ bool *Fast = nullptr) const override {
if (RequireStrictAlign)
return false;
// FIXME: True for Cyclone, but not necessary others.
@@ -317,13 +327,17 @@ public:
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
+ bool hasLoadLinkedStoreConditional() const override;
Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const override;
Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Value *Addr, AtomicOrdering Ord) const override;
- bool shouldExpandAtomicInIR(Instruction *Inst) const override;
+ bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
+ bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+ bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+ bool useLoadStackGuardNode() const override;
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(EVT VT) const override;
@@ -424,6 +438,10 @@ private:
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+ std::vector<SDNode *> *Created) const override;
+ bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
+
ConstraintType
getConstraintType(const std::string &Constraint) const override;
unsigned getRegisterByName(const char* RegName, EVT VT) const override;
@@ -455,6 +473,10 @@ private:
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
+
+ bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
+ CallingConv::ID CallConv,
+ bool isVarArg) const override;
};
namespace AArch64 {
@@ -464,4 +486,4 @@ FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
} // end namespace llvm
-#endif // LLVM_TARGET_AArch64_ISELLOWERING_H
+#endif
diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td
index 3b9e3c6..4923a11 100644
--- a/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -29,8 +29,7 @@ def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
class acquiring_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- assert(Ordering != AcquireRelease && "unexpected load ordering");
- return Ordering == Acquire || Ordering == SequentiallyConsistent;
+ return isAtLeastAcquire(Ordering);
}]>;
// An atomic load operation that does not need either acquire or release
@@ -38,7 +37,7 @@ class acquiring_load<PatFrag base>
class relaxed_load<PatFrag base>
: PatFrag<(ops node:$ptr), (base node:$ptr), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return Ordering == Monotonic || Ordering == Unordered;
+ return !isAtLeastAcquire(Ordering);
}]>;
// 8-bit loads
@@ -114,14 +113,14 @@ class releasing_store<PatFrag base>
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
assert(Ordering != AcquireRelease && "unexpected store ordering");
- return Ordering == Release || Ordering == SequentiallyConsistent;
+ return isAtLeastRelease(Ordering);
}]>;
// An atomic store operation that doesn't actually need to be atomic on AArch64.
class relaxed_store<PatFrag base>
: PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
- return Ordering == Monotonic || Ordering == Unordered;
+ return !isAtLeastRelease(Ordering);
}]>;
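For reference, a sketch of the ordering predicates these PatFrags now rely on; the enumerator values and bodies below mirror my reading of the 3.6-era headers and should be treated as an approximation, not the verbatim LLVM definitions:

enum AtomicOrdering {
  NotAtomic = 0, Unordered = 1, Monotonic = 2, /* 3 unused */
  Acquire = 4, Release = 5, AcquireRelease = 6, SequentiallyConsistent = 7
};

// Orderings at least as strong as acquire / release, respectively.
static bool isAtLeastAcquire(AtomicOrdering Ord) {
  return Ord == Acquire || Ord == AcquireRelease ||
         Ord == SequentiallyConsistent;
}
static bool isAtLeastRelease(AtomicOrdering Ord) {
  return Ord == Release || Ord == AcquireRelease ||
         Ord == SequentiallyConsistent;
}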
// 8-bit stores
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index e88c0c0..d295c02 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -843,7 +843,7 @@ def MRSSystemRegisterOperand : AsmOperandClass {
let ParserMethod = "tryParseSysReg";
let DiagnosticType = "MRS";
}
-// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate.
+// concatenation of op0, op1, CRn, CRm, op2. 16-bit immediate.
def mrs_sysreg_op : Operand<i32> {
let ParserMatchClass = MRSSystemRegisterOperand;
let DecoderMethod = "DecodeMRSSystemRegister";
@@ -863,9 +863,8 @@ def msr_sysreg_op : Operand<i32> {
class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
"mrs", "\t$Rt, $systemreg"> {
- bits<15> systemreg;
- let Inst{20} = 1;
- let Inst{19-5} = systemreg;
+ bits<16> systemreg;
+ let Inst{20-5} = systemreg;
}
// FIXME: Some of these def NZCV, others don't. Best way to model that?
@@ -873,9 +872,8 @@ class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
// would do it, but feels like overkill at this point.
class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt),
"msr", "\t$systemreg, $Rt"> {
- bits<15> systemreg;
- let Inst{20} = 1;
- let Inst{19-5} = systemreg;
+ bits<16> systemreg;
+ let Inst{20-5} = systemreg;
}
def SystemPStateFieldOperand : AsmOperandClass {
@@ -1351,14 +1349,15 @@ class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
}
multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
+ // MADD/MSUB generation is decided by MachineCombiner.cpp
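+  // (Leaving the patterns below disabled keeps mul and add/sub as separate
+  // nodes until the MachineCombiner runs, so MADD/MSUB formation can be
+  // weighed against critical-path length per subtarget.)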
def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
- [(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>,
+ [/*(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))*/]>,
Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 0;
}
def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
- [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>,
+ [/*(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))*/]>,
Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> {
let Inst{31} = 1;
}
@@ -1636,7 +1635,7 @@ class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
multiclass AddSub<bit isSub, string mnemonic,
SDPatternOperator OpNode = null_frag> {
- let hasSideEffects = 0 in {
+ let hasSideEffects = 0, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
// Add/Subtract immediate
def Wri : BaseAddSubImm<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32,
mnemonic, OpNode> {
@@ -1961,14 +1960,14 @@ class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
string Alias> {
- let AddedComplexity = 6 in
+ let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
[(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
logical_imm32:$imm))]> {
let Inst{31} = 0;
let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
}
- let AddedComplexity = 6 in
+ let AddedComplexity = 6, isReMaterializable = 1, isAsCheapAsAMove = 1 in
def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
[(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
logical_imm64:$imm))]> {
@@ -2013,8 +2012,10 @@ class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
// Split from LogicalImm as not all instructions have both.
multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
SDPatternOperator OpNode> {
+ let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+ }
def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
[(set GPR32:$Rd, (OpNode GPR32:$Rn,
@@ -2995,7 +2996,7 @@ class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
: BaseLoadStorePreIdx<sz, V, opc,
(outs GPR64sp:$wback, regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset), asm,
- "$Rn = $wback", []>,
+ "$Rn = $wback,@earlyclobber $wback", []>,
Sched<[WriteLD, WriteAdr]>;
let mayStore = 1, mayLoad = 0 in
@@ -3004,7 +3005,7 @@ class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
: BaseLoadStorePreIdx<sz, V, opc,
(outs GPR64sp:$wback),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
- asm, "$Rn = $wback",
+ asm, "$Rn = $wback,@earlyclobber $wback",
[(set GPR64sp:$wback,
(storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
Sched<[WriteAdr, WriteST]>;
@@ -3014,7 +3015,6 @@ class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
// Load/store post-indexed
//---
-// (pre-index) load/stores.
class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
string asm, string cstr, list<dag> pat>
: I<oops, iops, asm, "\t$Rt, [$Rn], $offset", cstr, pat> {
@@ -3042,7 +3042,7 @@ class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
: BaseLoadStorePostIdx<sz, V, opc,
(outs GPR64sp:$wback, regtype:$Rt),
(ins GPR64sp:$Rn, simm9:$offset),
- asm, "$Rn = $wback", []>,
+ asm, "$Rn = $wback,@earlyclobber $wback", []>,
Sched<[WriteLD, WriteI]>;
let mayStore = 1, mayLoad = 0 in
@@ -3051,7 +3051,7 @@ class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
: BaseLoadStorePostIdx<sz, V, opc,
(outs GPR64sp:$wback),
(ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
- asm, "$Rn = $wback",
+ asm, "$Rn = $wback,@earlyclobber $wback",
[(set GPR64sp:$wback,
(storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
Sched<[WriteAdr, WriteST, ReadAdrBase]>;
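The @earlyclobber marker added to these writeback formats keeps the register allocator from assigning $wback to any other operand of the instruction; a pre/post-indexed load or store whose transfer register equals the base register (for example "ldr x0, [x0], #8") is unpredictable in the architecture, so the overlap has to be ruled out at allocation time.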
@@ -3115,7 +3115,7 @@ multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
// (pre-indexed)
class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
string asm>
- : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback", []> {
+ : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback,@earlyclobber $wback", []> {
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
@@ -3156,7 +3156,7 @@ class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
string asm>
- : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback", []> {
+ : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback,@earlyclobber $wback", []> {
bits<5> Rt;
bits<5> Rt2;
bits<5> Rn;
@@ -4383,7 +4383,7 @@ class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
}
multiclass SIMDVectorLShiftLongBySizeBHS {
- let neverHasSideEffects = 1 in {
+ let hasSideEffects = 0 in {
def v8i8 : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64,
"shll", ".8h", ".8b", "8">;
def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128,
@@ -5260,6 +5260,10 @@ multiclass SIMDZipVector<bits<3>opc, string asm,
def v2i64 : BaseSIMDZipVector<0b111, opc, V128,
asm, ".2d", OpNode, v2i64>;
+ def : Pat<(v4f16 (OpNode V64:$Rn, V64:$Rm)),
+ (!cast<Instruction>(NAME#"v4i16") V64:$Rn, V64:$Rm)>;
+ def : Pat<(v8f16 (OpNode V128:$Rn, V128:$Rm)),
+ (!cast<Instruction>(NAME#"v8i16") V128:$Rn, V128:$Rm)>;
def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
(!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index ce85b2c..e582ed4 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "AArch64MachineCombinerPattern.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -260,8 +261,9 @@ void AArch64InstrInfo::instantiateCondBranch(
BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
} else {
// Folded compare-and-branch
+ // Note that we use addOperand instead of addReg to keep the flags.
const MachineInstrBuilder MIB =
- BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg());
+ BuildMI(&MBB, DL, get(Cond[1].getImm())).addOperand(Cond[2]);
if (Cond.size() > 3)
MIB.addImm(Cond[3].getImm());
MIB.addMBB(TBB);
@@ -541,6 +543,51 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
CC);
}
+// FIXME: this implementation should be micro-architecture dependent, so a
+// micro-architecture target hook should be introduced here in future.
+bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
+ if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53())
+ return MI->isAsCheapAsAMove();
+
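+  // A sketch of the rationale (inferred from the FIXME above, not spelled
+  // out in the patch): on Cortex-A53/A57 the ALU operations below are
+  // assumed to be essentially as cheap as a register move, so rematerializing
+  // them can beat spilling and reloading their result.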
+ switch (MI->getOpcode()) {
+ default:
+ return false;
+
+  // add/sub with an immediate and no shift applied to it
+ case AArch64::ADDWri:
+ case AArch64::ADDXri:
+ case AArch64::SUBWri:
+ case AArch64::SUBXri:
+ return (MI->getOperand(3).getImm() == 0);
+
+ // logical ops on immediate
+ case AArch64::ANDWri:
+ case AArch64::ANDXri:
+ case AArch64::EORWri:
+ case AArch64::EORXri:
+ case AArch64::ORRWri:
+ case AArch64::ORRXri:
+ return true;
+
+ // logical ops on register without shift
+ case AArch64::ANDWrr:
+ case AArch64::ANDXrr:
+ case AArch64::BICWrr:
+ case AArch64::BICXrr:
+ case AArch64::EONWrr:
+ case AArch64::EONXrr:
+ case AArch64::EORWrr:
+ case AArch64::EORXrr:
+ case AArch64::ORNWrr:
+ case AArch64::ORNXrr:
+ case AArch64::ORRWrr:
+ case AArch64::ORRXrr:
+ return true;
+ }
+
+ llvm_unreachable("Unknown opcode to check as cheap as a move!");
+}
+
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const {
@@ -561,6 +608,42 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
}
}
+bool
+AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
+ MachineInstr *MIb,
+ AliasAnalysis *AA) const {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ unsigned BaseRegA = 0, BaseRegB = 0;
+ int OffsetA = 0, OffsetB = 0;
+ int WidthA = 0, WidthB = 0;
+
+ assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
+ "MIa must be a store or a load");
+ assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
+ "MIb must be a store or a load");
+
+ if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() ||
+ MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+ return false;
+
+ // Retrieve the base register, offset from the base register and width. Width
+ // is the size of memory that is being loaded/stored (e.g. 1, 2, 4, 8). If
+ // base registers are identical, and the offset of a lower memory access +
+ // the width doesn't overlap the offset of a higher memory access,
+ // then the memory accesses are different.
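+  // For example (illustrative values, not from the patch): accesses at
+  // (Offset=0, Width=8) and (Offset=8, Width=4) off the same base satisfy
+  // 0 + 8 <= 8 and cannot overlap, while (Offset=0, Width=16) against
+  // (Offset=8, Width=8) fails the check because 0 + 16 > 8.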
+ if (getLdStBaseRegImmOfsWidth(MIa, BaseRegA, OffsetA, WidthA, TRI) &&
+ getLdStBaseRegImmOfsWidth(MIb, BaseRegB, OffsetB, WidthB, TRI)) {
+ if (BaseRegA == BaseRegB) {
+ int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
+ int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
+ int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
+ if (LowOffset + LowWidth <= HighOffset)
+ return true;
+ }
+ }
+ return false;
+}
+
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
@@ -595,7 +678,8 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
SrcReg = MI->getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
- CmpValue = MI->getOperand(2).getImm();
+  // FIXME: CmpValue is only ever compared against zero in
+  // optimizeCompareInstr, so normalize it to 0 or 1 here.
+ CmpValue = (MI->getOperand(2).getImm() != 0);
return true;
case AArch64::ANDSWri:
case AArch64::ANDSXri:
@@ -604,9 +688,14 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
SrcReg = MI->getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
- CmpValue = AArch64_AM::decodeLogicalImmediate(
- MI->getOperand(2).getImm(),
- MI->getOpcode() == AArch64::ANDSWri ? 32 : 64);
+      // FIXME: The return value type of decodeLogicalImmediate is uint64_t,
+      // while the type of CmpValue is int, so converting the decoded value
+      // directly would lose the high 32 bits; this caused a miscompile of
+      // spec2006-483.xalancbmk. CmpValue is only used to compare with zero
+      // in optimizeCompareInstr, so normalize it to 0 or 1 here as well.
+ CmpValue = (AArch64_AM::decodeLogicalImmediate(
+ MI->getOperand(2).getImm(),
+ MI->getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0);
return true;
}
@@ -619,8 +708,8 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) {
MachineFunction *MF = MBB->getParent();
assert(MF && "Can't get MachineFunction here");
const TargetMachine *TM = &MF->getTarget();
- const TargetInstrInfo *TII = TM->getInstrInfo();
- const TargetRegisterInfo *TRI = TM->getRegisterInfo();
+ const TargetInstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
+ const TargetRegisterInfo *TRI = TM->getSubtargetImpl()->getRegisterInfo();
MachineRegisterInfo *MRI = &MF->getRegInfo();
for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx;
@@ -652,6 +741,87 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) {
return true;
}
+/// \brief Return the opcode that does not set flags when possible - otherwise
+/// return the original opcode. The caller is responsible for the actual
+/// substitution and legality checking.
+static unsigned convertFlagSettingOpcode(const MachineInstr *MI) {
+ // Don't convert all compare instructions, because for some the zero register
+ // encoding becomes the sp register.
+ bool MIDefinesZeroReg = false;
+ if (MI->definesRegister(AArch64::WZR) || MI->definesRegister(AArch64::XZR))
+ MIDefinesZeroReg = true;
+
+ switch (MI->getOpcode()) {
+ default:
+ return MI->getOpcode();
+ case AArch64::ADDSWrr:
+ return AArch64::ADDWrr;
+ case AArch64::ADDSWri:
+ return MIDefinesZeroReg ? AArch64::ADDSWri : AArch64::ADDWri;
+ case AArch64::ADDSWrs:
+ return MIDefinesZeroReg ? AArch64::ADDSWrs : AArch64::ADDWrs;
+ case AArch64::ADDSWrx:
+ return AArch64::ADDWrx;
+ case AArch64::ADDSXrr:
+ return AArch64::ADDXrr;
+ case AArch64::ADDSXri:
+ return MIDefinesZeroReg ? AArch64::ADDSXri : AArch64::ADDXri;
+ case AArch64::ADDSXrs:
+ return MIDefinesZeroReg ? AArch64::ADDSXrs : AArch64::ADDXrs;
+ case AArch64::ADDSXrx:
+ return AArch64::ADDXrx;
+ case AArch64::SUBSWrr:
+ return AArch64::SUBWrr;
+ case AArch64::SUBSWri:
+ return MIDefinesZeroReg ? AArch64::SUBSWri : AArch64::SUBWri;
+ case AArch64::SUBSWrs:
+ return MIDefinesZeroReg ? AArch64::SUBSWrs : AArch64::SUBWrs;
+ case AArch64::SUBSWrx:
+ return AArch64::SUBWrx;
+ case AArch64::SUBSXrr:
+ return AArch64::SUBXrr;
+ case AArch64::SUBSXri:
+ return MIDefinesZeroReg ? AArch64::SUBSXri : AArch64::SUBXri;
+ case AArch64::SUBSXrs:
+ return MIDefinesZeroReg ? AArch64::SUBSXrs : AArch64::SUBXrs;
+ case AArch64::SUBSXrx:
+ return AArch64::SUBXrx;
+ }
+}
+
+/// True when the condition code could be modified on the instruction
+/// trace starting at \p From and ending at \p To.
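+/// When \p CheckOnlyCCWrites is true, instructions that merely read NZCV on
+/// the trace are tolerated and only writes of NZCV block the transformation;
+/// otherwise any read or write of NZCV does.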
+static bool modifiesConditionCode(MachineInstr *From, MachineInstr *To,
+ const bool CheckOnlyCCWrites,
+ const TargetRegisterInfo *TRI) {
+  // We iterate backward starting at \p To until we hit \p From.
+ MachineBasicBlock::iterator I = To, E = From, B = To->getParent()->begin();
+
+ // Early exit if To is at the beginning of the BB.
+ if (I == B)
+ return true;
+
+ // Check whether the definition of SrcReg is in the same basic block as
+ // Compare. If not, assume the condition code gets modified on some path.
+ if (To->getParent() != From->getParent())
+ return true;
+
+ // Check that NZCV isn't set on the trace.
+ for (--I; I != E; --I) {
+ const MachineInstr &Instr = *I;
+
+ if (Instr.modifiesRegister(AArch64::NZCV, TRI) ||
+ (!CheckOnlyCCWrites && Instr.readsRegister(AArch64::NZCV, TRI)))
+ // This instruction modifies or uses NZCV after the one we want to
+ // change.
+ return true;
+ if (I == B)
+ // We currently don't allow the instruction trace to cross basic
+ // block boundaries
+ return true;
+ }
+ return false;
+}
/// optimizeCompareInstr - Convert the instruction supplying the argument to the
/// comparison into one that sets the zero bit in the flags register.
bool AArch64InstrInfo::optimizeCompareInstr(
@@ -661,28 +831,15 @@ bool AArch64InstrInfo::optimizeCompareInstr(
// Replace SUBSWrr with SUBWrr if NZCV is not used.
int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true);
if (Cmp_NZCV != -1) {
- unsigned NewOpc;
- switch (CmpInstr->getOpcode()) {
- default:
- return false;
- case AArch64::ADDSWrr: NewOpc = AArch64::ADDWrr; break;
- case AArch64::ADDSWri: NewOpc = AArch64::ADDWri; break;
- case AArch64::ADDSWrs: NewOpc = AArch64::ADDWrs; break;
- case AArch64::ADDSWrx: NewOpc = AArch64::ADDWrx; break;
- case AArch64::ADDSXrr: NewOpc = AArch64::ADDXrr; break;
- case AArch64::ADDSXri: NewOpc = AArch64::ADDXri; break;
- case AArch64::ADDSXrs: NewOpc = AArch64::ADDXrs; break;
- case AArch64::ADDSXrx: NewOpc = AArch64::ADDXrx; break;
- case AArch64::SUBSWrr: NewOpc = AArch64::SUBWrr; break;
- case AArch64::SUBSWri: NewOpc = AArch64::SUBWri; break;
- case AArch64::SUBSWrs: NewOpc = AArch64::SUBWrs; break;
- case AArch64::SUBSWrx: NewOpc = AArch64::SUBWrx; break;
- case AArch64::SUBSXrr: NewOpc = AArch64::SUBXrr; break;
- case AArch64::SUBSXri: NewOpc = AArch64::SUBXri; break;
- case AArch64::SUBSXrs: NewOpc = AArch64::SUBXrs; break;
- case AArch64::SUBSXrx: NewOpc = AArch64::SUBXrx; break;
+ if (CmpInstr->definesRegister(AArch64::WZR) ||
+ CmpInstr->definesRegister(AArch64::XZR)) {
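+      // The flag-setting op writes only the zero register here, so with a
+      // dead NZCV def it has no observable effect and can simply be erased
+      // (e.g. "cmp w0, #0" is "subs wzr, w0, #0").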
+ CmpInstr->eraseFromParent();
+ return true;
}
-
+ unsigned Opc = CmpInstr->getOpcode();
+ unsigned NewOpc = convertFlagSettingOpcode(CmpInstr);
+ if (NewOpc == Opc)
+ return false;
const MCInstrDesc &MCID = get(NewOpc);
CmpInstr->setDesc(MCID);
CmpInstr->RemoveOperand(Cmp_NZCV);
@@ -693,6 +850,9 @@ bool AArch64InstrInfo::optimizeCompareInstr(
}
// Continue only if we have a "ri" where immediate is zero.
+  // FIXME: CmpValue has already been converted to 0 or 1 in
+  // analyzeCompare.
+ assert((CmpValue == 0 || CmpValue == 1) && "CmpValue must be 0 or 1!");
if (CmpValue != 0 || SrcReg2 != 0)
return false;
@@ -705,36 +865,10 @@ bool AArch64InstrInfo::optimizeCompareInstr(
if (!MI)
return false;
- // We iterate backward, starting from the instruction before CmpInstr and
- // stop when reaching the definition of the source register or done with the
- // basic block, to check whether NZCV is used or modified in between.
- MachineBasicBlock::iterator I = CmpInstr, E = MI,
- B = CmpInstr->getParent()->begin();
-
- // Early exit if CmpInstr is at the beginning of the BB.
- if (I == B)
- return false;
-
- // Check whether the definition of SrcReg is in the same basic block as
- // Compare. If not, we can't optimize away the Compare.
- if (MI->getParent() != CmpInstr->getParent())
- return false;
-
- // Check that NZCV isn't set between the comparison instruction and the one we
- // want to change.
+ bool CheckOnlyCCWrites = false;
const TargetRegisterInfo *TRI = &getRegisterInfo();
- for (--I; I != E; --I) {
- const MachineInstr &Instr = *I;
-
- if (Instr.modifiesRegister(AArch64::NZCV, TRI) ||
- Instr.readsRegister(AArch64::NZCV, TRI))
- // This instruction modifies or uses NZCV after the one we want to
- // change. We can't do this transformation.
- return false;
- if (I == B)
- // The 'and' is below the comparison instruction.
- return false;
- }
+ if (modifiesConditionCode(MI, CmpInstr, CheckOnlyCCWrites, TRI))
+ return false;
unsigned NewOpc = MI->getOpcode();
switch (MI->getOpcode()) {
@@ -848,6 +982,56 @@ bool AArch64InstrInfo::optimizeCompareInstr(
return true;
}
+bool
+AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+ if (MI->getOpcode() != TargetOpcode::LOAD_STACK_GUARD)
+ return false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ unsigned Reg = MI->getOperand(0).getReg();
+ const GlobalValue *GV =
+ cast<GlobalValue>((*MI->memoperands_begin())->getValue());
+ const TargetMachine &TM = MBB.getParent()->getTarget();
+ unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM);
+ const unsigned char MO_NC = AArch64II::MO_NC;
+
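+  // The guard pointer is materialized in one of three ways (a sketch; the
+  // guard global is typically __stack_chk_guard, though that name is not
+  // part of this code):
+  //   GOT:         ldr Reg, :got:guard            ; ldr Reg, [Reg]
+  //   large model: movz/movk Reg, #:abs_gN:guard (x4) ; ldr Reg, [Reg]
+  //   otherwise:   adrp Reg, guard                ; ldr Reg, [Reg, :lo12:guard]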
+ if ((OpFlags & AArch64II::MO_GOT) != 0) {
+ BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_GOT);
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+ .addReg(Reg, RegState::Kill).addImm(0)
+ .addMemOperand(*MI->memoperands_begin());
+ } else if (TM.getCodeModel() == CodeModel::Large) {
+ BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
+ .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48);
+ BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32);
+ BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16);
+ BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0);
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+ .addReg(Reg, RegState::Kill).addImm(0)
+ .addMemOperand(*MI->memoperands_begin());
+ } else {
+ BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg)
+ .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
+ unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | MO_NC;
+ BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
+ .addReg(Reg, RegState::Kill)
+ .addGlobalAddress(GV, 0, LoFlags)
+ .addMemOperand(*MI->memoperands_begin());
+ }
+
+ MBB.erase(MI);
+
+ return true;
+}
+
/// Return true if this instruction has a non-zero immediate
bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
switch (MI->getOpcode()) {
@@ -963,12 +1147,14 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const {
MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands");
return true;
}
+ break;
case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
if (MI->getOperand(2).getImm() == 0) {
assert(MI->getDesc().getNumOperands() == 4 &&
MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands");
return true;
}
+ break;
}
return false;
}
@@ -991,6 +1177,7 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const {
"invalid ORRv16i8 operands");
return true;
}
+ break;
}
return false;
}
@@ -1152,6 +1339,102 @@ AArch64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
};
}
+bool AArch64InstrInfo::getLdStBaseRegImmOfsWidth(
+ MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width,
+ const TargetRegisterInfo *TRI) const {
+ // Handle only loads/stores with base register followed by immediate offset.
+ if (LdSt->getNumOperands() != 3)
+ return false;
+ if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm())
+ return false;
+
+  // Offset is calculated as the immediate operand multiplied by the
+  // scaling factor. Unscaled instructions have a scaling factor of 1.
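+  // E.g. "ldr x0, [x1, #16]" (LDRXui) encodes immediate 2 with
+  // Scale = Width = 8, so Offset = 2 * 8 = 16 bytes, while
+  // "ldur x0, [x1, #2]" (LDURXi) has Scale = 1 and Offset = 2.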
+ int Scale = 0;
+ switch (LdSt->getOpcode()) {
+ default:
+ return false;
+ case AArch64::LDURQi:
+ case AArch64::STURQi:
+ Width = 16;
+ Scale = 1;
+ break;
+ case AArch64::LDURXi:
+ case AArch64::LDURDi:
+ case AArch64::STURXi:
+ case AArch64::STURDi:
+ Width = 8;
+ Scale = 1;
+ break;
+ case AArch64::LDURWi:
+ case AArch64::LDURSi:
+ case AArch64::LDURSWi:
+ case AArch64::STURWi:
+ case AArch64::STURSi:
+ Width = 4;
+ Scale = 1;
+ break;
+ case AArch64::LDURHi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSHWi:
+ case AArch64::STURHi:
+ case AArch64::STURHHi:
+ Width = 2;
+ Scale = 1;
+ break;
+ case AArch64::LDURBi:
+ case AArch64::LDURBBi:
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSBWi:
+ case AArch64::STURBi:
+ case AArch64::STURBBi:
+ Width = 1;
+ Scale = 1;
+ break;
+ case AArch64::LDRXui:
+ case AArch64::STRXui:
+ Scale = Width = 8;
+ break;
+ case AArch64::LDRWui:
+ case AArch64::STRWui:
+ Scale = Width = 4;
+ break;
+ case AArch64::LDRBui:
+ case AArch64::STRBui:
+ Scale = Width = 1;
+ break;
+ case AArch64::LDRHui:
+ case AArch64::STRHui:
+ Scale = Width = 2;
+ break;
+ case AArch64::LDRSui:
+ case AArch64::STRSui:
+ Scale = Width = 4;
+ break;
+ case AArch64::LDRDui:
+ case AArch64::STRDui:
+ Scale = Width = 8;
+ break;
+ case AArch64::LDRQui:
+ case AArch64::STRQui:
+ Scale = Width = 16;
+ break;
+ case AArch64::LDRBBui:
+ case AArch64::STRBBui:
+ Scale = Width = 1;
+ break;
+ case AArch64::LDRHHui:
+ case AArch64::STRHHui:
+ Scale = Width = 2;
+ break;
+  }
+
+ BaseReg = LdSt->getOperand(1).getReg();
+ Offset = LdSt->getOperand(2).getImm() * Scale;
+ return true;
+}
+
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getLdStBaseRegImmOfs returns true.
@@ -1194,16 +1477,15 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
}
}
-MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
- int FrameIx,
- uint64_t Offset,
- const MDNode *MDPtr,
- DebugLoc DL) const {
+MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
+ MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
+ const MDNode *Expr, DebugLoc DL) const {
MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE))
.addFrameIndex(FrameIx)
.addImm(0)
.addImm(Offset)
- .addMetadata(MDPtr);
+ .addMetadata(Var)
+ .addMetadata(Expr);
return &*MIB;
}
@@ -2087,3 +2369,592 @@ void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
NopInst.setOpcode(AArch64::HINT);
NopInst.addOperand(MCOperand::CreateImm(0));
}
+/// useMachineCombiner - return true when a target supports MachineCombiner
+bool AArch64InstrInfo::useMachineCombiner() const {
+ // AArch64 supports the combiner
+ return true;
+}
+//
+// True when Opc sets flag
+static bool isCombineInstrSettingFlag(unsigned Opc) {
+ switch (Opc) {
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSWri:
+ case AArch64::ADDSXrr:
+ case AArch64::ADDSXri:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSXrr:
+ // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+ case AArch64::SUBSWri:
+ case AArch64::SUBSXri:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+//
+// 32b Opcodes that can be combined with a MUL
+static bool isCombineInstrCandidate32(unsigned Opc) {
+ switch (Opc) {
+ case AArch64::ADDWrr:
+ case AArch64::ADDWri:
+ case AArch64::SUBWrr:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSWri:
+ case AArch64::SUBSWrr:
+ // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+ case AArch64::SUBWri:
+ case AArch64::SUBSWri:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+//
+// 64b Opcodes that can be combined with a MUL
+static bool isCombineInstrCandidate64(unsigned Opc) {
+ switch (Opc) {
+ case AArch64::ADDXrr:
+ case AArch64::ADDXri:
+ case AArch64::SUBXrr:
+ case AArch64::ADDSXrr:
+ case AArch64::ADDSXri:
+ case AArch64::SUBSXrr:
+ // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
+ case AArch64::SUBXri:
+ case AArch64::SUBSXri:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+//
+// Opcodes that can be combined with a MUL
+static bool isCombineInstrCandidate(unsigned Opc) {
+ return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc));
+}
+
+static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO,
+ unsigned MulOpc, unsigned ZeroReg) {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineInstr *MI = nullptr;
+ // We need a virtual register definition.
+ if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ MI = MRI.getUniqueVRegDef(MO.getReg());
+ // And it needs to be in the trace (otherwise, it won't have a depth).
+ if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc)
+ return false;
+
+ assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isReg() && MI->getOperand(2).isReg() &&
+         MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs");
+
+ // The third input reg must be zero.
+ if (MI->getOperand(3).getReg() != ZeroReg)
+ return false;
+
+  // Must only be used by the user we combine with.
+ if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+ return false;
+
+ return true;
+}
+
+/// hasPattern - return true when there is potentially a faster code sequence
+/// for an instruction chain ending in \p Root. All potential patterns are
+/// listed in the \p Pattern vector. Patterns should be sorted in priority
+/// order since the pattern evaluator stops checking as soon as it finds a
+/// faster sequence.
+bool AArch64InstrInfo::hasPattern(
+ MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const {
+ unsigned Opc = Root.getOpcode();
+ MachineBasicBlock &MBB = *Root.getParent();
+ bool Found = false;
+
+ if (!isCombineInstrCandidate(Opc))
+    return false;
+ if (isCombineInstrSettingFlag(Opc)) {
+ int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true);
+ // When NZCV is live bail out.
+ if (Cmp_NZCV == -1)
+      return false;
+ unsigned NewOpc = convertFlagSettingOpcode(&Root);
+ // When opcode can't change bail out.
+ // CHECKME: do we miss any cases for opcode conversion?
+ if (NewOpc == Opc)
+      return false;
+ Opc = NewOpc;
+ }
+
+ switch (Opc) {
+ default:
+ break;
+ case AArch64::ADDWrr:
+ assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() &&
+ "ADDWrr does not have register operands");
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+ AArch64::WZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULADDW_OP1);
+ Found = true;
+ }
+ if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
+ AArch64::WZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULADDW_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::ADDXrr:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+ AArch64::XZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULADDX_OP1);
+ Found = true;
+ }
+ if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
+ AArch64::XZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULADDX_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::SUBWrr:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+ AArch64::WZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULSUBW_OP1);
+ Found = true;
+ }
+ if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr,
+ AArch64::WZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULSUBW_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::SUBXrr:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+ AArch64::XZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULSUBX_OP1);
+ Found = true;
+ }
+ if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr,
+ AArch64::XZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULSUBX_OP2);
+ Found = true;
+ }
+ break;
+ case AArch64::ADDWri:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+ AArch64::WZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULADDWI_OP1);
+ Found = true;
+ }
+ break;
+ case AArch64::ADDXri:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+ AArch64::XZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULADDXI_OP1);
+ Found = true;
+ }
+ break;
+ case AArch64::SUBWri:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr,
+ AArch64::WZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULSUBWI_OP1);
+ Found = true;
+ }
+ break;
+ case AArch64::SUBXri:
+ if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr,
+ AArch64::XZR)) {
+ Pattern.push_back(MachineCombinerPattern::MC_MULSUBXI_OP1);
+ Found = true;
+ }
+ break;
+ }
+ return Found;
+}
+
+/// genMadd - Generate madd instruction and combine mul and add.
+/// Example:
+/// MUL I=A,B,0
+/// ADD R,I,C
+/// ==> MADD R,A,B,C
+/// \param Root is the ADD instruction
+/// \param [out] InsInstrs is a vector of machine instructions and will
+/// contain the generated madd instruction
+/// \param IdxMulOpd is index of operand in Root that is the result of
+/// the MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode of the madd instruction
+static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII, MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ unsigned IdxMulOpd, unsigned MaddOpc,
+ const TargetRegisterClass *RC) {
+ assert(IdxMulOpd == 1 || IdxMulOpd == 2);
+
+ unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1;
+ MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
+ unsigned ResultReg = Root.getOperand(0).getReg();
+ unsigned SrcReg0 = MUL->getOperand(1).getReg();
+ bool Src0IsKill = MUL->getOperand(1).isKill();
+ unsigned SrcReg1 = MUL->getOperand(2).getReg();
+ bool Src1IsKill = MUL->getOperand(2).isKill();
+ unsigned SrcReg2 = Root.getOperand(IdxOtherOpd).getReg();
+ bool Src2IsKill = Root.getOperand(IdxOtherOpd).isKill();
+
+ if (TargetRegisterInfo::isVirtualRegister(ResultReg))
+ MRI.constrainRegClass(ResultReg, RC);
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
+ MRI.constrainRegClass(SrcReg0, RC);
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
+ MRI.constrainRegClass(SrcReg1, RC);
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg2))
+ MRI.constrainRegClass(SrcReg2, RC);
+
+ MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc),
+ ResultReg)
+ .addReg(SrcReg0, getKillRegState(Src0IsKill))
+ .addReg(SrcReg1, getKillRegState(Src1IsKill))
+ .addReg(SrcReg2, getKillRegState(Src2IsKill));
+ // Insert the MADD
+ InsInstrs.push_back(MIB);
+ return MUL;
+}
+
+/// genMaddR - Generate madd instruction and combine mul and add using
+/// an extra virtual register
+/// Example - an ADD intermediate needs to be stored in a register:
+/// MUL I=A,B,0
+/// ADD R,I,Imm
+/// ==> ORR V, ZR, Imm
+/// ==> MADD R,A,B,V
+/// \param Root is the ADD instruction
+/// \param [out] InsInstrs is a vector of machine instructions and will
+/// contain the generated madd instruction
+/// \param IdxMulOpd is index of operand in Root that is the result of
+/// the MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode of the madd instruction
+/// \param VR is a virtual register that holds the value of an ADD operand
+/// (V in the example above).
+static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII, MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ unsigned IdxMulOpd, unsigned MaddOpc,
+ unsigned VR, const TargetRegisterClass *RC) {
+ assert(IdxMulOpd == 1 || IdxMulOpd == 2);
+
+ MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg());
+ unsigned ResultReg = Root.getOperand(0).getReg();
+ unsigned SrcReg0 = MUL->getOperand(1).getReg();
+ bool Src0IsKill = MUL->getOperand(1).isKill();
+ unsigned SrcReg1 = MUL->getOperand(2).getReg();
+ bool Src1IsKill = MUL->getOperand(2).isKill();
+
+ if (TargetRegisterInfo::isVirtualRegister(ResultReg))
+ MRI.constrainRegClass(ResultReg, RC);
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg0))
+ MRI.constrainRegClass(SrcReg0, RC);
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg1))
+ MRI.constrainRegClass(SrcReg1, RC);
+ if (TargetRegisterInfo::isVirtualRegister(VR))
+ MRI.constrainRegClass(VR, RC);
+
+ MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc),
+ ResultReg)
+ .addReg(SrcReg0, getKillRegState(Src0IsKill))
+ .addReg(SrcReg1, getKillRegState(Src1IsKill))
+ .addReg(VR);
+ // Insert the MADD
+ InsInstrs.push_back(MIB);
+ return MUL;
+}
+
+/// genAlternativeCodeSequence - when hasPattern() finds a pattern,
+/// this function generates the instructions that could replace the
+/// original code sequence.
+void AArch64InstrInfo::genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+ MachineBasicBlock &MBB = *Root.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+  MachineInstr *MUL = nullptr;
+ const TargetRegisterClass *RC;
+ unsigned Opc;
+ switch (Pattern) {
+ default:
+    // Signal error by leaving MUL null; no combine is generated.
+ break;
+ case MachineCombinerPattern::MC_MULADDW_OP1:
+ case MachineCombinerPattern::MC_MULADDX_OP1:
+ // MUL I=A,B,0
+ // ADD R,I,C
+ // ==> MADD R,A,B,C
+ // --- Create(MADD);
+ if (Pattern == MachineCombinerPattern::MC_MULADDW_OP1) {
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::MC_MULADDW_OP2:
+ case MachineCombinerPattern::MC_MULADDX_OP2:
+ // MUL I=A,B,0
+ // ADD R,C,I
+ // ==> MADD R,A,B,C
+ // --- Create(MADD);
+ if (Pattern == MachineCombinerPattern::MC_MULADDW_OP2) {
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MC_MULADDWI_OP1:
+ case MachineCombinerPattern::MC_MULADDXI_OP1: {
+ // MUL I=A,B,0
+ // ADD R,I,Imm
+ // ==> ORR V, ZR, Imm
+ // ==> MADD R,A,B,V
+ // --- Create(MADD);
+ const TargetRegisterClass *OrrRC;
+ unsigned BitSize, OrrOpc, ZeroReg;
+ if (Pattern == MachineCombinerPattern::MC_MULADDWI_OP1) {
+ OrrOpc = AArch64::ORRWri;
+ OrrRC = &AArch64::GPR32spRegClass;
+ BitSize = 32;
+ ZeroReg = AArch64::WZR;
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ OrrOpc = AArch64::ORRXri;
+ OrrRC = &AArch64::GPR64spRegClass;
+ BitSize = 64;
+ ZeroReg = AArch64::XZR;
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+ uint64_t Imm = Root.getOperand(2).getImm();
+
+ if (Root.getOperand(3).isImm()) {
+ unsigned Val = Root.getOperand(3).getImm();
+ Imm = Imm << Val;
+ }
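+    // Keep only the low BitSize bits of the (possibly shifted) immediate so
+    // it can be re-encoded as a logical immediate for the ORR below.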
+ uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
+ uint64_t Encoding;
+ if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+ .addReg(ZeroReg)
+ .addImm(Encoding);
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+ }
+ break;
+ }
+ case MachineCombinerPattern::MC_MULSUBW_OP1:
+ case MachineCombinerPattern::MC_MULSUBX_OP1: {
+ // MUL I=A,B,0
+ // SUB R,I, C
+ // ==> SUB V, 0, C
+ // ==> MADD R,A,B,V // = -C + A*B
+ // --- Create(MADD);
+ const TargetRegisterClass *SubRC;
+ unsigned SubOpc, ZeroReg;
+ if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP1) {
+ SubOpc = AArch64::SUBWrr;
+ SubRC = &AArch64::GPR32spRegClass;
+ ZeroReg = AArch64::WZR;
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ SubOpc = AArch64::SUBXrr;
+ SubRC = &AArch64::GPR64spRegClass;
+ ZeroReg = AArch64::XZR;
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ unsigned NewVR = MRI.createVirtualRegister(SubRC);
+ // SUB NewVR, 0, C
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
+ .addReg(ZeroReg)
+ .addOperand(Root.getOperand(2));
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+ break;
+ }
+ case MachineCombinerPattern::MC_MULSUBW_OP2:
+ case MachineCombinerPattern::MC_MULSUBX_OP2:
+ // MUL I=A,B,0
+ // SUB R,C,I
+ // ==> MSUB R,A,B,C (computes C - A*B)
+ // --- Create(MSUB);
+ if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP2) {
+ Opc = AArch64::MSUBWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ Opc = AArch64::MSUBXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MC_MULSUBWI_OP1:
+ case MachineCombinerPattern::MC_MULSUBXI_OP1: {
+ // MUL I=A,B,0
+ // SUB R,I, Imm
+ // ==> ORR V, ZR, -Imm
+ // ==> MADD R,A,B,V // = -Imm + A*B
+ // --- Create(MADD);
+ const TargetRegisterClass *OrrRC;
+ unsigned BitSize, OrrOpc, ZeroReg;
+ if (Pattern == MachineCombinerPattern::MC_MULSUBWI_OP1) {
+ OrrOpc = AArch64::ORRWri;
+ OrrRC = &AArch64::GPR32spRegClass;
+ BitSize = 32;
+ ZeroReg = AArch64::WZR;
+ Opc = AArch64::MADDWrrr;
+ RC = &AArch64::GPR32RegClass;
+ } else {
+ OrrOpc = AArch64::ORRXri;
+ OrrRC = &AArch64::GPR64spRegClass;
+ BitSize = 64;
+ ZeroReg = AArch64::XZR;
+ Opc = AArch64::MADDXrrr;
+ RC = &AArch64::GPR64RegClass;
+ }
+ unsigned NewVR = MRI.createVirtualRegister(OrrRC);
+ int Imm = Root.getOperand(2).getImm();
+ if (Root.getOperand(3).isImm()) {
+ unsigned Val = Root.getOperand(3).getImm();
+ Imm = Imm << Val;
+ }
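+    // As above, but negate first: e.g. for BitSize == 32, Imm == 1 yields
+    // UImm == 0xffffffff, i.e. -1 truncated to 32 bits.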
+ uint64_t UImm = -Imm << (64 - BitSize) >> (64 - BitSize);
+ uint64_t Encoding;
+ if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc), NewVR)
+ .addReg(ZeroReg)
+ .addImm(Encoding);
+ InsInstrs.push_back(MIB1);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+ MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
+ }
+ break;
+ }
+ } // end switch (Pattern)
+  // Record MUL and ADD/SUB for deletion. MUL may still be null when no
+  // replacement was generated (e.g. an unencodable immediate), so guard
+  // against pushing a null pointer.
+  if (MUL)
+    DelInstrs.push_back(MUL);
+  DelInstrs.push_back(&Root);
+
+ return;
+}
+
+/// \brief Replace csincr-branch sequence by simple conditional branch
+///
+/// Examples:
+/// 1.
+/// csinc w9, wzr, wzr, <condition code>
+/// tbnz w9, #0, 0x44
+/// to
+/// b.<inverted condition code>
+///
+/// 2.
+/// csinc w9, wzr, wzr, <condition code>
+/// tbz w9, #0, 0x44
+/// to
+/// b.<condition code>
+///
+/// \param MI Conditional Branch
+/// \return True when the simple conditional branch is generated
+///
+bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const {
+ bool IsNegativeBranch = false;
+ bool IsTestAndBranch = false;
+ unsigned TargetBBInMI = 0;
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown branch instruction?");
+ case AArch64::Bcc:
+ return false;
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ TargetBBInMI = 1;
+ break;
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ TargetBBInMI = 1;
+ IsNegativeBranch = true;
+ break;
+ case AArch64::TBZW:
+ case AArch64::TBZX:
+ TargetBBInMI = 2;
+ IsTestAndBranch = true;
+ break;
+ case AArch64::TBNZW:
+ case AArch64::TBNZX:
+ TargetBBInMI = 2;
+ IsNegativeBranch = true;
+ IsTestAndBranch = true;
+ break;
+ }
+  // The CSINC we look for produces only 0 or 1, so a test-and-branch on
+  // any bit other than bit 0 makes no sense. Conservatively bail out in
+  // case the verifier missed this case.
+ if (IsTestAndBranch && MI->getOperand(1).getImm())
+ return false;
+
+ // Find Definition.
+  assert(MI->getParent() && "Incomplete machine instruction");
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ unsigned VReg = MI->getOperand(0).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(VReg))
+ return false;
+
+ MachineInstr *DefMI = MRI->getVRegDef(VReg);
+
+ // Look for CSINC
+ if (!(DefMI->getOpcode() == AArch64::CSINCWr &&
+ DefMI->getOperand(1).getReg() == AArch64::WZR &&
+ DefMI->getOperand(2).getReg() == AArch64::WZR) &&
+ !(DefMI->getOpcode() == AArch64::CSINCXr &&
+ DefMI->getOperand(1).getReg() == AArch64::XZR &&
+ DefMI->getOperand(2).getReg() == AArch64::XZR))
+ return false;
+
+ if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1)
+ return false;
+
+ AArch64CC::CondCode CC =
+ (AArch64CC::CondCode)DefMI->getOperand(3).getImm();
+ bool CheckOnlyCCWrites = true;
+ // Convert only when the condition code is not modified between
+ // the CSINC and the branch. The CC may be used by other
+ // instructions in between.
+ if (modifiesConditionCode(DefMI, MI, CheckOnlyCCWrites, &getRegisterInfo()))
+ return false;
+ MachineBasicBlock &RefToMBB = *MBB;
+ MachineBasicBlock *TBB = MI->getOperand(TargetBBInMI).getMBB();
+ DebugLoc DL = MI->getDebugLoc();
+ if (IsNegativeBranch)
+ CC = AArch64CC::getInvertedCondCode(CC);
+ BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB);
+ MI->eraseFromParent();
+ return true;
+}
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index f70b82b..d8f1274 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -11,11 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AArch64INSTRINFO_H
-#define LLVM_TARGET_AArch64INSTRINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRINFO_H
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/Target/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
@@ -46,9 +47,15 @@ public:
unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+ bool isAsCheapAsAMove(const MachineInstr *MI) const override;
+
bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
unsigned &DstReg, unsigned &SubIdx) const override;
+ bool
+ areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb,
+ AliasAnalysis *AA = nullptr) const override;
+
unsigned isLoadFromStackSlot(const MachineInstr *MI,
int &FrameIndex) const override;
unsigned isStoreToStackSlot(const MachineInstr *MI,
@@ -87,6 +94,10 @@ public:
unsigned &Offset,
const TargetRegisterInfo *TRI) const override;
+ bool getLdStBaseRegImmOfsWidth(MachineInstr *LdSt, unsigned &BaseReg,
+ int &Offset, int &Width,
+ const TargetRegisterInfo *TRI) const;
+
bool enableClusterLoads() const override { return true; }
bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
@@ -96,8 +107,8 @@ public:
MachineInstr *Second) const override;
MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
- uint64_t Offset, const MDNode *MDPtr,
- DebugLoc DL) const;
+ uint64_t Offset, const MDNode *Var,
+ const MDNode *Expr, DebugLoc DL) const;
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc, unsigned Opcode,
@@ -117,6 +128,7 @@ public:
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
+ using TargetInstrInfo::foldMemoryOperandImpl;
MachineInstr *
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,
@@ -153,7 +165,27 @@ public:
bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
-
+ bool optimizeCondBranch(MachineInstr *MI) const override;
+  /// hasPattern - return true when there is potentially a faster code
+  /// sequence for an instruction chain ending in \p Root. All potential
+  /// patterns are listed in the \p Pattern array.
+ bool hasPattern(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern)
+ const override;
+
+  /// genAlternativeCodeSequence - when hasPattern() finds a pattern,
+  /// this function generates the instructions that could replace the
+  /// original code sequence.
+ void genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+ /// useMachineCombiner - AArch64 supports MachineCombiner
+ bool useMachineCombiner() const override;
+
+ bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
private:
void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
MachineBasicBlock *TBB,
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 0ba069e..e0fb90a 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -24,6 +24,7 @@ def HasCRC : Predicate<"Subtarget->hasCRC()">,
AssemblerPredicate<"FeatureCRC", "crc">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
+def IsCyclone : Predicate<"Subtarget->isCyclone()">;
//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
@@ -236,6 +237,12 @@ def AArch64tlsdesc_call : SDNode<"AArch64ISD::TLSDESC_CALL",
def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
SDT_AArch64WrapperLarge>;
+def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>;
+
+def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisSameAs<1, 2>]>;
+def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
+def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;
//===----------------------------------------------------------------------===//
@@ -474,6 +481,24 @@ def trunc_imm : SDNodeXForm<imm, [{
def : Pat<(i64 i64imm_32bit:$src),
(SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;
+// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model).
+def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+ N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i32);
+}]>;
+
+def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
+return CurDAG->getTargetConstant(
+ N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i64);
+}]>;
+
+
+def : Pat<(f32 fpimm:$in),
+ (COPY_TO_REGCLASS (MOVi32imm (bitcast_fpimm_to_i32 f32:$in)), FPR32)>;
+def : Pat<(f64 fpimm:$in),
+ (COPY_TO_REGCLASS (MOVi64imm (bitcast_fpimm_to_i64 f64:$in)), FPR64)>;
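+// For example (an illustrative lowering, not shown in the patch): (f32 1.0)
+// can become "mov w8, #0x3f800000" followed by "fmov s0, w8" rather than a
+// load from a constant pool.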
+
+
// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK
// sequences.
def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
@@ -632,6 +657,10 @@ def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))),
(MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))),
(MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)),
+ (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)),
+ (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
} // AddedComplexity = 7
let AddedComplexity = 5 in {
@@ -782,7 +811,7 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
//===----------------------------------------------------------------------===//
// Bitfield immediate extraction instruction.
//===----------------------------------------------------------------------===//
-let neverHasSideEffects = 1 in
+let hasSideEffects = 0 in
defm EXTR : ExtractImm<"extr">;
def : InstAlias<"ror $dst, $src, $shift",
(EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
@@ -797,7 +826,7 @@ def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
//===----------------------------------------------------------------------===//
// Other bitfield immediate instructions.
//===----------------------------------------------------------------------===//
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">;
defm SBFM : BitfieldImm<0b00, "sbfm">;
defm UBFM : BitfieldImm<0b10, "ubfm">;
@@ -970,9 +999,9 @@ def : InstAlias<"cneg $dst, $src, $cc",
// PC-relative instructions.
//===----------------------------------------------------------------------===//
let isReMaterializable = 1 in {
-let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in {
+let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
def ADR : ADRI<0, "adr", adrlabel, []>;
-} // neverHasSideEffects = 1
+} // hasSideEffects = 0
def ADRP : ADRI<1, "adrp", adrplabel,
[(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>;
@@ -1173,6 +1202,9 @@ defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>;
defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
+defm : ScalToVecROLoadPat<ro16, load, i32, v4f16, LDRHroW, LDRHroX, hsub>;
+defm : ScalToVecROLoadPat<ro16, load, i32, v8f16, LDRHroW, LDRHroX, hsub>;
+
defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>;
defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>;
@@ -1213,6 +1245,7 @@ let Predicates = [IsLE] in {
defm : VecROLoadPat<ro64, v2f32, LDRDroW, LDRDroX>;
defm : VecROLoadPat<ro64, v8i8, LDRDroW, LDRDroX>;
defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
+ defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>;
}
defm : VecROLoadPat<ro64, v1i64, LDRDroW, LDRDroX>;
@@ -1226,6 +1259,7 @@ let Predicates = [IsLE] in {
defm : VecROLoadPat<ro128, v4i32, LDRQroW, LDRQroX>;
defm : VecROLoadPat<ro128, v4f32, LDRQroW, LDRQroX>;
defm : VecROLoadPat<ro128, v8i16, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v8f16, LDRQroW, LDRQroX>;
defm : VecROLoadPat<ro128, v16i8, LDRQroW, LDRQroX>;
}
} // AddedComplexity = 10
@@ -1355,6 +1389,8 @@ let Predicates = [IsLE] in {
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
}
def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
@@ -1376,6 +1412,8 @@ let Predicates = [IsLE] in {
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
}
def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
@@ -1512,6 +1550,8 @@ let Predicates = [IsLE] in {
(LDURDi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
(LDURDi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v4f16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
}
def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
(LDURDi GPR64sp:$Rn, simm9:$offset)>;
@@ -1532,6 +1572,8 @@ let Predicates = [IsLE] in {
(LDURQi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
(LDURQi GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(v8f16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+ (LDURQi GPR64sp:$Rn, simm9:$offset)>;
}
// anyext -> zext
@@ -1828,6 +1870,7 @@ let Predicates = [IsLE] in {
defm : VecROStorePat<ro64, v2f32, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>;
+ defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>;
}
defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
@@ -1842,9 +1885,37 @@ let Predicates = [IsLE] in {
defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>;
}
} // AddedComplexity = 10
+// Match stores from lane 0 to the appropriate subreg's store.
+multiclass VecROStoreLane0Pat<ROAddrMode ro, SDPatternOperator storeop,
+ ValueType VecTy, ValueType STy,
+ SubRegIndex SubRegIdx,
+ Instruction STRW, Instruction STRX> {
+
+ def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
+ (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+ (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+ GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+ def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)),
+ (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+ (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx),
+ GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
+}
+
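+// For example (illustrative, assuming standard AArch64 assembly): storing
+// lane 0 of a v4f32 held in q0 with a register offset can be selected as
+// "str s0, [x0, x1]" via the ssub subregister rather than an explicit lane
+// extract followed by a scalar store.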
+let AddedComplexity = 19 in {
+ defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>;
+ defm : VecROStoreLane0Pat<ro16, store , v8i16, i16, hsub, STRHroW, STRHroX>;
+ defm : VecROStoreLane0Pat<ro32, truncstorei32, v4i32, i32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro32, store , v4i32, i32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro32, store , v4f32, f32, ssub, STRSroW, STRSroX>;
+ defm : VecROStoreLane0Pat<ro64, store , v2i64, i64, dsub, STRDroW, STRDroX>;
+ defm : VecROStoreLane0Pat<ro64, store , v2f64, f64, dsub, STRDroW, STRDroX>;
+}
+
//---
// (unsigned immediate)
defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str",
@@ -1892,6 +1963,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v2i32 FPR64:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(store (v4f16 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
}
def : Pat<(store (v1f64 FPR64:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
@@ -1921,6 +1995,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v2i64 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v8f16 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
}
def : Pat<(store (f128 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
@@ -1983,6 +2060,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v2i32 FPR64:$Rt),
(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v4f16 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
@@ -2013,6 +2093,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v2f64 FPR128:$Rt),
(am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v8f16 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
// unscaled i64 truncating stores
@@ -2089,6 +2172,8 @@ def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
@@ -2102,6 +2187,8 @@ def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
//---
// (immediate post-indexed)
@@ -2139,6 +2226,8 @@ def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
@@ -2152,6 +2241,8 @@ def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+ (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
//===----------------------------------------------------------------------===//
// Load/store exclusive instructions.
@@ -2384,6 +2475,28 @@ defm FMOV : FPMoveImmediate<"fmov">;
//===----------------------------------------------------------------------===//
defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
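+// (x + (x >>s bits-1)) ^ (x >>s bits-1) is the canonical branchless integer
+// abs idiom; fold it directly to the ABS instruction.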
+def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))),
+ (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))),
+ (ABSv8i8 V64:$src)>;
+def : Pat<(xor (v4i16 (AArch64vashr V64:$src, (i32 15))),
+ (v4i16 (add V64:$src, (AArch64vashr V64:$src, (i32 15))))),
+ (ABSv4i16 V64:$src)>;
+def : Pat<(xor (v2i32 (AArch64vashr V64:$src, (i32 31))),
+ (v2i32 (add V64:$src, (AArch64vashr V64:$src, (i32 31))))),
+ (ABSv2i32 V64:$src)>;
+def : Pat<(xor (v16i8 (AArch64vashr V128:$src, (i32 7))),
+ (v16i8 (add V128:$src, (AArch64vashr V128:$src, (i32 7))))),
+ (ABSv16i8 V128:$src)>;
+def : Pat<(xor (v8i16 (AArch64vashr V128:$src, (i32 15))),
+ (v8i16 (add V128:$src, (AArch64vashr V128:$src, (i32 15))))),
+ (ABSv8i16 V128:$src)>;
+def : Pat<(xor (v4i32 (AArch64vashr V128:$src, (i32 31))),
+ (v4i32 (add V128:$src, (AArch64vashr V128:$src, (i32 31))))),
+ (ABSv4i32 V128:$src)>;
+def : Pat<(xor (v2i64 (AArch64vashr V128:$src, (i32 63))),
+ (v2i64 (add V128:$src, (AArch64vashr V128:$src, (i32 63))))),
+ (ABSv2i64 V128:$src)>;
+
defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
@@ -2412,6 +2525,11 @@ def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
(i64 2))))),
(FCVTLv4i32 V128:$Rn)>;
+def : Pat<(v4f32 (fextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (fextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
+ (i64 4))))),
+ (FCVTLv8i16 V128:$Rn)>;
+
defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>;
@@ -2423,6 +2541,7 @@ def : Pat<(concat_vectors V64:$Rd,
(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
(FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
+def : Pat<(v4f16 (fround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))),
(FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
@@ -2505,6 +2624,10 @@ defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>
defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
+def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
+def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
+def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
+def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
@@ -3101,6 +3224,46 @@ defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
defm USUBW : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
+// Additional patterns for SMULL and UMULL
+multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
+ Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+ def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+ (INST8B V64:$Rn, V64:$Rm)>;
+ def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+ (INST4H V64:$Rn, V64:$Rm)>;
+ def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+ (INST2S V64:$Rn, V64:$Rm)>;
+}
+
+defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
+ SMULLv4i16_v4i32, SMULLv2i32_v2i64>;
+defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
+ UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
+
+// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
+multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
+ Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+ def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+ (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;
+ def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
+ (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;
+ def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
+ (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>;
+}
+
+defm : Neon_mulacc_widen_patterns<
+ TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+ SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+ TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+ UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+ TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
+ SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
+defm : Neon_mulacc_widen_patterns<
+ TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
+ UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;
+
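In scalar C++ terms (a sketch only; the patterns fire on whole vectors), the shape these multiclasses cover is a sign- or zero-extending multiply, optionally fused with an accumulate. The extend-then-multiply maps to SMULL/UMULL, and the TriOpFrag forms above express folding the surrounding add/sub into SMLAL/SMLSL and friends:

    #include <cstddef>
    #include <cstdint>

    // Widening multiply-accumulate: 16-bit inputs, 32-bit accumulator.
    void widenMulAcc(int32_t *acc, const int16_t *a, const int16_t *b,
                     size_t n) {
      for (size_t i = 0; i != n; ++i)
        acc[i] += int32_t(a[i]) * int32_t(b[i]); // sext both, mul, then add
    }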
// Patterns for 64-bit pmull
def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
(PMULLv1i64 V64:$Rn, V64:$Rm)>;
@@ -3183,6 +3346,10 @@ def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
(EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
(EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v4f16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+ (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v8f16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+ (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
// We use EXT to handle extract_subvector to copy the upper 64 bits of a
// 128-bit vector.
@@ -3194,6 +3361,8 @@ def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
(EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
(EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 4))),
+ (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
(EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
@@ -3306,6 +3475,19 @@ def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))),
(v2f64 (DUPv2i64lane
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
(i64 0)))>;
+def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))),
+ (v4f16 (DUPv4i16lane
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+ (i64 0)))>;
+def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))),
+ (v8f16 (DUPv8i16lane
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+ (i64 0)))>;
+
+def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
+ (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
+def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
+ (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;
def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
(DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
@@ -3427,6 +3609,23 @@ def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
+def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
+ (f16 FPR16:$Rm), (i64 VectorIndexS:$imm))),
+ (EXTRACT_SUBREG
+ (INSvi16lane
+ (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+ VectorIndexS:$imm,
+ (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+ (i64 0)),
+ dsub)>;
+
+def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
+ (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
+ (INSvi16lane
+ V128:$Rn, VectorIndexH:$imm,
+ (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+ (i64 0))>;
+
def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
(f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
(EXTRACT_SUBREG
@@ -3507,6 +3706,7 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
dsub)>;
}
+defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, INSvi8lane>;
@@ -3522,6 +3722,8 @@ def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
(f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
(f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
+def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
+ (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
(f64 (EXTRACT_SUBREG
(INSvi64lane (v2f64 (IMPLICIT_DEF)), 0,
@@ -3532,6 +3734,11 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
(INSvi32lane (v4f32 (IMPLICIT_DEF)), 0,
V128:$Rn, VectorIndexS:$idx),
ssub))>;
+def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
+ (f16 (EXTRACT_SUBREG
+ (INSvi16lane (v8f16 (IMPLICIT_DEF)), 0,
+ V128:$Rn, VectorIndexH:$idx),
+ hsub))>;
// All concat_vectors operations are canonicalised to act on i64 vectors for
// AArch64. In the general case we need an instruction, which might just as well be
@@ -3546,6 +3753,7 @@ def : ConcatPat<v2f64, v1f64>;
def : ConcatPat<v4i32, v2i32>;
def : ConcatPat<v4f32, v2f32>;
def : ConcatPat<v8i16, v4i16>;
+def : ConcatPat<v8f16, v4f16>;
def : ConcatPat<v16i8, v8i8>;
// If the high lanes are undef, though, we can just ignore them:
@@ -3965,7 +4173,7 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
// AdvSIMD indexed element
//----------------------------------------------------------------------------
-let neverHasSideEffects = 1 in {
+let hasSideEffects = 0 in {
defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">;
defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">;
}
@@ -4386,7 +4594,7 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
0),
dsub)),
0),
- ssub)))>, Requires<[NotForCodeSize]>;
+ ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
(LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -4439,8 +4647,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
0),
dsub)),
0),
- dsub)))>, Requires<[NotForCodeSize]>;
-
+ dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+
def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
@@ -4519,7 +4727,7 @@ defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>;
defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>;
defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>;
defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>;
-let mayLoad = 1, neverHasSideEffects = 1 in {
+let mayLoad = 1, hasSideEffects = 0 in {
defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>;
defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>;
defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>;
@@ -4563,6 +4771,10 @@ def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
(LD1Rv2d GPR64sp:$Rn)>;
def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
(LD1Rv1d GPR64sp:$Rn)>;
+def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
+ (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
+ (LD1Rv8h GPR64sp:$Rn)>;
class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1>
@@ -4576,6 +4788,7 @@ def : Ld1Lane128Pat<load, VectorIndexS, v4i32, i32, LD1i32>;
def : Ld1Lane128Pat<load, VectorIndexS, v4f32, f32, LD1i32>;
def : Ld1Lane128Pat<load, VectorIndexD, v2i64, i64, LD1i64>;
def : Ld1Lane128Pat<load, VectorIndexD, v2f64, f64, LD1i64>;
+def : Ld1Lane128Pat<load, VectorIndexH, v8f16, f16, LD1i16>;
class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1>
@@ -4590,6 +4803,7 @@ def : Ld1Lane64Pat<extloadi8, VectorIndexB, v8i8, i32, LD1i8>;
def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
def : Ld1Lane64Pat<load, VectorIndexS, v2i32, i32, LD1i32>;
def : Ld1Lane64Pat<load, VectorIndexS, v2f32, f32, LD1i32>;
+def : Ld1Lane64Pat<load, VectorIndexH, v4f16, f16, LD1i16>;
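The Ld1Lane patterns select a scalar load feeding a vector lane insert into a single LD1 to that lane. A rough C++ model of the v4f16/v8f16 case just added (a sketch; lane layout is simplified and f16 bits are modeled as uint16_t):

    #include <cstdint>
    #include <cstring>

    struct V8f16 { uint16_t lane[8]; };

    // Load one 16-bit element from memory directly into lane idx, leaving
    // the other lanes untouched -- the effect of LD1i16.
    void ld1LaneH(V8f16 &v, const uint16_t *p, unsigned idx) {
      std::memcpy(&v.lane[idx], p, sizeof(uint16_t));
    }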
defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
@@ -4603,7 +4817,7 @@ defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>;
defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;
-let AddedComplexity = 15 in
+let AddedComplexity = 19 in
class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1>
: Pat<(scalar_store
@@ -4617,8 +4831,9 @@ def : St1Lane128Pat<store, VectorIndexS, v4i32, i32, ST1i32>;
def : St1Lane128Pat<store, VectorIndexS, v4f32, f32, ST1i32>;
def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>;
def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>;
+def : St1Lane128Pat<store, VectorIndexH, v8f16, f16, ST1i16>;
-let AddedComplexity = 15 in
+let AddedComplexity = 19 in
class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1>
: Pat<(scalar_store
@@ -4631,6 +4846,7 @@ def : St1Lane64Pat<truncstorei8, VectorIndexB, v8i8, i32, ST1i8>;
def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
def : St1Lane64Pat<store, VectorIndexS, v2i32, i32, ST1i32>;
def : St1Lane64Pat<store, VectorIndexS, v2f32, f32, ST1i32>;
+def : St1Lane64Pat<store, VectorIndexH, v4f16, f16, ST1i16>;
multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1,
@@ -4655,6 +4871,7 @@ defm : St1LanePost64Pat<post_store, VectorIndexS, v2i32, i32, ST1i32_POST, 4>;
defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>;
defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>;
defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>;
+defm : St1LanePost64Pat<post_store, VectorIndexH, v4f16, f16, ST1i16_POST, 2>;
multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1,
@@ -4678,8 +4895,9 @@ defm : St1LanePost128Pat<post_store, VectorIndexS, v4i32, i32, ST1i32_POST, 4>;
defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
+defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>;
-let mayStore = 1, neverHasSideEffects = 1 in {
+let mayStore = 1, hasSideEffects = 0 in {
defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>;
defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>;
@@ -4856,10 +5074,77 @@ def : Pat<(trap), (BRK 1)>;
// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX
//
+// Natural vector casts (64 bit)
+def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
+
+def : Pat<(v8i8 (AArch64NvCast (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
+
+// Natural vector casts (128 bit)
+def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>;
+
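A natural vector cast retypes the contents of an FPR without emitting any instruction, which is why every pattern above maps straight back to its source register. The closest portable C++ analogue is a size-preserving bit cast (a sketch; std::bit_cast plays the same role in C++20):

    #include <cstring>

    // Reinterpret the bytes of one 64/128-bit value as another type of the
    // same size; like AArch64NvCast, this is free at run time.
    template <typename To, typename From>
    To nvCast(const From &src) {
      static_assert(sizeof(To) == sizeof(From), "widths must match");
      To dst;
      std::memcpy(&dst, &src, sizeof(To));
      return dst;
    }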
let Predicates = [IsLE] in {
def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
@@ -4868,6 +5153,8 @@ def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
+ (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
@@ -4880,6 +5167,8 @@ def : Pat<(v4i16 (bitconvert GPR64:$Xn)),
(REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
(REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v4f16 (bitconvert GPR64:$Xn)),
+ (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
(REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
@@ -4889,6 +5178,8 @@ def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
(REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
(REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
+ (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
(REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
}
@@ -4917,6 +5208,7 @@ let Predicates = [IsLE] in {
def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
}
let Predicates = [IsBE] in {
@@ -4926,6 +5218,8 @@ def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))),
(v1i64 (REV64v4i16 FPR64:$src))>;
def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))),
(v1i64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))),
+ (v1i64 (REV64v4i16 FPR64:$src))>;
def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
(v1i64 (REV64v2i32 FPR64:$src))>;
}
@@ -4938,6 +5232,7 @@ def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
@@ -4950,6 +5245,8 @@ def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))),
(v2i32 (REV64v2i32 FPR64:$src))>;
def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
(v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))),
+ (v2i32 (REV32v4i16 FPR64:$src))>;
}
def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
@@ -4958,6 +5255,7 @@ def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
}
@@ -4970,6 +5268,8 @@ def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))),
(v4i16 (REV16v8i8 FPR64:$src))>;
def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))),
(v4i16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))),
(v4i16 (REV32v4i16 FPR64:$src))>;
def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
@@ -4977,12 +5277,41 @@ def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
}
let Predicates = [IsLE] in {
+def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))),
+ (v4f16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))),
+ (v4f16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))),
+ (v4f16 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))),
+ (v4f16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))),
+ (v4f16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))),
+ (v4f16 (REV64v4i16 FPR64:$src))>;
+}
+
+let Predicates = [IsLE] in {
def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))),
@@ -4997,6 +5326,8 @@ def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))),
(v8i8 (REV32v8i8 FPR64:$src))>;
def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))),
(v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))),
+ (v8i8 (REV16v8i8 FPR64:$src))>;
}
let Predicates = [IsLE] in {
@@ -5004,6 +5335,7 @@ def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))),
@@ -5014,6 +5346,8 @@ def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))),
(f64 (REV64v2i32 FPR64:$src))>;
def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))),
(f64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))),
+ (f64 (REV64v4i16 FPR64:$src))>;
}
def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;
@@ -5023,6 +5357,7 @@ def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
@@ -5033,6 +5368,8 @@ def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))),
(v1f64 (REV64v8i8 FPR64:$src))>;
def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
(v1f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))),
+ (v1f64 (REV64v4i16 FPR64:$src))>;
}
def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
@@ -5043,6 +5380,7 @@ def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
@@ -5055,6 +5393,8 @@ def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))),
(v2f32 (REV64v2i32 FPR64:$src))>;
def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))),
(v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))),
+ (v2f32 (REV32v4i16 FPR64:$src))>;
}
def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
@@ -5064,6 +5404,7 @@ def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
}
let Predicates = [IsBE] in {
@@ -5075,6 +5416,9 @@ def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))),
def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
(f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
(REV64v8i16 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))),
+ (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
+ (REV64v8i16 FPR128:$src), (i32 8)))>;
def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
(f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
@@ -5089,6 +5433,7 @@ let Predicates = [IsLE] in {
def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
}
@@ -5100,6 +5445,8 @@ def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))),
(v2f64 (REV64v4i32 FPR128:$src))>;
def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
(v2f64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))),
+ (v2f64 (REV64v8i16 FPR128:$src))>;
def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
(v2f64 (REV64v16i8 FPR128:$src))>;
def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
@@ -5110,6 +5457,7 @@ def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -5120,6 +5468,8 @@ def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))),
(REV64v4i32 FPR128:$src), (i32 8)))>;
def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
(v4f32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))),
+ (v4f32 (REV32v8i16 FPR128:$src))>;
def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
(v4f32 (REV32v16i8 FPR128:$src))>;
def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
@@ -5135,6 +5485,7 @@ def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))),
@@ -5148,6 +5499,8 @@ def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))),
(v2i64 (REV64v16i8 FPR128:$src))>;
def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
(v2i64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))),
+ (v2i64 (REV64v8i16 FPR128:$src))>;
}
def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
@@ -5157,6 +5510,7 @@ def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))),
@@ -5171,6 +5525,8 @@ def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))),
(v4i32 (REV32v16i8 FPR128:$src))>;
def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
(v4i32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))),
+ (v4i32 (REV32v8i16 FPR128:$src))>;
}
def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
@@ -5181,6 +5537,7 @@ def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))),
@@ -5197,6 +5554,36 @@ def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))),
(v8i16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
(v8i16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))),
+ (v8f16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+ (REV64v8i16 FPR128:$src),
+ (i32 8)))>;
+def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))),
+ (v8f16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))),
+ (v8f16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))),
+ (v8f16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
+ (v8f16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
+ (v8f16 (REV32v8i16 FPR128:$src))>;
}
let Predicates = [IsLE] in {
@@ -5206,6 +5593,7 @@ def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))),
@@ -5222,6 +5610,8 @@ def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))),
(v16i8 (REV64v16i8 FPR128:$src))>;
def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
(v16i8 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
+ (v16i8 (REV16v16i8 FPR128:$src))>;
}
def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))),
@@ -5245,6 +5635,8 @@ def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)),
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (i32 0)),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)),
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 3df9c4f..8157981 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -13,20 +13,21 @@
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
using namespace llvm;
#define DEBUG_TYPE "aarch64-ldst-opt"
@@ -108,7 +109,7 @@ private:
int getMemSize(MachineInstr *MemMI);
};
char AArch64LoadStoreOpt::ID = 0;
-}
+} // namespace
static bool isUnscaledLdst(unsigned Opc) {
switch (Opc) {
@@ -931,8 +932,9 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
const TargetMachine &TM = Fn.getTarget();
- TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
- TRI = TM.getRegisterInfo();
+ TII = static_cast<const AArch64InstrInfo *>(
+ TM.getSubtargetImpl()->getInstrInfo());
+ TRI = TM.getSubtargetImpl()->getRegisterInfo();
bool Modified = false;
for (auto &MBB : Fn)
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 75a17b9..e57b0f4 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -25,8 +25,7 @@
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
-AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, Mangler &mang,
- AsmPrinter &printer)
+AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer)
: Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {}
MCSymbol *
diff --git a/lib/Target/AArch64/AArch64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h
index ba50ba9..1e29b80 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.h
+++ b/lib/Target/AArch64/AArch64MCInstLower.h
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64_MCINSTLOWER_H
-#define AArch64_MCINSTLOWER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H
#include "llvm/ADT/Triple.h"
#include "llvm/Support/Compiler.h"
@@ -33,7 +33,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower {
Triple TargetTriple;
public:
- AArch64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer);
+ AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer);
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
diff --git a/lib/Target/AArch64/AArch64MachineCombinerPattern.h b/lib/Target/AArch64/AArch64MachineCombinerPattern.h
new file mode 100644
index 0000000..4164b33
--- /dev/null
+++ b/lib/Target/AArch64/AArch64MachineCombinerPattern.h
@@ -0,0 +1,42 @@
+//===- AArch64MachineCombinerPattern.h - AArch64 combiner patterns -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the instruction patterns supported by the machine
+// combiner.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H
+
+namespace llvm {
+
+/// Enumeration of the instruction patterns supported by the machine combiner.
+namespace MachineCombinerPattern {
+enum MC_PATTERN : int {
+ MC_NONE = 0,
+ MC_MULADDW_OP1 = 1,
+ MC_MULADDW_OP2 = 2,
+ MC_MULSUBW_OP1 = 3,
+ MC_MULSUBW_OP2 = 4,
+ MC_MULADDWI_OP1 = 5,
+ MC_MULSUBWI_OP1 = 6,
+ MC_MULADDX_OP1 = 7,
+ MC_MULADDX_OP2 = 8,
+ MC_MULSUBX_OP1 = 9,
+ MC_MULSUBX_OP2 = 10,
+ MC_MULADDXI_OP1 = 11,
+ MC_MULSUBXI_OP1 = 12
+};
+} // end namespace MachineCombinerPattern
+} // end namespace llvm
+
+#endif
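The enum names encode the source shapes the machine combiner looks for: W/X select 32- or 64-bit arithmetic, a trailing I marks an immediate multiplicand, and OP1/OP2 record which operand of the add/sub is the multiply so it can be fused into a single MADD/MSUB. A scalar C++ sketch of two of the shapes (the operand-numbering reading is inferred from the names):

    #include <cstdint>

    int32_t mulAddW_Op1(int32_t a, int32_t b, int32_t c) {
      return a * b + c; // MC_MULADDW_OP1: the mul is the add's first operand
    }
    int32_t mulSubW_Op2(int32_t a, int32_t b, int32_t c) {
      return c - a * b; // MC_MULSUBW_OP2: the mul is the sub's second operand
    }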
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 7c257ba..536a8d0 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64MACHINEFUNCTIONINFO_H
-#define AArch64MACHINEFUNCTIONINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -160,4 +160,4 @@ private:
};
} // End llvm namespace
-#endif // AArch64MACHINEFUNCTIONINFO_H
+#endif
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
new file mode 100644
index 0000000..f942c4e
--- /dev/null
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -0,0 +1,383 @@
+//===-- AArch64PBQPRegAlloc.cpp - AArch64 specific PBQP constraints -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file contains the AArch64 / Cortex-A57 specific register allocation
+// constraints for use by the PBQP register allocator.
+//
+// It is essentially a transcription of what is contained in
+// AArch64A57FPLoadBalancing, which tries to use a balanced
+// mix of odd and even D-registers when performing a critical sequence of
+// independent, non-quadword FP/ASIMD floating-point multiply-accumulates.
+//===----------------------------------------------------------------------===//
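The kind of code being constrained is a chain of dependent multiply-accumulates, where each result feeds the next accumulator operand. A C++ sketch of the shape (assuming FMADDDrrr is what instruction selection picks for the fused multiply-add, as in the opcode list handled by apply() below):

    #include <cmath>

    // Each iteration's fma reuses the previous accumulator, forming the
    // "accumulator chain" whose D-register parity the constraint balances.
    double macChain(const double *a, const double *b, int n) {
      double acc = 0.0;
      for (int i = 0; i != n; ++i)
        acc = std::fma(a[i], b[i], acc);
      return acc;
    }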
+
+#define DEBUG_TYPE "aarch64-pbqp"
+
+#include "AArch64.h"
+#include "AArch64PBQPRegAlloc.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocPBQP.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+#ifndef NDEBUG
+bool isFPReg(unsigned reg) {
+ return AArch64::FPR32RegClass.contains(reg) ||
+ AArch64::FPR64RegClass.contains(reg) ||
+ AArch64::FPR128RegClass.contains(reg);
+}
+#endif
+
+bool isOdd(unsigned reg) {
+ switch (reg) {
+ default:
+ llvm_unreachable("Register is not from the expected class!");
+ case AArch64::S1:
+ case AArch64::S3:
+ case AArch64::S5:
+ case AArch64::S7:
+ case AArch64::S9:
+ case AArch64::S11:
+ case AArch64::S13:
+ case AArch64::S15:
+ case AArch64::S17:
+ case AArch64::S19:
+ case AArch64::S21:
+ case AArch64::S23:
+ case AArch64::S25:
+ case AArch64::S27:
+ case AArch64::S29:
+ case AArch64::S31:
+ case AArch64::D1:
+ case AArch64::D3:
+ case AArch64::D5:
+ case AArch64::D7:
+ case AArch64::D9:
+ case AArch64::D11:
+ case AArch64::D13:
+ case AArch64::D15:
+ case AArch64::D17:
+ case AArch64::D19:
+ case AArch64::D21:
+ case AArch64::D23:
+ case AArch64::D25:
+ case AArch64::D27:
+ case AArch64::D29:
+ case AArch64::D31:
+ case AArch64::Q1:
+ case AArch64::Q3:
+ case AArch64::Q5:
+ case AArch64::Q7:
+ case AArch64::Q9:
+ case AArch64::Q11:
+ case AArch64::Q13:
+ case AArch64::Q15:
+ case AArch64::Q17:
+ case AArch64::Q19:
+ case AArch64::Q21:
+ case AArch64::Q23:
+ case AArch64::Q25:
+ case AArch64::Q27:
+ case AArch64::Q29:
+ case AArch64::Q31:
+ return true;
+ case AArch64::S0:
+ case AArch64::S2:
+ case AArch64::S4:
+ case AArch64::S6:
+ case AArch64::S8:
+ case AArch64::S10:
+ case AArch64::S12:
+ case AArch64::S14:
+ case AArch64::S16:
+ case AArch64::S18:
+ case AArch64::S20:
+ case AArch64::S22:
+ case AArch64::S24:
+ case AArch64::S26:
+ case AArch64::S28:
+ case AArch64::S30:
+ case AArch64::D0:
+ case AArch64::D2:
+ case AArch64::D4:
+ case AArch64::D6:
+ case AArch64::D8:
+ case AArch64::D10:
+ case AArch64::D12:
+ case AArch64::D14:
+ case AArch64::D16:
+ case AArch64::D18:
+ case AArch64::D20:
+ case AArch64::D22:
+ case AArch64::D24:
+ case AArch64::D26:
+ case AArch64::D28:
+ case AArch64::D30:
+ case AArch64::Q0:
+ case AArch64::Q2:
+ case AArch64::Q4:
+ case AArch64::Q6:
+ case AArch64::Q8:
+ case AArch64::Q10:
+ case AArch64::Q12:
+ case AArch64::Q14:
+ case AArch64::Q16:
+ case AArch64::Q18:
+ case AArch64::Q20:
+ case AArch64::Q22:
+ case AArch64::Q24:
+ case AArch64::Q26:
+ case AArch64::Q28:
+ case AArch64::Q30:
+ return false;
+ }
+}
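If register numbers were guaranteed to follow the hardware index (D0..D31 mapping to 0..31), parity would be a one-liner; the TableGen-generated AArch64 register enum offers no such guarantee, hence the exhaustive switch above. A hypothetical compact form, for contrast only:

    // Assumes hwIndex is the architectural register number, which the
    // generated enum values are NOT guaranteed to be.
    bool isOddIndex(unsigned hwIndex) { return (hwIndex & 1) != 0; }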
+
+bool haveSameParity(unsigned reg1, unsigned reg2) {
+ assert(isFPReg(reg1) && "Expecting an FP register for reg1");
+ assert(isFPReg(reg2) && "Expecting an FP register for reg2");
+
+ return isOdd(reg1) == isOdd(reg2);
+}
+
+} // end anonymous namespace
+
+bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd,
+ unsigned Ra) {
+ if (Rd == Ra)
+ return false;
+
+ LiveIntervals &LIs = G.getMetadata().LIS;
+
+ if (TRI->isPhysicalRegister(Rd) || TRI->isPhysicalRegister(Ra)) {
+ DEBUG(dbgs() << "Rd is a physical reg:" << TRI->isPhysicalRegister(Rd)
+ << '\n');
+ DEBUG(dbgs() << "Ra is a physical reg:" << TRI->isPhysicalRegister(Ra)
+ << '\n');
+ return false;
+ }
+
+ PBQPRAGraph::NodeId node1 = G.getMetadata().getNodeIdForVReg(Rd);
+ PBQPRAGraph::NodeId node2 = G.getMetadata().getNodeIdForVReg(Ra);
+
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed =
+ &G.getNodeMetadata(node1).getAllowedRegs();
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRaAllowed =
+ &G.getNodeMetadata(node2).getAllowedRegs();
+
+ PBQPRAGraph::EdgeId edge = G.findEdge(node1, node2);
+
+ // The edge does not exist. Create one with the appropriate interference
+ // costs.
+ if (edge == G.invalidEdgeId()) {
+ const LiveInterval &ld = LIs.getInterval(Rd);
+ const LiveInterval &la = LIs.getInterval(Ra);
+ bool livesOverlap = ld.overlaps(la);
+
+ PBQPRAGraph::RawMatrix costs(vRdAllowed->size() + 1,
+ vRaAllowed->size() + 1, 0);
+ for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+ unsigned pRd = (*vRdAllowed)[i];
+ for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+ unsigned pRa = (*vRaAllowed)[j];
+ if (livesOverlap && TRI->regsOverlap(pRd, pRa))
+ costs[i + 1][j + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
+ else
+ costs[i + 1][j + 1] = haveSameParity(pRd, pRa) ? 0.0 : 1.0;
+ }
+ }
+ G.addEdge(node1, node2, std::move(costs));
+ return true;
+ }
+
+ if (G.getEdgeNode1Id(edge) == node2) {
+ std::swap(node1, node2);
+ std::swap(vRdAllowed, vRaAllowed);
+ }
+
+ // Enforce minCost(sameParity(RaClass)) > maxCost(otherParity(RdClass))
+ PBQPRAGraph::RawMatrix costs(G.getEdgeCosts(edge));
+ for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+ unsigned pRd = (*vRdAllowed)[i];
+
+ // Get the maximum cost (excluding unallocatable reg) for same parity
+ // registers
+ PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
+ for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+ unsigned pRa = (*vRaAllowed)[j];
+ if (haveSameParity(pRd, pRa))
+ if (costs[i + 1][j + 1] !=
+ std::numeric_limits<PBQP::PBQPNum>::infinity() &&
+ costs[i + 1][j + 1] > sameParityMax)
+ sameParityMax = costs[i + 1][j + 1];
+ }
+
+ // Ensure all registers with a different parity have a higher cost
+ // than sameParityMax
+ for (unsigned j = 0, je = vRaAllowed->size(); j != je; ++j) {
+ unsigned pRa = (*vRaAllowed)[j];
+ if (!haveSameParity(pRd, pRa))
+ if (sameParityMax > costs[i + 1][j + 1])
+ costs[i + 1][j + 1] = sameParityMax + 1.0;
+ }
+ }
+ G.setEdgeCosts(edge, std::move(costs));
+
+ return true;
+}
+
+void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd,
+ unsigned Ra) {
+ LiveIntervals &LIs = G.getMetadata().LIS;
+
+ // Do some Chain management
+ if (Chains.count(Ra)) {
+ if (Rd != Ra) {
+ DEBUG(dbgs() << "Moving acc chain from " << PrintReg(Ra, TRI) << " to "
+ << PrintReg(Rd, TRI) << '\n';);
+ Chains.remove(Ra);
+ Chains.insert(Rd);
+ }
+ } else {
+ DEBUG(dbgs() << "Creating new acc chain for " << PrintReg(Rd, TRI)
+ << '\n';);
+ Chains.insert(Rd);
+ }
+
+ PBQPRAGraph::NodeId node1 = G.getMetadata().getNodeIdForVReg(Rd);
+
+ const LiveInterval &ld = LIs.getInterval(Rd);
+ for (auto r : Chains) {
+ // Skip self
+ if (r == Rd)
+ continue;
+
+ const LiveInterval &lr = LIs.getInterval(r);
+ if (ld.overlaps(lr)) {
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRdAllowed =
+ &G.getNodeMetadata(node1).getAllowedRegs();
+
+ PBQPRAGraph::NodeId node2 = G.getMetadata().getNodeIdForVReg(r);
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *vRrAllowed =
+ &G.getNodeMetadata(node2).getAllowedRegs();
+
+ PBQPRAGraph::EdgeId edge = G.findEdge(node1, node2);
+ assert(edge != G.invalidEdgeId() &&
+ "PBQP error ! The edge should exist !");
+
+ DEBUG(dbgs() << "Refining constraint !\n";);
+
+ if (G.getEdgeNode1Id(edge) == node2) {
+ std::swap(node1, node2);
+ std::swap(vRdAllowed, vRrAllowed);
+ }
+
+ // Enforce that cost is higher with all other Chains of the same parity
+ PBQP::Matrix costs(G.getEdgeCosts(edge));
+ for (unsigned i = 0, ie = vRdAllowed->size(); i != ie; ++i) {
+ unsigned pRd = (*vRdAllowed)[i];
+
+ // Get the maximum cost (excluding unallocatable reg) for all other
+ // parity registers
+ PBQP::PBQPNum sameParityMax = std::numeric_limits<PBQP::PBQPNum>::min();
+ for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) {
+ unsigned pRa = (*vRrAllowed)[j];
+ if (!haveSameParity(pRd, pRa))
+ if (costs[i + 1][j + 1] !=
+ std::numeric_limits<PBQP::PBQPNum>::infinity() &&
+ costs[i + 1][j + 1] > sameParityMax)
+ sameParityMax = costs[i + 1][j + 1];
+ }
+
+ // Ensure all registers with same parity have a higher cost
+ // than sameParityMax
+ for (unsigned j = 0, je = vRrAllowed->size(); j != je; ++j) {
+ unsigned pRa = (*vRrAllowed)[j];
+ if (haveSameParity(pRd, pRa))
+ if (sameParityMax > costs[i + 1][j + 1])
+ costs[i + 1][j + 1] = sameParityMax + 1.0;
+ }
+ }
+ G.setEdgeCosts(edge, std::move(costs));
+ }
+ }
+}
+
+static bool regJustKilledBefore(const LiveIntervals &LIs, unsigned reg,
+ const MachineInstr &MI) {
+ const LiveInterval &LI = LIs.getInterval(reg);
+ SlotIndex SI = LIs.getInstructionIndex(&MI);
+ return LI.expiredAt(SI);
+}
+
+void A57ChainingConstraint::apply(PBQPRAGraph &G) {
+ const MachineFunction &MF = G.getMetadata().MF;
+ LiveIntervals &LIs = G.getMetadata().LIS;
+
+ TRI = MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+ DEBUG(MF.dump());
+
+ for (const auto &MBB: MF) {
+ Chains.clear(); // FIXME: is this really needed? Could we work at the MF level?
+
+ for (const auto &MI: MBB) {
+
+ // Forget chains that have expired. Collect the dead registers first and
+ // remove them afterwards, so the set is not mutated while being iterated.
+ SmallVector<unsigned, 8> toDel;
+ for (auto r : Chains)
+ if (regJustKilledBefore(LIs, r, MI)) {
+ DEBUG(dbgs() << "Killing chain " << PrintReg(r, TRI) << " at ";
+ MI.print(dbgs()););
+ toDel.push_back(r);
+ }
+
+ while (!toDel.empty()) {
+ Chains.remove(toDel.back());
+ toDel.pop_back();
+ }
+
+ switch (MI.getOpcode()) {
+ case AArch64::FMSUBSrrr:
+ case AArch64::FMADDSrrr:
+ case AArch64::FNMSUBSrrr:
+ case AArch64::FNMADDSrrr:
+ case AArch64::FMSUBDrrr:
+ case AArch64::FMADDDrrr:
+ case AArch64::FNMSUBDrrr:
+ case AArch64::FNMADDDrrr: {
+ unsigned Rd = MI.getOperand(0).getReg();
+ unsigned Ra = MI.getOperand(3).getReg();
+
+ if (addIntraChainConstraint(G, Rd, Ra))
+ addInterChainConstraint(G, Rd, Ra);
+ break;
+ }
+
+ case AArch64::FMLAv2f32:
+ case AArch64::FMLSv2f32: {
+ unsigned Rd = MI.getOperand(0).getReg();
+ addInterChainConstraint(G, Rd, Rd);
+ break;
+ }
+
+ default:
+ break;
+ }
+ }
+ }
+}
diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/lib/Target/AArch64/AArch64PBQPRegAlloc.h
new file mode 100644
index 0000000..4f656f9
--- /dev/null
+++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.h
@@ -0,0 +1,38 @@
+//===-- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALLOC_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALLOC_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/PBQPRAConstraint.h"
+
+namespace llvm {
+
+/// Add the accumulator chaining constraint to a PBQP graph
+class A57ChainingConstraint : public PBQPRAConstraint {
+public:
+ // Add A57 specific constraints to the PBQP graph.
+ void apply(PBQPRAGraph &G) override;
+
+private:
+ SmallSetVector<unsigned, 32> Chains;
+ const TargetRegisterInfo *TRI;
+
+ // Add the accumulator chaining constraint, inside the chain, i.e. so that
+ // parity(Rd) == parity(Ra).
+ // \return true if a constraint was added
+ bool addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra);
+
+ // Add constraints between existing chains
+ void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra);
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALLOC_H
diff --git a/lib/Target/AArch64/AArch64PerfectShuffle.h b/lib/Target/AArch64/AArch64PerfectShuffle.h
index b22fa24..9e9eec4 100644
--- a/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -12,6 +12,9 @@
//
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64PERFECTSHUFFLE_H
+
// 31 entries have cost 0
// 242 entries have cost 1
// 1447 entries have cost 2
@@ -6584,3 +6587,5 @@ static const unsigned PerfectShuffleTable[6561+1] = {
835584U, // <u,u,u,u>: Cost 0 copy LHS
0
};
+
+#endif
diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp
index 4723cc4..97b0f0e 100644
--- a/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -21,18 +21,18 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -193,7 +193,7 @@ private:
// Inserting into the DenseMap may invalidate existing iterator.
// Keep a copy of the key to find the iterator to erase.
Instruction *OldInstr = IPI->first;
- InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second));
+ InsertPts[NewPt] = std::move(IPI->second);
// Erase IPI.
IPI = InsertPts.find(OldInstr);
InsertPts.erase(IPI);
@@ -569,7 +569,7 @@ bool AArch64PromoteConstant::runOnFunction(Function &F) {
// global. Do not promote constant expressions either, as they may
// require some code expansion.
if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
- AlreadyChecked.insert(Cst))
+ AlreadyChecked.insert(Cst).second)
LocalChange |= promoteConstant(Cst);
}
}
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 01b9587..d734d43 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -76,7 +76,7 @@ AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
BitVector
AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
// FIXME: avoid re-calculating this every time.
BitVector Reserved(getNumRegs());
@@ -105,7 +105,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
unsigned Reg) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
switch (Reg) {
default:
@@ -169,7 +169,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
unsigned
AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP;
}
@@ -236,7 +236,7 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
// Note that the incoming offset is based on the SP value at function entry,
// so it'll be negative.
MachineFunction &MF = *MI->getParent()->getParent();
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
MachineFrameInfo *MFI = MF.getFrameInfo();
// Estimate an offset from the frame pointer.
@@ -326,7 +326,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const AArch64FrameLowering *TFI = static_cast<const AArch64FrameLowering *>(
- MF.getTarget().getFrameLowering());
+ MF.getSubtarget().getFrameLowering());
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
unsigned FrameReg;
@@ -364,7 +364,7 @@ namespace llvm {
unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
switch (RC->getID()) {
default:
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index 76af1ed..51a5034 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AArch64REGISTERINFO_H
-#define LLVM_TARGET_AArch64REGISTERINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERINFO_H
#define GET_REGINFO_HEADER
#include "AArch64GenRegisterInfo.inc"
@@ -98,4 +98,4 @@ public:
} // end namespace llvm
-#endif // LLVM_TARGET_AArch64REGISTERINFO_H
+#endif
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index a30e4ad..d5ff3f1 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -390,13 +390,14 @@ def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> {
}
def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
- v1i64],
+ v1i64, v4f16],
64, (sequence "D%u", 0, 31)>;
// We don't (yet) have an f128 legal type, so don't use that here. We
// normalize 128-bit vectors to v2f64 for arg passing and such, so use
// that here.
def FPR128 : RegisterClass<"AArch64",
- [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128],
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128,
+ v8f16],
128, (sequence "Q%u", 0, 31)>;
// The lower 16 vector registers. Some instructions can only take registers
diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td
index 8209f96..3ec4157 100644
--- a/lib/Target/AArch64/AArch64SchedA57.td
+++ b/lib/Target/AArch64/AArch64SchedA57.td
@@ -12,11 +12,24 @@
//
//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// The Cortex-A57 is a traditional superscalar microprocessor with a
+// conservative 3-wide in-order stage for decode and dispatch. Combined with
+// the much wider out-of-order issue stage, this creates a need to schedule
+// micro-ops carefully so that all three decoded each cycle are successfully
+// issued, as the reservation station(s) simply don't stay occupied for long.
+// Therefore, IssueWidth is set to three, the narrower of the two stages,
+// while the machine is still modeled as out-of-order.
+
def CortexA57Model : SchedMachineModel {
- let IssueWidth = 8; // 3-way decode and 8-way issue
+ let IssueWidth = 3; // 3-way decode and dispatch
let MicroOpBufferSize = 128; // 128 micro-op re-order buffer
let LoadLatency = 4; // Optimistic load latency
let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch
+
+ // Enable partial & runtime unrolling. The magic number is chosen based on
+ // experiments and benchmarking data.
+ let LoopMicroOpBufferSize = 16;
}
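
Since dispatch is the narrow point, IssueWidth = 3 is the per-cycle cap the MachineScheduler uses when filling cycles with micro-ops. A back-of-the-envelope sketch of that bound, assuming nothing beyond the model above (helper name hypothetical):

// Minimum cycles the 3-wide in-order front end needs to push N micro-ops
// into the out-of-order core.
static unsigned dispatchCycles(unsigned NumMicroOps, unsigned IssueWidth = 3) {
  return (NumMicroOps + IssueWidth - 1) / IssueWidth; // ceiling division
}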
//===----------------------------------------------------------------------===//
@@ -24,18 +37,17 @@ def CortexA57Model : SchedMachineModel {
// The Cortex-A57 has 8 pipelines, each with its own 8-entry queue where
// micro-ops wait for their operands and then issue out-of-order.
-def A57UnitB : ProcResource<1> { let BufferSize = 8; } // Type B micro-ops
-def A57UnitI : ProcResource<2> { let BufferSize = 8; } // Type I micro-ops
-def A57UnitM : ProcResource<1> { let BufferSize = 8; } // Type M micro-ops
-def A57UnitL : ProcResource<1> { let BufferSize = 8; } // Type L micro-ops
-def A57UnitS : ProcResource<1> { let BufferSize = 8; } // Type S micro-ops
-def A57UnitX : ProcResource<1> { let BufferSize = 8; } // Type X micro-ops
-def A57UnitW : ProcResource<1> { let BufferSize = 8; } // Type W micro-ops
+def A57UnitB : ProcResource<1>; // Type B micro-ops
+def A57UnitI : ProcResource<2>; // Type I micro-ops
+def A57UnitM : ProcResource<1>; // Type M micro-ops
+def A57UnitL : ProcResource<1>; // Type L micro-ops
+def A57UnitS : ProcResource<1>; // Type S micro-ops
+def A57UnitX : ProcResource<1>; // Type X micro-ops
+def A57UnitW : ProcResource<1>; // Type W micro-ops
let SchedModel = CortexA57Model in {
def A57UnitV : ProcResGroup<[A57UnitX, A57UnitW]>; // Type V micro-ops
}
-
let SchedModel = CortexA57Model in {
//===----------------------------------------------------------------------===//
@@ -71,7 +83,7 @@ def : SchedAlias<WriteSTIdx, A57Write_1cyc_1I_1S>;
def : SchedAlias<WriteF, A57Write_3cyc_1V>;
def : SchedAlias<WriteFCmp, A57Write_3cyc_1V>;
def : SchedAlias<WriteFCvt, A57Write_5cyc_1V>;
-def : SchedAlias<WriteFCopy, A57Write_3cyc_1V>;
+def : SchedAlias<WriteFCopy, A57Write_5cyc_1L>;
def : SchedAlias<WriteFImm, A57Write_3cyc_1V>;
def : SchedAlias<WriteFMul, A57Write_5cyc_1V>;
def : SchedAlias<WriteFDiv, A57Write_18cyc_1X>;
@@ -85,13 +97,12 @@ def : WriteRes<WriteHint, []> { let Latency = 1; }
def : WriteRes<WriteLDHi, []> { let Latency = 4; }
-// Forwarding logic is not [yet] explicitly modeled beyond what is captured
-// in the latencies of the A57 Generic SchedWriteRes's.
+// Forwarding logic is only modeled for multiply and accumulate
def : ReadAdvance<ReadI, 0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
def : ReadAdvance<ReadIM, 0>;
-def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>;
def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
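
The ReadAdvance change is how the model expresses late-operand forwarding: the accumulator input of a multiply-accumulate (ReadIMA) is sampled 2 cycles after issue, so a producing WriteIM32/WriteIM64 looks 2 cycles cheaper to that particular consumer. A minimal sketch of the arithmetic the scheduler performs, clamped at zero as in MCSchedModel (function name hypothetical):

// Effective producer->consumer latency once a ReadAdvance applies, e.g.
// a 3-cycle multiply feeding the accumulator operand (advance 2) is seen
// as a 1-cycle dependence.
static unsigned effectiveLatency(unsigned DefLatency, unsigned Advance) {
  return DefLatency > Advance ? DefLatency - Advance : 0;
}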
@@ -134,7 +145,13 @@ def : InstRW<[A57Write_2cyc_1M], (instregex "BFM")>;
// Cryptography Extensions
// -----------------------------------------------------------------------------
-def : InstRW<[A57Write_3cyc_1W], (instregex "CRC32")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
+def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA256SU0")>;
+def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA256(H|H2|SU1)")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^CRC32")>;
// Vector Load
@@ -301,4 +318,330 @@ def : InstRW<[A57Write_8cyc_8S_4V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_PO
def : InstRW<[A57Write_8cyc_8S], (instregex "ST4Fourv(2d)$")>;
def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+// Vector - Integer
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+// D form - v8i8, v4i16, v2i32
+// Q form - v16i8, v8i16, v4i32
+// D form - v1i8, v1i16, v1i32, v1i64
+// Q form - v16i8, v8i16, v4i32, v2i64
+// D form - v8i8_v8i16, v4i16_v4i32, v2i32_v2i64
+// Q form - v16i8_v8i16, v8i16_v4i32, v4i32_v2i64
+
+// ASIMD absolute diff accum, D-form
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+// ASIMD absolute diff accum, Q-form
+def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+// ASIMD absolute diff accum long
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]ABAL")>;
+
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
+// ASIMD arith, reduce, 8B/8H
+def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>;
+// ASIMD arith, reduce, 16B
+def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU]?ADDL?Vv16i8v$")>;
+
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>;
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[A57Write_7cyc_1V_1X], (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>;
+// ASIMD max/min, reduce, 16B
+def : InstRW<[A57Write_8cyc_2X], (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
+
+// ASIMD multiply, D-form
+def : InstRW<[A57Write_5cyc_1W], (instregex "^(P?MUL|SQR?DMULH)(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)(_indexed)?$")>;
+// ASIMD multiply, Q-form
+def : InstRW<[A57Write_6cyc_2W], (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply accumulate, D-form
+def : InstRW<[A57Write_5cyc_1W], (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, Q-form
+def : InstRW<[A57Write_6cyc_2W], (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+def A57WriteIVMA : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
+def A57ReadIVMA4 : SchedReadAdvance<4, [A57WriteIVMA]>;
+def : InstRW<[A57WriteIVMA, A57ReadIVMA4], (instregex "^(S|U|SQD)ML[AS]L")>;
+
+// ASIMD multiply long
+def : InstRW<[A57Write_5cyc_1W], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[A57Write_5cyc_1W], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^PMULL(v1i64|v2i64)")>;
+
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+def A57WriteIVA : SchedWriteRes<[A57UnitX]> { let Latency = 4; }
+def A57ReadIVA3 : SchedReadAdvance<3, [A57WriteIVA]>;
+def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^[SU]ADALP")>;
+def : InstRW<[A57WriteIVA, A57ReadIVA3], (instregex "^(S|SR|U|UR)SRA")>;
+
+// ASIMD shift by immed, complex
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[A57Write_4cyc_1X], (instregex "^SQSHLU")>;
+
+
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[A57Write_4cyc_2X], (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD shift by register, complex, D-form
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU][QR]{1,2}SHL(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[A57Write_5cyc_2X], (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+
+// Vector - Floating Point
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+// D form - v2f32
+// Q form - v4f32, v2f64
+// D form - 32, 64
+// D form - v1i32, v1i64
+// D form - v2i32
+// Q form - v4i32, v2i64
+
+// ASIMD FP arith, normal, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FABD|FADD|FSUB)(v2f32|32|64|v2i32p)")>;
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FABD|FADD|FSUB)(v4f32|v2f64|v2i64p)")>;
+
+// ASIMD FP arith, pairwise, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FADDP(v2f32|32|64|v2i32)")>;
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[A57Write_9cyc_3V], (instregex "^FADDP(v4f32|v2f64|v2i64)")>;
+
+// ASIMD FP compare, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v2f32|32|64|v1i32|v2i32|v1i64)")>;
+// ASIMD FP compare, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP convert, long and narrow
+def : InstRW<[A57Write_8cyc_3V], (instregex "^FCVT(L|N|XN)v")>;
+// ASIMD FP convert, other, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD FP convert, other, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[A57Write_18cyc_1X], (instregex "FDIVv2f32")>;
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[A57Write_36cyc_2X], (instregex "FDIVv4f32")>;
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[A57Write_64cyc_2X], (instregex "FDIVv2f64")>;
+
+// Note: These were simply duplicated from ASIMD FDIV because of missing
+// documentation.
+// ASIMD FP square root, D-form, F32
+def : InstRW<[A57Write_18cyc_1X], (instregex "FSQRTv2f32")>;
+// ASIMD FP square root, Q-form, F32
+def : InstRW<[A57Write_36cyc_2X], (instregex "FSQRTv4f32")>;
+// ASIMD FP square root, Q-form, F64
+def : InstRW<[A57Write_64cyc_2X], (instregex "FSQRTv2f64")>;
+
+// ASIMD FP max/min, normal, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?(v2f32)")>;
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^(FMAX|FMIN)(NM)?(v4f32|v2f64)")>;
+// ASIMD FP max/min, pairwise, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^(FMAX|FMIN)(NM)?P(v2f32|v2i32)")>;
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[A57Write_9cyc_3V], (instregex "^(FMAX|FMIN)(NM)?P(v4f32|v2f64|v2i64)")>;
+// ASIMD FP max/min, reduce
+def : InstRW<[A57Write_10cyc_3V], (instregex "^(FMAX|FMIN)(NM)?Vv")>;
+
+// ASIMD FP multiply, D-form, FZ
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+// ASIMD FP multiply, Q-form, FZ
+def : InstRW<[A57Write_5cyc_2V], (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+def A57WriteFPVMAD : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+def A57WriteFPVMAQ : SchedWriteRes<[A57UnitV, A57UnitV]> { let Latency = 10; }
+def A57ReadFPVMA5 : SchedReadAdvance<5, [A57WriteFPVMAD, A57WriteFPVMAQ]>;
+def : InstRW<[A57WriteFPVMAD, A57ReadFPVMA5], (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
+def : InstRW<[A57WriteFPVMAQ, A57ReadFPVMA5], (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP round, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT[AIMNPXZ](v2f32)")>;
+// ASIMD FP round, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
+
+
+// Vector - Miscellaneous
+// -----------------------------------------------------------------------------
+
+// Reference for forms in this group
+// D form - v8i8, v4i16, v2i32
+// Q form - v16i8, v8i16, v4i32
+// D form - v1i8, v1i16, v1i32, v1i64
+// Q form - v16i8, v8i16, v4i32, v2i64
+
+// ASIMD bitwise insert, Q-form
+def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>;
+
+// ASIMD duplicate, gen reg, D-form and Q-form
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>;
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^DUPv.+gpr")>;
+
+// ASIMD move, saturating
+def : InstRW<[A57Write_4cyc_1X], (instregex "^[SU]QXTU?N")>;
+
+// ASIMD reciprocal estimate, D-form
+def : InstRW<[A57Write_5cyc_1V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[A57Write_5cyc_2V], (instregex "^[FU](RECP|RSQRT)(E|X)(v2f64|v4f32|v4i32)")>;
+
+// ASIMD reciprocal step, D-form, FZ
+def : InstRW<[A57Write_9cyc_1V], (instregex "^F(RECP|RSQRT)S(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+// ASIMD reciprocal step, Q-form, FZ
+def : InstRW<[A57Write_9cyc_2V], (instregex "^F(RECP|RSQRT)S(v2f64|v4f32|v4i32)")>;
+
+// ASIMD table lookup, D-form
+def : InstRW<[A57Write_3cyc_1V], (instregex "^TB[LX]v8i8One")>;
+def : InstRW<[A57Write_6cyc_2V], (instregex "^TB[LX]v8i8Two")>;
+def : InstRW<[A57Write_9cyc_3V], (instregex "^TB[LX]v8i8Three")>;
+def : InstRW<[A57Write_12cyc_4V], (instregex "^TB[LX]v8i8Four")>;
+// ASIMD table lookup, Q-form
+def : InstRW<[A57Write_6cyc_3V], (instregex "^TB[LX]v16i8One")>;
+def : InstRW<[A57Write_9cyc_5V], (instregex "^TB[LX]v16i8Two")>;
+def : InstRW<[A57Write_12cyc_7V], (instregex "^TB[LX]v16i8Three")>;
+def : InstRW<[A57Write_15cyc_9V], (instregex "^TB[LX]v16i8Four")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[A57Write_6cyc_1I_1L], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, gen reg to element
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^INSv")>;
+
+// ASIMD unzip/zip, Q-form
+def : InstRW<[A57Write_6cyc_3V], (instregex "^(UZP|ZIP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>;
+
+
+// Remainder
+// -----------------------------------------------------------------------------
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^F(ADD|SUB)[DS]rr")>;
+
+def A57WriteFPMA : SchedWriteRes<[A57UnitV]> { let Latency = 9; }
+def A57ReadFPMA5 : SchedReadAdvance<5, [A57WriteFPMA]>;
+def A57ReadFPM : SchedReadAdvance<0>;
+def : InstRW<[A57WriteFPMA, A57ReadFPM, A57ReadFPM, A57ReadFPMA5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
+
+def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
+def : InstRW<[A57Write_10cyc_1L_1V], (instregex "^[SU]CVTF")>;
+
+def : InstRW<[A57Write_32cyc_1X], (instrs FDIVDrr)>;
+def : InstRW<[A57Write_18cyc_1X], (instrs FDIVSrr)>;
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^F(MAX|MIN).+rr")>;
+
+def : InstRW<[A57Write_5cyc_1V], (instregex "^FRINT.+r")>;
+
+def : InstRW<[A57Write_32cyc_1X], (instrs FSQRTDr)>;
+def : InstRW<[A57Write_18cyc_1X], (instrs FSQRTSr)>;
+
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPDi)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDNPQi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPSi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPDi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpre)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDPQi)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpost)>;
+def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpre)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi], (instrs LDPSWi)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpost)>;
+def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpre)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPSi)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpre)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRBpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRBui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRDl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRDpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRDui)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroX)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRHpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroW)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRHui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRQl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRQpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroW)>;
+def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRQui)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHWroX)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroW)>;
+def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRSl)>;
+def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRSpost)>;
+def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroW)>;
+def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroX)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDRSui)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURBi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURDi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURHi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURQi)>;
+def : InstRW<[A57Write_5cyc_1L], (instrs LDURSi)>;
+
+def : InstRW<[A57Write_2cyc_2S], (instrs STNPDi)>;
+def : InstRW<[A57Write_4cyc_1I_4S], (instrs STNPQi)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STNPXi)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STPDi)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPDpre)>;
+def : InstRW<[A57Write_4cyc_1I_4S], (instrs STPQi)>;
+def : InstRW<[WriteAdr, A57Write_4cyc_1I_4S], (instrs STPQpost)>;
+def : InstRW<[WriteAdr, A57Write_4cyc_2I_4S], (instrs STPQpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPSpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STPWpre)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STPXi)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STPXpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBBpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRBpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRBpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRBroX)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRDpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRDpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHHpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHHroX)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRHpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRHpre)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroW)>;
+def : InstRW<[A57Write_3cyc_1I_1S, ReadAdrBase], (instrs STRHroX)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQpost)>;
+def : InstRW<[WriteAdr, A57Write_2cyc_1I_2S], (instrs STRQpre)>;
+def : InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroW)>;
+def : InstRW<[A57Write_2cyc_1I_2S, ReadAdrBase], (instrs STRQroX)>;
+def : InstRW<[A57Write_2cyc_1I_2S], (instrs STRQui)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRSpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S], (instrs STRSpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRWpre)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRXpost)>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1I_1S, ReadAdrBase], (instrs STRXpre)>;
+def : InstRW<[A57Write_2cyc_2S], (instrs STURQi)>;
+
} // SchedModel = CortexA57Model
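
With A57WriteIVMA (latency 5) and A57ReadIVMA4 (advance 4) as defined above, a chain of vector multiply-accumulates linked through the accumulator operand pays the full latency only once; every later link is seen at 5 - 4 = 1 cycle. A hedged sketch of the resulting timing (helper name hypothetical):

// Cycles until the final result of an accumulator chain of N MLAs is
// ready: full latency for the first, the forwarded distance thereafter.
static unsigned mlaChainCycles(unsigned N) {
  const unsigned Latency = 5, Advance = 4;
  return N == 0 ? 0 : Latency + (N - 1) * (Latency - Advance);
}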
diff --git a/lib/Target/AArch64/AArch64SchedA57WriteRes.td b/lib/Target/AArch64/AArch64SchedA57WriteRes.td
index a8f421b..6f30108 100644
--- a/lib/Target/AArch64/AArch64SchedA57WriteRes.td
+++ b/lib/Target/AArch64/AArch64SchedA57WriteRes.td
@@ -28,14 +28,18 @@ def A57Write_5cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 5; }
def A57Write_5cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 5; }
def A57Write_5cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 5; }
def A57Write_10cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 10; }
-def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18; }
-def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19; }
+def A57Write_18cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 18;
+ let ResourceCycles = [18]; }
+def A57Write_19cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 19;
+ let ResourceCycles = [19]; }
def A57Write_1cyc_1B : SchedWriteRes<[A57UnitB]> { let Latency = 1; }
def A57Write_1cyc_1I : SchedWriteRes<[A57UnitI]> { let Latency = 1; }
def A57Write_1cyc_1S : SchedWriteRes<[A57UnitS]> { let Latency = 1; }
def A57Write_2cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 2; }
-def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32; }
-def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35; }
+def A57Write_32cyc_1X : SchedWriteRes<[A57UnitX]> { let Latency = 32;
+ let ResourceCycles = [32]; }
+def A57Write_35cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 35;
+ let ResourceCycles = [35]; }
def A57Write_3cyc_1M : SchedWriteRes<[A57UnitM]> { let Latency = 3; }
def A57Write_3cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 3; }
def A57Write_3cyc_1W : SchedWriteRes<[A57UnitW]> { let Latency = 3; }
@@ -53,6 +57,7 @@ def A57Write_6cyc_1V : SchedWriteRes<[A57UnitV]> { let Latency = 6; }
def A57Write_64cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
let Latency = 64;
let NumMicroOps = 2;
+ let ResourceCycles = [32, 32];
}
def A57Write_6cyc_1I_1L : SchedWriteRes<[A57UnitI,
A57UnitL]> {
@@ -137,6 +142,7 @@ def A57Write_2cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
def A57Write_36cyc_2X : SchedWriteRes<[A57UnitX, A57UnitX]> {
let Latency = 36;
let NumMicroOps = 2;
+ let ResourceCycles = [18, 18];
}
def A57Write_3cyc_1I_1M : SchedWriteRes<[A57UnitI,
A57UnitM]> {
@@ -153,6 +159,10 @@ def A57Write_3cyc_1S_1V : SchedWriteRes<[A57UnitS,
let Latency = 3;
let NumMicroOps = 2;
}
+def A57Write_3cyc_2V : SchedWriteRes<[A57UnitV, A57UnitV]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
def A57Write_4cyc_1I_1L : SchedWriteRes<[A57UnitI,
A57UnitL]> {
let Latency = 4;
@@ -295,6 +305,11 @@ def A57Write_9cyc_1L_3V : SchedWriteRes<[A57UnitL,
let Latency = 9;
let NumMicroOps = 4;
}
+def A57Write_12cyc_4V : SchedWriteRes<[A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+}
//===----------------------------------------------------------------------===//
@@ -334,6 +349,11 @@ def A57Write_9cyc_2L_3V : SchedWriteRes<[A57UnitL, A57UnitL,
let Latency = 9;
let NumMicroOps = 5;
}
+def A57Write_9cyc_5V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+}
//===----------------------------------------------------------------------===//
@@ -399,7 +419,7 @@ def A57Write_4cyc_1I_4S_2V : SchedWriteRes<[A57UnitI,
let Latency = 4;
let NumMicroOps = 7;
}
-def A57Write_6cyc_1I_6S : SchedWriteRes<[A57UnitI,
+def A57Write_6cyc_1I_6S : SchedWriteRes<[A57UnitI,
A57UnitS, A57UnitS, A57UnitS,
A57UnitS, A57UnitS, A57UnitS]> {
let Latency = 6;
@@ -412,6 +432,12 @@ def A57Write_9cyc_1I_2L_4V : SchedWriteRes<[A57UnitI,
let Latency = 9;
let NumMicroOps = 7;
}
+def A57Write_12cyc_7V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV]> {
+ let Latency = 12;
+ let NumMicroOps = 7;
+}
//===----------------------------------------------------------------------===//
@@ -443,11 +469,11 @@ def A57Write_8cyc_8S : SchedWriteRes<[A57UnitS, A57UnitS,
//===----------------------------------------------------------------------===//
// Define Generic 9 micro-op types
-def A57Write_8cyc_1I_8S : SchedWriteRes<[A57UnitI,
- A57UnitS, A57UnitS,
- A57UnitS, A57UnitS,
- A57UnitS, A57UnitS,
- A57UnitS, A57UnitS]> {
+def A57Write_8cyc_1I_8S : SchedWriteRes<[A57UnitI,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS,
+ A57UnitS, A57UnitS]> {
let Latency = 8;
let NumMicroOps = 9;
}
@@ -459,6 +485,12 @@ def A57Write_11cyc_1I_4L_4V : SchedWriteRes<[A57UnitI,
let Latency = 11;
let NumMicroOps = 9;
}
+def A57Write_15cyc_9V : SchedWriteRes<[A57UnitV, A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV, A57UnitV,
+ A57UnitV, A57UnitV, A57UnitV]> {
+ let Latency = 15;
+ let NumMicroOps = 9;
+}
//===----------------------------------------------------------------------===//
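
Adding ResourceCycles to the long-latency writes separates two ideas that Latency alone conflates: Latency says when the result is ready, while ResourceCycles = [18] keeps the X unit occupied so a second divide cannot start underneath it, matching an unpipelined divider. A small sketch of the consequence (struct and helper are illustrative, not LLVM types):

struct WriteRes { unsigned Latency, ResourceCycles; };

// Total cycles for N back-to-back divides on one fully serialized unit:
// each occupies the pipe for ResourceCycles; the last result arrives
// Latency cycles after its own issue.
static unsigned backToBackCycles(WriteRes W, unsigned N) {
  return N == 0 ? 0 : (N - 1) * W.ResourceCycles + W.Latency;
}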
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 1bf64fc..0cfd582 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -36,8 +36,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
// instead of memset.
if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
const AArch64TargetLowering &TLI =
- *static_cast<const AArch64TargetLowering *>(
- DAG.getTarget().getTargetLowering());
+ *DAG.getTarget().getSubtarget<AArch64Subtarget>().getTargetLowering();
EVT IntPtr = TLI.getPointerTy();
Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index 1180eea..11932d2 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64SELECTIONDAGINFO_H
-#define AArch64SELECTIONDAGINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H
#include "llvm/Target/TargetSelectionDAGInfo.h"
diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 45f8ddb..0c36e8f 100644
--- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -39,7 +39,7 @@ public:
static char ID;
AArch64StorePairSuppress() : MachineFunctionPass(ID) {}
- virtual const char *getPassName() const override {
+ const char *getPassName() const override {
return "AArch64 Store Pair Suppression";
}
@@ -50,7 +50,7 @@ private:
bool isNarrowFPStore(const MachineInstr &MI);
- virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<MachineTraceMetrics>();
AU.addPreserved<MachineTraceMetrics>();
@@ -85,8 +85,7 @@ bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB)
// If a subtarget does not define resources for STPQi, bail here.
if (SCDesc->isValid() && !SCDesc->isVariant()) {
- unsigned ResLenWithSTP = BBTrace.getResourceLength(
- ArrayRef<const MachineBasicBlock *>(), SCDesc);
+ unsigned ResLenWithSTP = BBTrace.getResourceLength(None, SCDesc);
if (ResLenWithSTP > ResLength) {
DEBUG(dbgs() << " Suppress STP in BB: " << BB->getNumber()
<< " resources " << ResLength << " -> " << ResLenWithSTP
@@ -118,12 +117,13 @@ bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) {
bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
- TII = static_cast<const AArch64InstrInfo *>(MF->getTarget().getInstrInfo());
- TRI = MF->getTarget().getRegisterInfo();
+ TII =
+ static_cast<const AArch64InstrInfo *>(MF->getSubtarget().getInstrInfo());
+ TRI = MF->getSubtarget().getRegisterInfo();
MRI = &MF->getRegInfo();
const TargetSubtargetInfo &ST =
MF->getTarget().getSubtarget<TargetSubtargetInfo>();
- SchedModel.init(*ST.getSchedModel(), &ST, TII);
+ SchedModel.init(ST.getSchedModel(), &ST, TII);
Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index bb0b72c..47b5d54 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "AArch64PBQPRegAlloc.h"
#include "AArch64Subtarget.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineScheduler.h"
@@ -43,8 +44,8 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
AArch64Subtarget::AArch64Subtarget(const std::string &TT,
const std::string &CPU,
- const std::string &FS, TargetMachine &TM,
- bool LittleEndian)
+ const std::string &FS,
+ const TargetMachine &TM, bool LittleEndian)
: AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false),
HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU),
@@ -64,13 +65,7 @@ AArch64Subtarget::AArch64Subtarget(const std::string &TT,
unsigned char
AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
const TargetMachine &TM) const {
-
- // Determine whether this is a reference to a definition or a declaration.
- // Materializable GVs (in JIT lazy compilation mode) do not require an extra
- // load from stub.
- bool isDecl = GV->hasAvailableExternallyLinkage();
- if (GV->isDeclaration() && !GV->isMaterializable())
- isDecl = true;
+ bool isDecl = GV->isDeclarationForLinker();
// MachO large model always goes via a GOT, simply to get a single 8-byte
// absolute relocation on all global addresses.
@@ -78,10 +73,15 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
return AArch64II::MO_GOT;
// The small code model's direct accesses use ADRP, which cannot necessarily
- // produce the value 0 (if the code is above 4GB). Therefore they must use the
- // GOT.
- if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl)
- return AArch64II::MO_GOT;
+ // produce the value 0 (if the code is above 4GB).
+ if (TM.getCodeModel() == CodeModel::Small &&
+ GV->isWeakForLinker() && isDecl) {
+ // In PIC mode use the GOT, but in absolute mode use a constant pool load.
+ if (TM.getRelocationModel() == Reloc::Static)
+ return AArch64II::MO_CONSTPOOL;
+ else
+ return AArch64II::MO_GOT;
+ }
// If symbol visibility is hidden, the extra load is not needed if
// the symbol is definitely defined in the current translation unit.
@@ -128,3 +128,11 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
bool AArch64Subtarget::enableEarlyIfConversion() const {
return EnableEarlyIfConvert;
}
+
+std::unique_ptr<PBQPRAConstraint>
+AArch64Subtarget::getCustomPBQPConstraints() const {
+ if (!isCortexA57())
+ return nullptr;
+
+ return llvm::make_unique<A57ChainingConstraint>();
+}
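
The ClassifyGlobalReference change encodes one rule: under the small code model a weak declaration may resolve to address 0, which ADRP cannot materialize, so PIC code keeps indirecting through the GOT while static-relocation code now loads the address from a constant pool (the new MO_CONSTPOOL flag). Restated as a minimal sketch; the enum and its values are illustrative, not the real target flags:

enum RefFlags { Direct = 0, GOT = 1, ConstPool = 2 };

// How a weak, possibly-null global is classified under the small code
// model, per the patch above: ADRP cannot produce 0, so indirect via the
// GOT (PIC) or a literal-pool load (-static).
static RefFlags classifyWeakDecl(bool IsStaticReloc) {
  return IsStaticReloc ? ConstPool : GOT;
}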
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 52124f6..e2740f1 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -11,12 +11,12 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64SUBTARGET_H
-#define AArch64SUBTARGET_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64SUBTARGET_H
-#include "AArch64InstrInfo.h"
#include "AArch64FrameLowering.h"
#include "AArch64ISelLowering.h"
+#include "AArch64InstrInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64SelectionDAGInfo.h"
#include "llvm/IR/DataLayout.h"
@@ -69,18 +69,27 @@ public:
/// This constructor initializes the data members to match that
/// of the specified triple.
AArch64Subtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, TargetMachine &TM, bool LittleEndian);
+ const std::string &FS, const TargetMachine &TM,
+ bool LittleEndian);
- const AArch64SelectionDAGInfo *getSelectionDAGInfo() const { return &TSInfo; }
- const AArch64FrameLowering *getFrameLowering() const {
+ const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
+ return &TSInfo;
+ }
+ const AArch64FrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
- const AArch64TargetLowering *getTargetLowering() const {
+ const AArch64TargetLowering *getTargetLowering() const override {
return &TLInfo;
}
- const AArch64InstrInfo *getInstrInfo() const { return &InstrInfo; }
- const DataLayout *getDataLayout() const { return &DL; }
+ const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+ const DataLayout *getDataLayout() const override { return &DL; }
+ const AArch64RegisterInfo *getRegisterInfo() const override {
+ return &getInstrInfo()->getRegisterInfo();
+ }
bool enableMachineScheduler() const override { return true; }
+ bool enablePostMachineScheduler() const override {
+ return isCortexA53() || isCortexA57();
+ }
bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
@@ -94,12 +103,19 @@ public:
bool isLittleEndian() const { return DL.isLittleEndian(); }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+ bool isTargetIOS() const { return TargetTriple.isiOS(); }
+ bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+ bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
+ bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
-
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
bool isCyclone() const { return CPUString == "cyclone"; }
+ bool isCortexA57() const { return CPUString == "cortex-a57"; }
+ bool isCortexA53() const { return CPUString == "cortex-a53"; }
+
+ bool useAA() const override { return isCortexA53(); }
/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.
@@ -126,7 +142,9 @@ public:
unsigned NumRegionInstrs) const override;
bool enableEarlyIfConversion() const override;
+
+ std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override;
};
} // End llvm namespace
-#endif // AArch64SUBTARGET_H
+#endif
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index f99b90b..d4f19d2 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -12,8 +12,11 @@
#include "AArch64.h"
#include "AArch64TargetMachine.h"
-#include "llvm/PassManager.h"
+#include "AArch64TargetObjectFile.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/IR/Function.h"
+#include "llvm/PassManager.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetOptions.h"
@@ -24,6 +27,10 @@ static cl::opt<bool>
EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool> EnableMCR("aarch64-mcr",
+ cl::desc("Enable the machine combiner pass"),
+ cl::init(true), cl::Hidden);
+
static cl::opt<bool>
EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"),
cl::init(true), cl::Hidden);
@@ -59,13 +66,41 @@ EnableAtomicTidy("aarch64-atomic-cfg-tidy", cl::Hidden,
" to make use of cmpxchg flow-based information"),
cl::init(true));
+static cl::opt<bool>
+EnableEarlyIfConversion("aarch64-enable-early-ifcvt", cl::Hidden,
+ cl::desc("Run early if-conversion"),
+ cl::init(true));
+
+static cl::opt<bool>
+EnableCondOpt("aarch64-condopt",
+ cl::desc("Enable the condition optimizer pass"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden,
+ cl::desc("Work around Cortex-A53 erratum 835769"),
+ cl::init(false));
+
+static cl::opt<bool>
+EnableGEPOpt("aarch64-gep-opt", cl::Hidden,
+ cl::desc("Enable optimizations on complex GEPs"),
+ cl::init(true));
+
extern "C" void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
RegisterTargetMachine<AArch64beTargetMachine> Y(TheAArch64beTarget);
+ RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64Target);
+}
+
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering public interface.
+//===----------------------------------------------------------------------===//
+static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
+ if (TT.isOSBinFormatMachO())
+ return make_unique<AArch64_MachoTargetObjectFile>();
- RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64leTarget);
- RegisterTargetMachine<AArch64beTargetMachine> W(TheARM64beTarget);
+ return make_unique<AArch64_ELFTargetObjectFile>();
}
/// TargetMachine ctor - Create an AArch64 architecture model.
@@ -77,10 +112,39 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT,
CodeGenOpt::Level OL,
bool LittleEndian)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, *this, LittleEndian) {
+ TLOF(createTLOF(Triple(getTargetTriple()))),
+ Subtarget(TT, CPU, FS, *this, LittleEndian), isLittle(LittleEndian) {
initAsmInfo();
}
+AArch64TargetMachine::~AArch64TargetMachine() {}
+
+const AArch64Subtarget *
+AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
+ AttributeSet FnAttrs = F.getAttributes();
+ Attribute CPUAttr =
+ FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+ Attribute FSAttr =
+ FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+
+ std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+ ? CPUAttr.getValueAsString().str()
+ : TargetCPU;
+ std::string FS = !FSAttr.hasAttribute(Attribute::None)
+ ? FSAttr.getValueAsString().str()
+ : TargetFS;
+
+ auto &I = SubtargetMap[CPU + FS];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, isLittle);
+ }
+ return I.get();
+}
+
void AArch64leTargetMachine::anchor() { }
AArch64leTargetMachine::
@@ -104,7 +168,10 @@ namespace {
class AArch64PassConfig : public TargetPassConfig {
public:
AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+ if (TM->getOptLevel() != CodeGenOpt::None)
+ substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
+ }
AArch64TargetMachine &getAArch64TargetMachine() const {
return getTM<AArch64TargetMachine>();
@@ -114,10 +181,10 @@ public:
bool addPreISel() override;
bool addInstSelector() override;
bool addILPOpts() override;
- bool addPreRegAlloc() override;
- bool addPostRegAlloc() override;
- bool addPreSched2() override;
- bool addPreEmitPass() override;
+ void addPreRegAlloc() override;
+ void addPostRegAlloc() override;
+ void addPreSched2() override;
+ void addPreEmitPass() override;
};
} // namespace
@@ -136,7 +203,7 @@ TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
void AArch64PassConfig::addIRPasses() {
// Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
// ourselves.
- addPass(createAtomicExpandLoadLinkedPass(TM));
+ addPass(createAtomicExpandPass(TM));
// Cmpxchg instructions are often used with a subsequent comparison to
// determine whether it succeeded. We can exploit existing control-flow in
@@ -145,6 +212,19 @@ void AArch64PassConfig::addIRPasses() {
addPass(createCFGSimplificationPass());
TargetPassConfig::addIRPasses();
+
+ if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
+ // Call SeparateConstOffsetFromGEP pass to extract constants within indices
+ // and lower a GEP with multiple indices to either arithmetic operations or
+ // multiple GEPs with single index.
+ addPass(createSeparateConstOffsetFromGEPPass(TM, true));
+ // Call EarlyCSE pass to find and remove subexpressions in the lowered
+ // result.
+ addPass(createEarlyCSEPass());
+ // Do loop invariant code motion in case part of the lowered result is
+ // invariant.
+ addPass(createLICMPass());
+ }
}
// Pass Pipeline Configuration
@@ -174,43 +254,56 @@ bool AArch64PassConfig::addInstSelector() {
}
bool AArch64PassConfig::addILPOpts() {
+ if (EnableCondOpt)
+ addPass(createAArch64ConditionOptimizerPass());
if (EnableCCMP)
addPass(createAArch64ConditionalCompares());
- addPass(&EarlyIfConverterID);
+ if (EnableMCR)
+ addPass(&MachineCombinerID);
+ if (EnableEarlyIfConversion)
+ addPass(&EarlyIfConverterID);
if (EnableStPairSuppress)
addPass(createAArch64StorePairSuppressPass());
return true;
}
-bool AArch64PassConfig::addPreRegAlloc() {
+void AArch64PassConfig::addPreRegAlloc() {
// Use AdvSIMD scalar instructions whenever profitable.
- if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar)
+ if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar) {
addPass(createAArch64AdvSIMDScalar());
- return true;
+    // The AdvSIMD pass may produce copies that can be rewritten to
+    // be register coalescer friendly.
+ addPass(&PeepholeOptimizerID);
+ }
}
-bool AArch64PassConfig::addPostRegAlloc() {
+void AArch64PassConfig::addPostRegAlloc() {
// Change dead register definitions to refer to the zero register.
if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
addPass(createAArch64DeadRegisterDefinitions());
- return true;
+ if (TM->getOptLevel() != CodeGenOpt::None &&
+ (TM->getSubtarget<AArch64Subtarget>().isCortexA53() ||
+ TM->getSubtarget<AArch64Subtarget>().isCortexA57()) &&
+ usingDefaultRegAlloc())
+    // Improve performance of some FP/SIMD code on Cortex-A57.
+ addPass(createAArch64A57FPLoadBalancing());
}
-bool AArch64PassConfig::addPreSched2() {
+void AArch64PassConfig::addPreSched2() {
// Expand some pseudo instructions to allow proper scheduling.
addPass(createAArch64ExpandPseudoPass());
// Use load/store pair instructions when possible.
if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt)
addPass(createAArch64LoadStoreOptimizationPass());
- return true;
}
-bool AArch64PassConfig::addPreEmitPass() {
+void AArch64PassConfig::addPreEmitPass() {
+ if (EnableA53Fix835769)
+ addPass(createAArch64A53Fix835769());
// Relax conditional branch instructions if they're otherwise out of
// range of their destination.
addPass(createAArch64BranchRelaxation());
if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
TM->getSubtarget<AArch64Subtarget>().isTargetMachO())
addPass(createAArch64CollectLOHPass());
- return true;
}
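
The new getSubtargetImpl(const Function &) caches one subtarget per distinct "target-cpu"/"target-features" attribute pair, keyed by the concatenated strings, so modules mixing attributes do not rebuild a subtarget for every function. A standalone sketch of the scheme; Subtarget here is a stand-in struct, not the real class:

#include "llvm/ADT/StringMap.h"
#include <memory>
#include <string>

struct Subtarget { std::string CPU, FS; };

static Subtarget *getOrCreate(llvm::StringMap<std::unique_ptr<Subtarget>> &Map,
                              const std::string &CPU, const std::string &FS) {
  auto &Slot = Map[CPU + FS];           // one entry per (CPU, FS) pair
  if (!Slot)
    Slot.reset(new Subtarget{CPU, FS}); // built lazily, reused afterwards
  return Slot.get();
}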
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 852cb3f..75c65c5 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64TARGETMACHINE_H
-#define AArch64TARGETMACHINE_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETMACHINE_H
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
@@ -23,7 +23,9 @@ namespace llvm {
class AArch64TargetMachine : public LLVMTargetMachine {
protected:
+ std::unique_ptr<TargetLoweringObjectFile> TLOF;
AArch64Subtarget Subtarget;
+ mutable StringMap<std::unique_ptr<AArch64Subtarget>> SubtargetMap;
public:
AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU,
@@ -31,33 +33,25 @@ public:
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL, bool IsLittleEndian);
+ ~AArch64TargetMachine() override;
+
const AArch64Subtarget *getSubtargetImpl() const override {
return &Subtarget;
}
- const AArch64TargetLowering *getTargetLowering() const override {
- return getSubtargetImpl()->getTargetLowering();
- }
- const DataLayout *getDataLayout() const override {
- return getSubtargetImpl()->getDataLayout();
- }
- const AArch64FrameLowering *getFrameLowering() const override {
- return getSubtargetImpl()->getFrameLowering();
- }
- const AArch64InstrInfo *getInstrInfo() const override {
- return getSubtargetImpl()->getInstrInfo();
- }
- const AArch64RegisterInfo *getRegisterInfo() const override {
- return &getInstrInfo()->getRegisterInfo();
- }
- const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
- return getSubtargetImpl()->getSelectionDAGInfo();
- }
+ const AArch64Subtarget *getSubtargetImpl(const Function &F) const override;
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
/// \brief Register AArch64 analysis passes with a pass manager.
void addAnalysisPasses(PassManagerBase &PM) override;
+
+ TargetLoweringObjectFile* getObjFileLowering() const override {
+ return TLOF.get();
+ }
+
+private:
+ bool isLittle;
};
// AArch64leTargetMachine - AArch64 little endian target machine.
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index de63cb4..2e595f9 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AArch64_TARGETOBJECTFILE_H
-#define LLVM_TARGET_AArch64_TARGETOBJECTFILE_H
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64TARGETOBJECTFILE_H
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1dac14b..b1a2914 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -51,7 +51,7 @@ public:
AArch64TTI(const AArch64TargetMachine *TM)
: ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
- TLI(TM->getTargetLowering()) {
+ TLI(TM->getSubtargetImpl()->getTargetLowering()) {
initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
}
@@ -104,7 +104,7 @@ public:
return 64;
}
- unsigned getMaximumUnrollFactor() const override { return 2; }
+ unsigned getMaxInterleaveFactor() const override;
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
override;
@@ -112,10 +112,11 @@ public:
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
override;
- unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- OperandValueKind Opd1Info = OK_AnyValue,
- OperandValueKind Opd2Info = OK_AnyValue) const
- override;
+ unsigned getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
+ OperandValueKind Opd2Info = OK_AnyValue,
+ OperandValueProperties Opd1PropInfo = OP_None,
+ OperandValueProperties Opd2PropInfo = OP_None) const override;
unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
@@ -124,6 +125,13 @@ public:
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace) const override;
+
+ unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const override;
+
+ void getUnrollingPreferences(const Function *F, Loop *L,
+ UnrollingPreferences &UP) const override;
+
+
/// @}
};
@@ -400,18 +408,42 @@ unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
return 2;
}
-unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
- OperandValueKind Opd1Info,
- OperandValueKind Opd2Info) const {
+unsigned AArch64TTI::getArithmeticInstrCost(
+ unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
+ OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
+ OperandValueProperties Opd2PropInfo) const {
// Legalize the type.
std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ if (ISD == ISD::SDIV &&
+ Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
+ Opd2PropInfo == TargetTransformInfo::OP_PowerOf2) {
+    // On AArch64, a scalar signed division by a power-of-two constant is
+    // normally expanded to the sequence ADD + CMP + SELECT + SRA.
+    // The OperandValue properties may not be the same as those of the
+    // previous operation; conservatively assume OP_None.
+ unsigned Cost =
+ getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ return Cost;
+ }
+
switch (ISD) {
default:
- return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
- Opd2Info);
+ return TargetTransformInfo::getArithmeticInstrCost(
+ Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
case ISD::ADD:
case ISD::MUL:
case ISD::XOR:
@@ -498,3 +530,27 @@ unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
return LT.first;
}
+
+unsigned AArch64TTI::getCostOfKeepingLiveOverCall(ArrayRef<Type*> Tys) const {
+ unsigned Cost = 0;
+ for (auto *I : Tys) {
+ if (!I->isVectorTy())
+ continue;
+ if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
+ Cost += getMemoryOpCost(Instruction::Store, I, 128, 0) +
+ getMemoryOpCost(Instruction::Load, I, 128, 0);
+ }
+ return Cost;
+}
+
+unsigned AArch64TTI::getMaxInterleaveFactor() const {
+ if (ST->isCortexA57())
+ return 4;
+ return 2;
+}
+
+void AArch64TTI::getUnrollingPreferences(const Function *F, Loop *L,
+ UnrollingPreferences &UP) const {
+ // Disable partial & runtime unrolling on -Os.
+ UP.PartialOptSizeThreshold = 0;
+}
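
getCostOfKeepingLiveOverCall charges a store plus a load for every 128-bit vector type live across a call: AAPCS64 preserves only the low 64 bits of the callee-saved V registers, so a full Q register has to be spilled before the call and reloaded after it. A simplified sketch with a plain struct standing in for llvm::Type (all names hypothetical):

#include <vector>

struct VecTy { unsigned ScalarBits, NumElts; };

// Sum spill+reload cost for vectors that occupy a whole Q register.
static unsigned liveOverCallCost(const std::vector<VecTy> &Tys,
                                 unsigned StoreCost, unsigned LoadCost) {
  unsigned Cost = 0;
  for (const VecTy &T : Tys)
    if (T.ScalarBits * T.NumElts == 128)
      Cost += StoreCost + LoadCost;
  return Cost;
}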
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 37e9296..8eb906b 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -10,27 +10,28 @@
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "Utils/AArch64BaseInfo.h"
-#include "llvm/MC/MCParser/MCAsmLexer.h"
-#include "llvm/MC/MCParser/MCAsmParser.h"
-#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
#include <cstdio>
using namespace llvm;
@@ -42,7 +43,6 @@ class AArch64AsmParser : public MCTargetAsmParser {
private:
StringRef Mnemonic; ///< Instruction mnemonic.
MCSubtargetInfo &STI;
- MCAsmParser &Parser;
// Map of register aliases registers via the .req directive.
StringMap<std::pair<bool, unsigned> > RegisterReqs;
@@ -52,10 +52,7 @@ private:
return static_cast<AArch64TargetStreamer &>(TS);
}
- MCAsmParser &getParser() const { return Parser; }
- MCAsmLexer &getLexer() const { return Parser.getLexer(); }
-
- SMLoc getLoc() const { return Parser.getTok().getLoc(); }
+ SMLoc getLoc() const { return getParser().getTok().getLoc(); }
bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
AArch64CC::CondCode parseCondCodeString(StringRef Cond);
@@ -69,11 +66,13 @@ private:
bool parseOperand(OperandVector &Operands, bool isCondCode,
bool invertCondCode);
- void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
- bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+ void Warning(SMLoc L, const Twine &Msg) { getParser().Warning(L, Msg); }
+ bool Error(SMLoc L, const Twine &Msg) { return getParser().Error(L, Msg); }
bool showMatchError(SMLoc Loc, unsigned ErrCode);
bool parseDirectiveWord(unsigned Size, SMLoc L);
+ bool parseDirectiveInst(SMLoc L);
+
bool parseDirectiveTLSDescCall(SMLoc L);
bool parseDirectiveLOH(StringRef LOH, SMLoc L);
@@ -85,7 +84,7 @@ private:
bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
- unsigned &ErrorInfo,
+ uint64_t &ErrorInfo,
bool MatchingInlineAsm) override;
/// @name Auto-generated Match Functions
/// {
@@ -117,10 +116,11 @@ public:
AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
const MCInstrInfo &MII,
const MCTargetOptions &Options)
- : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+ : MCTargetAsmParser(), STI(_STI) {
MCAsmParserExtension::Initialize(_Parser);
- if (Parser.getStreamer().getTargetStreamer() == nullptr)
- new AArch64TargetStreamer(Parser.getStreamer());
+ MCStreamer &S = getParser().getStreamer();
+ if (S.getTargetStreamer() == nullptr)
+ new AArch64TargetStreamer(S);
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
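
The recurring edit in this file drops the cached MCAsmParser member in favor of fetching getParser() at the top of each method that needs it, leaving the base class as the single owner of the handle. A generic sketch of the pattern with illustrative types (not LLVM's):

struct Lexer { int Pos = 0; };

class ParserBase {
public:
  Lexer &getLexer() { return Lex; }
private:
  Lexer Lex;
};

class TargetParser : public ParserBase {
public:
  // Before the patch: a duplicate reference member cached at
  // construction. After: fetch on entry, so no stale alias exists.
  void parseSomething() {
    Lexer &L = getLexer();
    ++L.Pos; // operate through the freshly fetched handle
  }
};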
@@ -1875,6 +1875,7 @@ unsigned AArch64AsmParser::matchRegisterNameAlias(StringRef Name,
/// Identifier when called, and if it is a register name the token is eaten and
/// the register is added to the operand list.
int AArch64AsmParser::tryParseRegister() {
+ MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
@@ -1899,6 +1900,7 @@ int AArch64AsmParser::tryParseRegister() {
/// tryMatchVectorRegister - Try to parse a vector register name with optional
/// kind specifier. If it is a register specifier, eat the token and return it.
int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
+ MCAsmParser &Parser = getParser();
if (Parser.getTok().isNot(AsmToken::Identifier)) {
TokError("vector register expected");
return -1;
@@ -1931,6 +1933,7 @@ int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
/// tryParseSysCROperand - Try to parse a system instruction CR operand name.
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
if (Parser.getTok().isNot(AsmToken::Identifier)) {
@@ -1960,6 +1963,7 @@ AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
/// tryParsePrefetch - Try to parse a prefetch operand.
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
const AsmToken &Tok = Parser.getTok();
// Either an identifier for named values or a 5-bit immediate.
@@ -2007,6 +2011,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
/// instruction.
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
const MCExpr *Expr;
@@ -2057,6 +2062,7 @@ AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
/// instruction.
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
const MCExpr *Expr;
@@ -2076,6 +2082,7 @@ AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
/// tryParseFPImm - A floating point immediate expression operand.
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
bool Hash = false;
@@ -2138,6 +2145,7 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
if (Parser.getTok().is(AsmToken::Hash))
@@ -2229,6 +2237,7 @@ AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
/// parseCondCode - Parse a Condition Code operand.
bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
bool invertCondCode) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
const AsmToken &Tok = Parser.getTok();
assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
@@ -2254,6 +2263,7 @@ bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
/// them if present.
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
std::string LowerID = Tok.getString().lower();
AArch64_AM::ShiftExtendType ShOp =
@@ -2299,10 +2309,11 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
if (Hash)
Parser.Lex(); // Eat the '#'.
- // Make sure we do actually have a number
- if (!Parser.getTok().is(AsmToken::Integer)) {
- Error(Parser.getTok().getLoc(),
- "expected integer shift amount");
+ // Make sure we actually have a number or a parenthesized expression.
+ SMLoc E = Parser.getTok().getLoc();
+ if (!Parser.getTok().is(AsmToken::Integer) &&
+ !Parser.getTok().is(AsmToken::LParen)) {
+ Error(E, "expected integer shift amount");
return MatchOperand_ParseFail;
}
@@ -2312,11 +2323,11 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
if (!MCE) {
- TokError("expected #imm after shift specifier");
+ Error(E, "expected constant '#imm' after shift specifier");
return MatchOperand_ParseFail;
}
- SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+ E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
Operands.push_back(AArch64Operand::CreateShiftExtend(
ShOp, MCE->getValue(), true, S, E, getContext()));
return MatchOperand_Success;
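
The hunk above relaxes the shift-amount check to accept a parenthesized expression as well as a bare integer literal. A minimal standalone sketch of the two accepted first-token forms (hypothetical helper, not the LLVM parser API):

#include <cassert>
#include <cctype>
#include <string>

// Hypothetical helper (illustration only): a shift amount may now begin
// with either an integer literal or an opening parenthesis.
static bool looksLikeShiftAmount(const std::string &Tok) {
  return !Tok.empty() &&
         (std::isdigit((unsigned char)Tok[0]) || Tok[0] == '(');
}

int main() {
  assert(looksLikeShiftAmount("3"));       // add x0, x1, x2, lsl #3
  assert(looksLikeShiftAmount("(1 + 2)")); // add x0, x1, x2, lsl #(1 + 2)
  assert(!looksLikeShiftAmount("xzr"));    // still "expected integer shift amount"
  return 0;
}
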
@@ -2333,6 +2344,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
Operands.push_back(
AArch64Operand::CreateToken("sys", false, NameLoc, getContext()));
+ MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
StringRef Op = Tok.getString();
SMLoc S = Tok.getLoc();
@@ -2571,6 +2583,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
// Can be either a #imm style literal or an option name
@@ -2624,6 +2637,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier))
@@ -2638,6 +2652,7 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
/// tryParseVectorRegister - Parse a vector register operand.
bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
if (Parser.getTok().isNot(AsmToken::Identifier))
return true;
@@ -2686,6 +2701,7 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
/// parseRegister - Parse a non-vector register operand.
bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
// Try for a vector register.
if (!tryParseVectorRegister(Operands))
@@ -2728,6 +2744,7 @@ bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
}
bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
+ MCAsmParser &Parser = getParser();
bool HasELFModifier = false;
AArch64MCExpr::VariantKind RefKind;
@@ -2806,6 +2823,7 @@ bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
/// parseVectorList - Parse a vector list operand for AdvSIMD instructions.
bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Curly Brace");
SMLoc S = getLoc();
Parser.Lex(); // Eat the left curly brace.
@@ -2904,6 +2922,7 @@ bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
AArch64AsmParser::OperandMatchResultTy
AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
if (!Tok.is(AsmToken::Identifier))
return MatchOperand_NoMatch;
@@ -2949,6 +2968,7 @@ AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
/// operand regardless of the mnemonic.
bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
bool invertCondCode) {
+ MCAsmParser &Parser = getParser();
// Check if the current operand has a custom associated parser, if so, try to
// custom parse the operand, or fallback to the general approach.
OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
@@ -3114,6 +3134,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
StringRef Name, SMLoc NameLoc,
OperandVector &Operands) {
+ MCAsmParser &Parser = getParser();
Name = StringSwitch<StringRef>(Name.lower())
.Case("beq", "b.eq")
.Case("bne", "b.ne")
@@ -3562,12 +3583,12 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
}
}
-static const char *getSubtargetFeatureName(unsigned Val);
+static const char *getSubtargetFeatureName(uint64_t Val);
bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
- unsigned &ErrorInfo,
+ uint64_t &ErrorInfo,
bool MatchingInlineAsm) {
assert(!Operands.empty() && "Unexpected empty operand list!");
AArch64Operand &Op = static_cast<AArch64Operand &>(*Operands[0]);
@@ -3817,7 +3838,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// Special case the error message for the very common case where only
// a single subtarget feature is missing (e.g., NEON).
std::string Msg = "instruction requires:";
- unsigned Mask = 1;
+ uint64_t Mask = 1;
for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
if (ErrorInfo & Mask) {
Msg += " ";
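
The widening to uint64_t matters for this loop: with a 32-bit probe mask, feature bits 32 and above could never be reported. Standalone sketch of the walk:

#include <cstdint>
#include <cstdio>

// Illustration only: walk a 64-bit missing-feature mask the way the loop
// above does. A 32-bit Mask would overflow to zero and miss bits 32..62.
static void listSetBits(uint64_t ErrorInfo) {
  uint64_t Mask = 1;
  for (unsigned I = 0; I < sizeof(ErrorInfo) * 8 - 1; ++I) {
    if (ErrorInfo & Mask)
      printf(" bit %u", I);
    Mask <<= 1;
  }
  printf("\n");
}

int main() {
  listSetBits((1ULL << 40) | 1); // prints " bit 0 bit 40"
  return 0;
}
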
@@ -3831,7 +3852,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return showMatchError(IDLoc, MatchResult);
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
- if (ErrorInfo != ~0U) {
+ if (ErrorInfo != ~0ULL) {
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
@@ -3906,11 +3927,15 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
llvm_unreachable("Implement any new match types added!");
- return true;
}
/// ParseDirective parses the AArch64-specific directives
bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
+ const MCObjectFileInfo::Environment Format =
+ getContext().getObjectFileInfo()->getObjectFileType();
+ bool IsMachO = Format == MCObjectFileInfo::IsMachO;
+ bool IsCOFF = Format == MCObjectFileInfo::IsCOFF;
+
StringRef IDVal = DirectiveID.getIdentifier();
SMLoc Loc = DirectiveID.getLoc();
if (IDVal == ".hword")
@@ -3926,12 +3951,18 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".unreq")
return parseDirectiveUnreq(DirectiveID.getLoc());
+ if (!IsMachO && !IsCOFF) {
+ if (IDVal == ".inst")
+ return parseDirectiveInst(Loc);
+ }
+
return parseDirectiveLOH(IDVal, Loc);
}
/// parseDirectiveWord
/// ::= .word [ expression (, expression)* ]
bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
+ MCAsmParser &Parser = getParser();
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
const MCExpr *Value;
@@ -3954,6 +3985,47 @@ bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
return false;
}
+/// parseDirectiveInst
+/// ::= .inst opcode [, ...]
+bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) {
+ MCAsmParser &Parser = getParser();
+ if (getLexer().is(AsmToken::EndOfStatement)) {
+ Parser.eatToEndOfStatement();
+ Error(Loc, "expected expression following directive");
+ return false;
+ }
+
+ for (;;) {
+ const MCExpr *Expr;
+
+ if (getParser().parseExpression(Expr)) {
+ Error(Loc, "expected expression");
+ return false;
+ }
+
+ const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr);
+ if (!Value) {
+ Error(Loc, "expected constant expression");
+ return false;
+ }
+
+ getTargetStreamer().emitInst(Value->getValue());
+
+ if (getLexer().is(AsmToken::EndOfStatement))
+ break;
+
+ if (getLexer().isNot(AsmToken::Comma)) {
+ Error(Loc, "unexpected token in directive");
+ return false;
+ }
+
+ Parser.Lex(); // Eat comma.
+ }
+
+ Parser.Lex();
+ return false;
+}
+
// parseDirectiveTLSDescCall:
// ::= .tlsdesccall symbol
bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
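
The new directive accepts one or more comma-separated constant expressions, e.g. ".inst 0xd503201f, 0xd65f03c0" (nop, ret). A rough standalone model of the operand loop, assuming plain numeric literals rather than full MC expressions:

#include <cstdint>
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// Rough model of the .inst operand loop: each item must fold to a
// constant, and an empty list is an error ("expected expression").
static bool parseInstList(const std::string &Line,
                          std::vector<uint32_t> &Out) {
  std::stringstream SS(Line);
  std::string Item;
  while (std::getline(SS, Item, ',')) {
    long long V;
    if (sscanf(Item.c_str(), " %lli", &V) != 1)
      return false; // "expected constant expression"
    Out.push_back((uint32_t)V);
  }
  return !Out.empty();
}

int main() {
  std::vector<uint32_t> Enc;
  if (parseInstList("0xd503201f, 0xd65f03c0", Enc)) // nop, ret
    for (uint32_t I : Enc)
      printf(".inst 0x%08x\n", I);
  return 0;
}
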
@@ -3985,10 +4057,9 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
// We successfully parsed a numeric value for the identifier.
// Check that it is valid.
int64_t Id = getParser().getTok().getIntVal();
- Kind = (MCLOHType)Id;
- // Check that Id does not overflow MCLOHType.
- if (!isValidMCLOHType(Kind) || Id != Kind)
+ if (Id <= -1U && !isValidMCLOHType(Id))
return TokError("invalid numeric identifier in directive");
+ Kind = (MCLOHType)Id;
} else {
StringRef Name = getTok().getIdentifier();
// We successfully parsed an identifier.
@@ -4036,6 +4107,7 @@ bool AArch64AsmParser::parseDirectiveLtorg(SMLoc L) {
/// parseDirectiveReq
/// ::= name .req registername
bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
+ MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat the '.req' token.
SMLoc SRegLoc = getLoc();
unsigned RegNum = tryParseRegister();
@@ -4067,7 +4139,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
Parser.Lex(); // Consume the EndOfStatement
auto pair = std::make_pair(IsVector, RegNum);
- if (RegisterReqs.GetOrCreateValue(Name, pair).getValue() != pair)
+ if (!RegisterReqs.insert(std::make_pair(Name, pair)).second)
Warning(L, "ignoring redefinition of register alias '" + Name + "'");
return true;
@@ -4076,6 +4148,7 @@ bool AArch64AsmParser::parseDirectiveReq(StringRef Name, SMLoc L) {
/// parseDirectiveUnreq
/// ::= .unreq registername
bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
+ MCAsmParser &Parser = getParser();
if (Parser.getTok().isNot(AsmToken::Identifier)) {
Error(Parser.getTok().getLoc(), "unexpected input in .unreq directive.");
Parser.eatToEndOfStatement();
@@ -4140,9 +4213,7 @@ AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
extern "C" void LLVMInitializeAArch64AsmParser() {
RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64leTarget);
RegisterMCAsmParser<AArch64AsmParser> Y(TheAArch64beTarget);
-
- RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64leTarget);
- RegisterMCAsmParser<AArch64AsmParser> W(TheARM64beTarget);
+ RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64Target);
}
#define GET_REGISTER_MATCHER
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index 789d549..f26327f 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -2,7 +2,7 @@ set(LLVM_TARGET_DEFINITIONS AArch64.td)
tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info)
tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter -mc-emitter)
+tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter)
tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering)
tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer)
tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
@@ -15,6 +15,7 @@ tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
add_public_tablegen_target(AArch64CommonTableGen)
add_llvm_target(AArch64CodeGen
+ AArch64A57FPLoadBalancing.cpp
AArch64AddressTypePromotion.cpp
AArch64AdvSIMDScalarPass.cpp
AArch64AsmPrinter.cpp
@@ -25,13 +26,16 @@ add_llvm_target(AArch64CodeGen
AArch64DeadRegisterDefinitionsPass.cpp
AArch64ExpandPseudoInsts.cpp
AArch64FastISel.cpp
+ AArch64A53Fix835769.cpp
AArch64FrameLowering.cpp
+ AArch64ConditionOptimizer.cpp
AArch64ISelDAGToDAG.cpp
AArch64ISelLowering.cpp
AArch64InstrInfo.cpp
AArch64LoadStoreOptimizer.cpp
AArch64MCInstLower.cpp
AArch64PromoteConstant.cpp
+ AArch64PBQPRegAlloc.cpp
AArch64RegisterInfo.cpp
AArch64SelectionDAGInfo.cpp
AArch64StorePairSuppress.cpp
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 6de27d6..878e29c 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -15,12 +15,11 @@
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
-#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/MemoryObject.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -200,26 +199,24 @@ static MCDisassembler *createAArch64Disassembler(const Target &T,
}
DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
- const MemoryObject &Region,
- uint64_t Address,
- raw_ostream &os,
- raw_ostream &cs) const {
- CommentStream = &cs;
-
- uint8_t bytes[4];
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &OS,
+ raw_ostream &CS) const {
+ CommentStream = &CS;
Size = 0;
// We want to read exactly 4 bytes of data.
- if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1)
+ if (Bytes.size() < 4)
return Fail;
Size = 4;
// Encoded as a little-endian 32-bit word in the stream.
- uint32_t insn =
- (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0);
+ uint32_t Insn =
+ (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
// Calling the auto-generated decoder function.
- return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI);
+ return decodeInstruction(DecoderTable32, MI, Insn, Address, this, STI);
}
static MCSymbolizer *
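
The MemoryObject-to-ArrayRef migration leaves the byte arithmetic unchanged; standalone illustration of the little-endian word reassembly:

#include <cstdint>
#include <cstdio>

// Standalone illustration (not the LLVM API): rebuild a 32-bit A64
// instruction word from four little-endian stream bytes.
static uint32_t decodeWord(const uint8_t Bytes[4]) {
  return (uint32_t(Bytes[3]) << 24) | (uint32_t(Bytes[2]) << 16) |
         (uint32_t(Bytes[1]) << 8) | uint32_t(Bytes[0]);
}

int main() {
  const uint8_t Nop[4] = {0x1f, 0x20, 0x03, 0xd5};
  printf("%08x\n", decodeWord(Nop)); // d503201f (nop)
  return 0;
}
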
@@ -243,13 +240,9 @@ extern "C" void LLVMInitializeAArch64Disassembler() {
TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget,
createAArch64ExternalSymbolizer);
- TargetRegistry::RegisterMCDisassembler(TheARM64leTarget,
- createAArch64Disassembler);
- TargetRegistry::RegisterMCDisassembler(TheARM64beTarget,
+ TargetRegistry::RegisterMCDisassembler(TheARM64Target,
createAArch64Disassembler);
- TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget,
- createAArch64ExternalSymbolizer);
- TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget,
+ TargetRegistry::RegisterMCSymbolizer(TheARM64Target,
createAArch64ExternalSymbolizer);
}
@@ -592,7 +585,7 @@ static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
uint64_t Addr,
const void *Decoder) {
// scale{5} is asserted as 1 in tblgen.
- Imm |= 0x20;
+ Imm |= 0x20;
Inst.addOperand(MCOperand::CreateImm(64 - Imm));
return Success;
}
@@ -614,7 +607,7 @@ static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
if (ImmVal & (1 << (19 - 1)))
ImmVal |= ~((1LL << 19) - 1);
- if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr,
+ if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal * 4, Addr,
Inst.getOpcode() != AArch64::LDRXl, 0, 4))
Inst.addOperand(MCOperand::CreateImm(ImmVal));
return Success;
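
Replacing ImmVal << 2 with ImmVal * 4 sidesteps the undefined behavior of left-shifting a negative signed value; the rest is a plain 19-bit sign extension. Standalone sketch:

#include <cassert>
#include <cstdint>

// Standalone sketch of the 19-bit PC-relative decode above; the
// multiply keeps negative offsets well-defined in C++.
static int64_t decodeLabel19(uint32_t Imm) {
  int64_t ImmVal = Imm & ((1u << 19) - 1);
  if (ImmVal & (1 << (19 - 1)))
    ImmVal |= ~((1LL << 19) - 1);
  return ImmVal * 4; // word offset -> byte offset
}

int main() {
  assert(decodeLabel19(1) == 4);
  assert(decodeLabel19(0x7ffff) == -4); // all-ones encodes -1 word
  return 0;
}
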
@@ -630,35 +623,19 @@ static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
uint64_t Address,
const void *Decoder) {
- const AArch64Disassembler *Dis =
- static_cast<const AArch64Disassembler *>(Decoder);
- const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
-
- Imm |= 0x8000;
Inst.addOperand(MCOperand::CreateImm(Imm));
- bool ValidNamed;
- (void)AArch64SysReg::MRSMapper(STI.getFeatureBits())
- .toString(Imm, ValidNamed);
-
- return ValidNamed ? Success : Fail;
+ // Every system register in the encoding space is valid with the syntax
+ // S<op0>_<op1>_<Cn>_<Cm>_<op2>, so decoding system registers always succeeds.
+ return Success;
}
static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
uint64_t Address,
const void *Decoder) {
- const AArch64Disassembler *Dis =
- static_cast<const AArch64Disassembler *>(Decoder);
- const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
-
- Imm |= 0x8000;
Inst.addOperand(MCOperand::CreateImm(Imm));
- bool ValidNamed;
- (void)AArch64SysReg::MSRMapper(STI.getFeatureBits())
- .toString(Imm, ValidNamed);
-
- return ValidNamed ? Success : Fail;
+ return Success;
}
static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
@@ -1510,7 +1487,7 @@ static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
if (imm & (1 << (26 - 1)))
imm |= ~((1LL << 26) - 1);
- if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4))
+ if (!Dis->tryAddingSymbolicOperand(Inst, imm * 4, Addr, true, 0, 4))
Inst.addOperand(MCOperand::CreateImm(imm));
return Success;
@@ -1530,7 +1507,7 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
bool ValidNamed;
(void)AArch64PState::PStateMapper().toString(pstate_field, ValidNamed);
-
+
return ValidNamed ? Success : Fail;
}
@@ -1552,7 +1529,7 @@ static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
else
DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
Inst.addOperand(MCOperand::CreateImm(bit));
- if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4))
+ if (!Dis->tryAddingSymbolicOperand(Inst, dst * 4, Addr, true, 0, 4))
Inst.addOperand(MCOperand::CreateImm(dst));
return Success;
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
index 68d4867..7fb57ad 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
@@ -10,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64DISASSEMBLER_H
-#define AArch64DISASSEMBLER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
+#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H
#include "llvm/MC/MCDisassembler.h"
@@ -28,11 +28,10 @@ public:
~AArch64Disassembler() {}
- /// getInstruction - See MCDisassembler.
MCDisassembler::DecodeStatus
- getInstruction(MCInst &instr, uint64_t &size, const MemoryObject &region,
- uint64_t address, raw_ostream &vStream,
- raw_ostream &cStream) const override;
+ getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes,
+ uint64_t Address, raw_ostream &VStream,
+ raw_ostream &CStream) const override;
};
} // namespace llvm
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
index 171d31c..12b8450 100644
--- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64EXTERNALSYMBOLIZER_H
-#define AArch64EXTERNALSYMBOLIZER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
+#define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H
#include "llvm/MC/MCExternalSymbolizer.h"
diff --git a/lib/Target/AArch64/Disassembler/LLVMBuild.txt b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
index a4224f4..62827e8 100644
--- a/lib/Target/AArch64/Disassembler/LLVMBuild.txt
+++ b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = AArch64Disassembler
parent = AArch64
-required_libraries = AArch64Info AArch64Utils MC Support
+required_libraries = AArch64Info AArch64Utils MC MCDisassembler Support
add_to_library_groups = AArch64
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index 8a21f06..46a1d79 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -16,8 +16,8 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
@@ -1223,7 +1223,7 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
// If the label has already been resolved to an immediate offset (say, when
// we're running the disassembler), just print the immediate.
if (Op.isImm()) {
- O << "#" << (Op.getImm() << 2);
+ O << "#" << (Op.getImm() * 4);
return;
}
@@ -1247,7 +1247,7 @@ void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
// If the label has already been resolved to an immediate offset (say, when
// we're running the disassembler), just print the immediate.
if (Op.isImm()) {
- O << "#" << (Op.getImm() << 12);
+ O << "#" << (Op.getImm() * (1 << 12));
return;
}
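
The same shift-to-multiply change appears here: the ADRP immediate counts 4 KiB pages, and Imm * (1 << 12) stays well-defined for negative page offsets. Minimal sketch:

#include <cstdint>
#include <cstdio>

// Minimal sketch of the ADRP immediate scaling printed above.
static int64_t adrpByteOffset(int64_t PageImm) {
  return PageImm * (1 << 12); // pages -> bytes, defined for PageImm < 0
}

int main() {
  printf("%lld\n", (long long)adrpByteOffset(1));  // 4096
  printf("%lld\n", (long long)adrpByteOffset(-2)); // -8192
  return 0;
}
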
@@ -1276,24 +1276,20 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned Val = MI->getOperand(OpNo).getImm();
- bool Valid;
auto Mapper = AArch64SysReg::MRSMapper(getAvailableFeatures());
- std::string Name = Mapper.toString(Val, Valid);
+ std::string Name = Mapper.toString(Val);
- if (Valid)
- O << StringRef(Name).upper();
+ O << StringRef(Name).upper();
}
void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned Val = MI->getOperand(OpNo).getImm();
- bool Valid;
auto Mapper = AArch64SysReg::MSRMapper(getAvailableFeatures());
- std::string Name = Mapper.toString(Val, Valid);
+ std::string Name = Mapper.toString(Val);
- if (Valid)
- O << StringRef(Name).upper();
+ O << StringRef(Name).upper();
}
void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,
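
With the validity check dropped, every encoding still prints: registers without a name fall back to the generic S<op0>_<op1>_<Cn>_<Cm>_<op2> spelling. A sketch of that fallback, assuming the usual 2:3:4:4:3 packing of op0:op1:CRn:CRm:op2 in the 16-bit MRS/MSR immediate:

#include <cstdint>
#include <cstdio>

// Sketch only; the field layout is an assumption stated above.
static void printGenericSysReg(uint16_t Bits) {
  unsigned Op0 = (Bits >> 14) & 0x3;
  unsigned Op1 = (Bits >> 11) & 0x7;
  unsigned CRn = (Bits >> 7) & 0xf;
  unsigned CRm = (Bits >> 3) & 0xf;
  unsigned Op2 = Bits & 0x7;
  printf("S%u_%u_C%u_C%u_%u\n", Op0, Op1, CRn, CRm, Op2);
}

int main() {
  printGenericSysReg(0xc202); // S3_0_C4_C0_2 under this packing
  return 0;
}
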
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index fe7666e..5f51621 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64INSTPRINTER_H
-#define AArch64INSTPRINTER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
+#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/StringRef.h"
@@ -127,8 +127,9 @@ public:
void printInstruction(const MCInst *MI, raw_ostream &O) override;
bool printAliasInstr(const MCInst *MI, raw_ostream &O) override;
- virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx, raw_ostream &O);
+ void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+ unsigned PrintMethodIdx,
+ raw_ostream &O) override;
StringRef getRegName(unsigned RegNo) const override {
return getRegisterName(RegNo);
}
diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt
index 642c183..573fa10 100644
--- a/lib/Target/AArch64/LLVMBuild.txt
+++ b/lib/Target/AArch64/LLVMBuild.txt
@@ -31,5 +31,5 @@ has_jit = 1
type = Library
name = AArch64CodeGen
parent = AArch64
-required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target
+required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target
add_to_library_groups = AArch64
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 8b1e44e2..4db9dea 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H
-#define LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ADDRESSINGMODES_H
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -51,7 +51,7 @@ enum ShiftExtendType {
/// getShiftExtendName - Get the string encoding for the shift type.
static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) {
switch (ST) {
- default: assert(false && "unhandled shift type!");
+ default: llvm_unreachable("unhandled shift type!");
case AArch64_AM::LSL: return "lsl";
case AArch64_AM::LSR: return "lsr";
case AArch64_AM::ASR: return "asr";
@@ -210,67 +210,63 @@ static inline uint64_t ror(uint64_t elt, unsigned size) {
/// as the immediate operand of a logical instruction for the given register
/// size. If so, return true with "encoding" set to the encoded value in
/// the form N:immr:imms.
-static inline bool processLogicalImmediate(uint64_t imm, unsigned regSize,
- uint64_t &encoding) {
- if (imm == 0ULL || imm == ~0ULL ||
- (regSize != 64 && (imm >> regSize != 0 || imm == ~0U)))
+static inline bool processLogicalImmediate(uint64_t Imm, unsigned RegSize,
+ uint64_t &Encoding) {
+ if (Imm == 0ULL || Imm == ~0ULL ||
+ (RegSize != 64 && (Imm >> RegSize != 0 || Imm == ~0U)))
return false;
- unsigned size = 2;
- uint64_t eltVal = imm;
-
// First, determine the element size.
- while (size < regSize) {
- unsigned numElts = regSize / size;
- unsigned mask = (1ULL << size) - 1;
- uint64_t lowestEltVal = imm & mask;
-
- bool allMatched = true;
- for (unsigned i = 1; i < numElts; ++i) {
- uint64_t currEltVal = (imm >> (i*size)) & mask;
- if (currEltVal != lowestEltVal) {
- allMatched = false;
- break;
- }
- }
+ unsigned Size = RegSize;
+
+ do {
+ Size /= 2;
+ uint64_t Mask = (1ULL << Size) - 1;
- if (allMatched) {
- eltVal = lowestEltVal;
+ if ((Imm & Mask) != ((Imm >> Size) & Mask)) {
+ Size *= 2;
break;
}
-
- size *= 2;
- }
+ } while (Size > 2);
// Second, determine the rotation to make the element be: 0^m 1^n.
- for (unsigned i = 0; i < size; ++i) {
- eltVal = ror(eltVal, size);
- uint32_t clz = countLeadingZeros(eltVal) - (64 - size);
- uint32_t cto = CountTrailingOnes_64(eltVal);
-
- if (clz + cto == size) {
- // Encode in immr the number of RORs it would take to get *from* this
- // element value to our target value, where i+1 is the number of RORs
- // to go the opposite direction.
- unsigned immr = size - (i + 1);
-
- // If size has a 1 in the n'th bit, create a value that has zeroes in
- // bits [0, n] and ones above that.
- uint64_t nimms = ~(size-1) << 1;
-
- // Or the CTO value into the low bits, which must be below the Nth bit
- // bit mentioned above.
- nimms |= (cto-1);
-
- // Extract the seventh bit and toggle it to create the N field.
- unsigned N = ((nimms >> 6) & 1) ^ 1;
-
- encoding = (N << 12) | (immr << 6) | (nimms & 0x3f);
- return true;
- }
+ uint32_t CTO, I;
+ uint64_t Mask = ((uint64_t)-1LL) >> (64 - Size);
+ Imm &= Mask;
+
+ if (isShiftedMask_64(Imm)) {
+ I = countTrailingZeros(Imm);
+ assert(I < 64 && "undefined behavior");
+ CTO = CountTrailingOnes_64(Imm >> I);
+ } else {
+ Imm |= ~Mask;
+ if (!isShiftedMask_64(~Imm))
+ return false;
+
+ unsigned CLO = CountLeadingOnes_64(Imm);
+ I = 64 - CLO;
+ CTO = CLO + CountTrailingOnes_64(Imm) - (64 - Size);
}
- return false;
+ // Encode in Immr the number of RORs it would take to get *from* 0^m 1^n
+ // to our target value, where I is the number of RORs to go the opposite
+ // direction.
+ assert(Size > I && "I should be smaller than element size");
+ unsigned Immr = (Size - I) & (Size - 1);
+
+ // If size has a 1 in the n'th bit, create a value that has zeroes in
+ // bits [0, n] and ones above that.
+ uint64_t NImms = ~(Size-1) << 1;
+
+ // Or the CTO value into the low bits, which must be below the Nth bit
+ // mentioned above.
+ NImms |= (CTO-1);
+
+ // Extract the seventh bit and toggle it to create the N field.
+ unsigned N = ((NImms >> 6) & 1) ^ 1;
+
+ Encoding = (N << 12) | (Immr << 6) | (NImms & 0x3f);
+ return true;
}
/// isLogicalImmediate - Return true if the immediate is valid for a logical
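
The rewritten encoder relies on the defining property of A64 logical immediates: a 2-, 4-, 8-, 16-, 32- or 64-bit element replicated across the register, rotated into the form 0^m 1^n. A standalone check of the replication half of that property (illustration only, not the encoder itself):

#include <cstdint>
#include <cstdio>

// Illustration: a valid logical immediate is a Size-bit pattern
// replicated across all 64 bits.
static bool replicates(uint64_t Imm, unsigned Size) {
  uint64_t Mask = (Size == 64) ? ~0ULL : ((1ULL << Size) - 1);
  uint64_t Elt = Imm & Mask;
  for (unsigned I = Size; I < 64; I += Size)
    if (((Imm >> I) & Mask) != Elt)
      return false;
  return true;
}

int main() {
  printf("%d\n", replicates(0x5555555555555555ULL, 2));  // 1: "01" replicated
  printf("%d\n", replicates(0x1234123412341234ULL, 16)); // 1
  printf("%d\n", replicates(0x123456789abcdef0ULL, 32)); // 0
  return 0;
}
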
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index a917616..423da65 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -13,10 +13,11 @@
#include "llvm/ADT/Triple.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachO.h"
using namespace llvm;
@@ -131,7 +132,7 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
int64_t SignedValue = static_cast<int64_t>(Value);
switch (Kind) {
default:
- assert(false && "Unknown fixup kind!");
+ llvm_unreachable("Unknown fixup kind!");
case AArch64::fixup_aarch64_pcrel_adr_imm21:
if (SignedValue > 2097151 || SignedValue < -2097152)
report_fatal_error("fixup value out of range");
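
For reference, the bound enforced for fixup_aarch64_pcrel_adr_imm21 is the ADR reach of +/-1 MiB, i.e. a signed 21-bit byte offset. Trivial standalone check:

#include <cassert>
#include <cstdint>

// Sketch of the adr_imm21 range check: offsets in [-2097152, 2097151].
static bool fitsAdrImm21(int64_t V) {
  return V >= -2097152 && V <= 2097151;
}

int main() {
  assert(fitsAdrImm21(2097151));
  assert(!fitsAdrImm21(2097152)); // would report_fatal_error
  return 0;
}
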
@@ -238,7 +239,7 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
MCInst &Res) const {
- assert(false && "AArch64AsmBackend::relaxInstruction() unimplemented");
+ llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented");
}
bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
@@ -316,42 +317,6 @@ public:
MachO::CPU_SUBTYPE_ARM64_ALL);
}
- bool doesSectionRequireSymbols(const MCSection &Section) const override {
- // Any section for which the linker breaks things into atoms needs to
- // preserve symbols, including assembler local symbols, to identify
- // those atoms. These sections are:
- // Sections of type:
- //
- // S_CSTRING_LITERALS (e.g. __cstring)
- // S_LITERAL_POINTERS (e.g. objc selector pointers)
- // S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS
- //
- // Sections named:
- //
- // __TEXT,__eh_frame
- // __TEXT,__ustring
- // __DATA,__cfstring
- // __DATA,__objc_classrefs
- // __DATA,__objc_catlist
- //
- // FIXME: It would be better if the compiler used actual linker local
- // symbols for each of these sections rather than preserving what
- // are ostensibly assembler local symbols.
- const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section);
- return (SMO.getType() == MachO::S_CSTRING_LITERALS ||
- SMO.getType() == MachO::S_4BYTE_LITERALS ||
- SMO.getType() == MachO::S_8BYTE_LITERALS ||
- SMO.getType() == MachO::S_16BYTE_LITERALS ||
- SMO.getType() == MachO::S_LITERAL_POINTERS ||
- (SMO.getSegmentName() == "__TEXT" &&
- (SMO.getSectionName() == "__eh_frame" ||
- SMO.getSectionName() == "__ustring")) ||
- (SMO.getSegmentName() == "__DATA" &&
- (SMO.getSectionName() == "__cfstring" ||
- SMO.getSectionName() == "__objc_classrefs" ||
- SMO.getSectionName() == "__objc_catlist")));
- }
-
/// \brief Generate the compact unwind encoding from the CFI directives.
uint32_t generateCompactUnwindEncoding(
ArrayRef<MCCFIInstruction> Instrs) const override {
@@ -534,8 +499,8 @@ void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
// store fixups in .eh_frame section in big endian order
if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) {
const MCSection *Sec = Fixup.getValue()->FindAssociatedSection();
- const MCSectionELF *SecELF = static_cast<const MCSectionELF *>(Sec);
- if (SecELF->getSectionName() == ".eh_frame")
+ const MCSectionELF *SecELF = dyn_cast_or_null<const MCSectionELF>(Sec);
+ if (SecELF && SecELF->getSectionName() == ".eh_frame")
Value = ByteSwap_32(unsigned(Value));
}
AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel);
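
The dyn_cast_or_null guard avoids a crash when a FK_Data_4 fixup has no associated section; the .eh_frame swap itself is a plain 32-bit byte reversal, sketched standalone here:

#include <cstdint>
#include <cstdio>

// Standalone equivalent of ByteSwap_32 as used above for big-endian
// .eh_frame fixups.
static uint32_t byteSwap32(uint32_t V) {
  return (V << 24) | ((V & 0xff00) << 8) | ((V >> 8) & 0xff00) | (V >> 24);
}

int main() {
  printf("%08x\n", byteSwap32(0x12345678)); // 78563412
  return 0;
}
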
@@ -551,7 +516,8 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
return new DarwinAArch64AsmBackend(T, MRI);
assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
- return new ELFAArch64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true);
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+ return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true);
}
MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T,
@@ -561,6 +527,7 @@ MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T,
assert(TheTriple.isOSBinFormatELF() &&
"Big endian is only supported for ELF targets!");
- return new ELFAArch64AsmBackend(T, TheTriple.getOS(),
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
+ return new ELFAArch64AsmBackend(T, OSABI,
/*IsLittleEndian=*/false);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index e05191e..5ea49c3 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -78,7 +78,7 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC)
return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC)
- return ELF::R_AARCH64_TLSDESC_ADR_PAGE;
+ return ELF::R_AARCH64_TLSDESC_ADR_PAGE21;
llvm_unreachable("invalid symbol kind for ADRP relocation");
case AArch64::fixup_aarch64_pcrel_branch26:
return ELF::R_AARCH64_JUMP26;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index a79406d..8dc6c30 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -15,8 +15,10 @@
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
@@ -34,12 +36,42 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
namespace {
+class AArch64ELFStreamer;
+
+class AArch64TargetAsmStreamer : public AArch64TargetStreamer {
+ formatted_raw_ostream &OS;
+
+ void emitInst(uint32_t Inst) override;
+
+public:
+ AArch64TargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
+};
+
+AArch64TargetAsmStreamer::AArch64TargetAsmStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS)
+ : AArch64TargetStreamer(S), OS(OS) {}
+
+void AArch64TargetAsmStreamer::emitInst(uint32_t Inst) {
+ OS << "\t.inst\t0x" << utohexstr(Inst) << "\n";
+}
+
+class AArch64TargetELFStreamer : public AArch64TargetStreamer {
+private:
+ AArch64ELFStreamer &getStreamer();
+
+ void emitInst(uint32_t Inst) override;
+
+public:
+ AArch64TargetELFStreamer(MCStreamer &S) : AArch64TargetStreamer(S) {}
+};
+
/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
/// the appropriate points in the object files. These symbols are defined in the
/// AArch64 ELF ABI:
@@ -55,6 +87,8 @@ namespace {
/// by MachO. Beware!
class AArch64ELFStreamer : public MCELFStreamer {
public:
+ friend class AArch64TargetELFStreamer;
+
AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
MCCodeEmitter *Emitter)
: MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0),
@@ -82,6 +116,18 @@ public:
MCELFStreamer::EmitInstruction(Inst, STI);
}
+ void emitInst(uint32_t Inst) {
+ char Buffer[4];
+ const bool LittleEndian = getContext().getAsmInfo()->isLittleEndian();
+
+ EmitA64MappingSymbol();
+ for (unsigned II = 0; II != 4; ++II) {
+ const unsigned I = LittleEndian ? (4 - II - 1) : II;
+ Buffer[4 - II - 1] = uint8_t(Inst >> I * CHAR_BIT);
+ }
+ MCELFStreamer::EmitBytes(StringRef(Buffer, 4));
+ }
+
/// This is one of the functions used to emit data into an ELF section, so the
/// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
/// if necessary.
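
A standalone model of the emitInst() byte loop above, under the assumption CHAR_BIT == 8: the buffer is filled back to front, so little-endian output puts the low byte of the instruction first in the stream:

#include <climits>
#include <cstdint>
#include <cstdio>

// Same indexing as emitInst() above, lifted out for illustration.
static void encode(uint32_t Inst, bool LittleEndian, uint8_t Buffer[4]) {
  for (unsigned II = 0; II != 4; ++II) {
    const unsigned I = LittleEndian ? (4 - II - 1) : II;
    Buffer[4 - II - 1] = uint8_t(Inst >> (I * CHAR_BIT));
  }
}

int main() {
  uint8_t LE[4], BE[4];
  encode(0xd503201f, true, LE);  // 1f 20 03 d5
  encode(0xd503201f, false, BE); // d5 03 20 1f
  printf("%02x %02x %02x %02x\n", LE[0], LE[1], LE[2], LE[3]);
  return 0;
}
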
@@ -131,7 +177,9 @@ private:
MCELF::SetType(SD, ELF::STT_NOTYPE);
MCELF::SetBinding(SD, ELF::STB_LOCAL);
SD.setExternal(false);
- Symbol->setSection(*getCurrentSection().first);
+ auto Sec = getCurrentSection().first;
+ assert(Sec && "need a section");
+ Symbol->setSection(*Sec);
const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
Symbol->setVariableValue(Value);
@@ -144,17 +192,35 @@ private:
/// @}
};
+} // end anonymous namespace
+
+AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
+ return static_cast<AArch64ELFStreamer &>(Streamer);
+}
+
+void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
+ getStreamer().emitInst(Inst);
}
namespace llvm {
+MCStreamer *
+createAArch64MCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+ bool isVerboseAsm, bool useDwarfDirectory,
+ MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+ MCAsmBackend *TAB, bool ShowInst) {
+ MCStreamer *S = llvm::createAsmStreamer(
+ Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
+ new AArch64TargetAsmStreamer(*S, OS);
+ return S;
+}
+
MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter,
- bool RelaxAll, bool NoExecStack) {
+ bool RelaxAll) {
AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter);
+ new AArch64TargetELFStreamer(*S);
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
- if (NoExecStack)
- S->getAssembler().setNoExecStack(true);
return S;
}
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
index bc6973b..71b05cc 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AARCH64_ELF_STREAMER_H
-#define LLVM_AARCH64_ELF_STREAMER_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64ELFSTREAMER_H
#include "llvm/MC/MCELFStreamer.h"
@@ -20,7 +20,7 @@ namespace llvm {
MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter,
- bool RelaxAll, bool NoExecStack);
+ bool RelaxAll);
}
-#endif // AArch64_ELF_STREAMER_H
+#endif
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
index bf405fb..0f5b765 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -7,8 +7,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AArch64FIXUPKINDS_H
-#define LLVM_AArch64FIXUPKINDS_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64FIXUPKINDS_H
#include "llvm/MC/MCFixup.h"
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 1763b40..f048474 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -13,8 +13,8 @@
#include "AArch64MCAsmInfo.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -37,6 +37,7 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
PrivateGlobalPrefix = "L";
+ PrivateLabelPrefix = "L";
SeparatorString = "%%";
CommentString = ";";
PointerSize = CalleeSaveStackSlotSize = 8;
@@ -66,7 +67,7 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) {
Triple T(TT);
- if (T.getArch() == Triple::arm64_be || T.getArch() == Triple::aarch64_be)
+ if (T.getArch() == Triple::aarch64_be)
IsLittleEndian = false;
// We prefer NEON instructions to be printed in the short form.
@@ -79,6 +80,7 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) {
CommentString = "//";
PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
Code32Directive = ".code\t32";
Data16bitsDirective = "\t.hword\t";
@@ -89,7 +91,6 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) {
WeakRefDirective = "\t.weak\t";
- HasLEB128 = true;
SupportsDebugInformation = true;
// Exceptions handling
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 42a031d..5d03c21 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64TARGETASMINFO_H
-#define AArch64TARGETASMINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
#include "llvm/MC/MCAsmInfoDarwin.h"
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index f051357..4756a192 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -15,13 +15,13 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/ADT/Statistic.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -437,8 +437,7 @@ AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
return 3;
}
- assert(false && "Invalid value for vector shift amount!");
- return 0;
+ llvm_unreachable("Invalid value for vector shift amount!");
}
uint32_t
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index 42a6787..e396df8 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -90,8 +90,9 @@ const MCSection *AArch64MCExpr::FindAssociatedSection() const {
}
bool AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
- const MCAsmLayout *Layout) const {
- if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout))
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout, Fixup))
return false;
Res =
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index 5422f9d..db48ac9 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_AArch64MCEXPR_H
-#define LLVM_AArch64MCEXPR_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCEXPR_H
#include "llvm/MC/MCExpr.h"
#include "llvm/Support/ErrorHandling.h"
@@ -152,7 +152,8 @@ public:
const MCSection *FindAssociatedSection() const override;
bool EvaluateAsRelocatableImpl(MCValue &Res,
- const MCAsmLayout *Layout) const override;
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index ae698c5..0f7a6b8 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -126,15 +126,14 @@ static MCInstPrinter *createAArch64MCInstPrinter(const Target &T,
static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
MCContext &Ctx, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *Emitter,
- const MCSubtargetInfo &STI, bool RelaxAll,
- bool NoExecStack) {
+ const MCSubtargetInfo &STI, bool RelaxAll) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin())
return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
/*LabelSections*/ true);
- return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
+ return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
}
// Force static initialization.
@@ -142,17 +141,14 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
// Register the MC asm info.
RegisterMCAsmInfoFn X(TheAArch64leTarget, createAArch64MCAsmInfo);
RegisterMCAsmInfoFn Y(TheAArch64beTarget, createAArch64MCAsmInfo);
- RegisterMCAsmInfoFn Z(TheARM64leTarget, createAArch64MCAsmInfo);
- RegisterMCAsmInfoFn W(TheARM64beTarget, createAArch64MCAsmInfo);
+ RegisterMCAsmInfoFn Z(TheARM64Target, createAArch64MCAsmInfo);
// Register the MC codegen info.
TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget,
createAArch64MCCodeGenInfo);
TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget,
createAArch64MCCodeGenInfo);
- TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget,
- createAArch64MCCodeGenInfo);
- TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget,
+ TargetRegistry::RegisterMCCodeGenInfo(TheARM64Target,
createAArch64MCCodeGenInfo);
// Register the MC instruction info.
@@ -160,9 +156,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
createAArch64MCInstrInfo);
TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget,
createAArch64MCInstrInfo);
- TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget,
- createAArch64MCInstrInfo);
- TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget,
+ TargetRegistry::RegisterMCInstrInfo(TheARM64Target,
createAArch64MCInstrInfo);
// Register the MC register info.
@@ -170,9 +164,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
createAArch64MCRegisterInfo);
TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget,
createAArch64MCRegisterInfo);
- TargetRegistry::RegisterMCRegInfo(TheARM64leTarget,
- createAArch64MCRegisterInfo);
- TargetRegistry::RegisterMCRegInfo(TheARM64beTarget,
+ TargetRegistry::RegisterMCRegInfo(TheARM64Target,
createAArch64MCRegisterInfo);
// Register the MC subtarget info.
@@ -180,9 +172,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
createAArch64MCSubtargetInfo);
TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget,
createAArch64MCSubtargetInfo);
- TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget,
- createAArch64MCSubtargetInfo);
- TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget,
+ TargetRegistry::RegisterMCSubtargetInfo(TheARM64Target,
createAArch64MCSubtargetInfo);
// Register the asm backend.
@@ -190,19 +180,15 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
createAArch64leAsmBackend);
TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget,
createAArch64beAsmBackend);
- TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget,
+ TargetRegistry::RegisterMCAsmBackend(TheARM64Target,
createAArch64leAsmBackend);
- TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget,
- createAArch64beAsmBackend);
// Register the MC Code Emitter
TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget,
createAArch64MCCodeEmitter);
TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget,
createAArch64MCCodeEmitter);
- TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget,
- createAArch64MCCodeEmitter);
- TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget,
+ TargetRegistry::RegisterMCCodeEmitter(TheARM64Target,
createAArch64MCCodeEmitter);
// Register the object streamer.
@@ -210,16 +196,21 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
createMCStreamer);
TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget,
createMCStreamer);
- TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer);
- TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer);
+ TargetRegistry::RegisterMCObjectStreamer(TheARM64Target, createMCStreamer);
+
+ // Register the asm streamer.
+ TargetRegistry::RegisterAsmStreamer(TheAArch64leTarget,
+ createAArch64MCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(TheAArch64beTarget,
+ createAArch64MCAsmStreamer);
+ TargetRegistry::RegisterAsmStreamer(TheARM64Target,
+ createAArch64MCAsmStreamer);
// Register the MCInstPrinter.
TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget,
createAArch64MCInstPrinter);
TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget,
createAArch64MCInstPrinter);
- TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget,
- createAArch64MCInstPrinter);
- TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget,
+ TargetRegistry::RegisterMCInstPrinter(TheARM64Target,
createAArch64MCInstPrinter);
}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index d886ea2..1553115 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -11,19 +11,22 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64MCTARGETDESC_H
-#define AArch64MCTARGETDESC_H
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H
#include "llvm/Support/DataTypes.h"
#include <string>
namespace llvm {
+class formatted_raw_ostream;
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
+class MCInstPrinter;
class MCRegisterInfo;
class MCObjectWriter;
+class MCStreamer;
class MCSubtargetInfo;
class StringRef;
class Target;
@@ -31,8 +34,7 @@ class raw_ostream;
extern Target TheAArch64leTarget;
extern Target TheAArch64beTarget;
-extern Target TheARM64leTarget;
-extern Target TheARM64beTarget;
+extern Target TheARM64Target;
MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
@@ -51,6 +53,11 @@ MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI,
MCObjectWriter *createAArch64MachObjectWriter(raw_ostream &OS, uint32_t CPUType,
uint32_t CPUSubtype);
+MCStreamer *
+createAArch64MCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+ bool isVerboseAsm, bool useDwarfDirectory,
+ MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+ MCAsmBackend *TAB, bool ShowInst);
} // End llvm namespace
// Defines symbolic names for AArch64 registers. This defines a mapping from
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index ba95366..f6fab5d 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -9,15 +9,16 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "llvm/MC/MCAssembler.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/ADT/Twine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachO.h"
using namespace llvm;
@@ -33,7 +34,7 @@ public:
: MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype,
/*UseAggressiveSymbolFolding=*/true) {}
- void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+ void RecordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
const MCAsmLayout &Layout, const MCFragment *Fragment,
const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) override;
@@ -112,8 +113,25 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
}
}
+static bool canUseLocalRelocation(const MCSectionMachO &Section,
+ const MCSymbol &Symbol, unsigned Log2Size) {
+ // Debug info sections can use local relocations.
+ if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
+ return true;
+
+ // Otherwise, only pointer-sized relocations are supported.
+ if (Log2Size != 3)
+ return false;
+
+ // But only if they don't point to a cstring.
+ if (!Symbol.isInSection())
+ return true;
+ const MCSectionMachO &RefSec = cast<MCSectionMachO>(Symbol.getSection());
+ return RefSec.getType() != MachO::S_CSTRING_LITERALS;
+}
+
void AArch64MachObjectWriter::RecordRelocation(
- MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+ MachObjectWriter *Writer, MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) {
unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
@@ -123,9 +141,9 @@ void AArch64MachObjectWriter::RecordRelocation(
unsigned Log2Size = 0;
int64_t Value = 0;
unsigned Index = 0;
- unsigned IsExtern = 0;
unsigned Type = 0;
unsigned Kind = Fixup.getKind();
+ const MCSymbolData *RelSymbol = nullptr;
FixupOffset += Fixup.getOffset();
@@ -171,10 +189,8 @@ void AArch64MachObjectWriter::RecordRelocation(
// FIXME: Should this always be extern?
// SymbolNum of 0 indicates the absolute section.
Type = MachO::ARM64_RELOC_UNSIGNED;
- Index = 0;
if (IsPCRel) {
- IsExtern = 1;
Asm.getContext().FatalError(Fixup.getLoc(),
"PC relative absolute relocation!");
@@ -198,15 +214,12 @@ void AArch64MachObjectWriter::RecordRelocation(
Layout.getSymbolOffset(&B_SD) ==
Layout.getFragmentOffset(Fragment) + Fixup.getOffset()) {
// SymB is the PC, so use a PC-rel pointer-to-GOT relocation.
- Index = A_Base->getIndex();
- IsExtern = 1;
Type = MachO::ARM64_RELOC_POINTER_TO_GOT;
IsPCRel = 1;
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
- (IsExtern << 27) | (Type << 28));
- Writer->addRelocation(Fragment->getParent(), MRE);
+ MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
return;
} else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
@@ -252,26 +265,31 @@ void AArch64MachObjectWriter::RecordRelocation(
? 0
: Writer->getSymbolAddress(B_Base, Layout));
- Index = A_Base->getIndex();
- IsExtern = 1;
Type = MachO::ARM64_RELOC_UNSIGNED;
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
- (IsExtern << 27) | (Type << 28));
- Writer->addRelocation(Fragment->getParent(), MRE);
+ MRE.r_word1 = (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(A_Base, Fragment->getParent(), MRE);
- Index = B_Base->getIndex();
- IsExtern = 1;
+ RelSymbol = B_Base;
Type = MachO::ARM64_RELOC_SUBTRACTOR;
} else { // A + constant
const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
- const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
- const MCSymbolData *Base = Asm.getAtom(&SD);
const MCSectionMachO &Section = static_cast<const MCSectionMachO &>(
Fragment->getParent()->getSection());
+ bool CanUseLocalRelocation =
+ canUseLocalRelocation(Section, *Symbol, Log2Size);
+ if (Symbol->isTemporary() && (Value || !CanUseLocalRelocation)) {
+ const MCSection &Sec = Symbol->getSection();
+ if (!Asm.getContext().getAsmInfo()->isSectionAtomizableBySymbols(Sec))
+ Asm.addLocalUsedInReloc(*Symbol);
+ }
+
+ const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+ const MCSymbolData *Base = Asm.getAtom(&SD);
+
// If the symbol is a variable and we weren't able to get a Base for it
// (i.e., it's not in the symbol table associated with a section) resolve
// the relocation based on its expansion instead.
@@ -288,7 +306,8 @@ void AArch64MachObjectWriter::RecordRelocation(
// FIXME: Will the Target we already have ever have any data in it
// we need to preserve and merge with the new Target? How about
// the FixedValue?
- if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout))
+ if (!Symbol->getVariableValue()->EvaluateAsRelocatable(Target, &Layout,
+ &Fixup))
Asm.getContext().FatalError(Fixup.getLoc(),
"unable to resolve variable '" +
Symbol->getName() + "'");
@@ -309,16 +328,13 @@ void AArch64MachObjectWriter::RecordRelocation(
// sections, and for pointer-sized relocations (.quad), we allow section
// relocations. It's code sections that run into trouble.
if (Base) {
- Index = Base->getIndex();
- IsExtern = 1;
+ RelSymbol = Base;
// Add the local offset, if needed.
if (Base != &SD)
Value += Layout.getSymbolOffset(&SD) - Layout.getSymbolOffset(Base);
} else if (Symbol->isInSection()) {
- // Pointer-sized relocations can use a local relocation. Otherwise,
- // we have to be in a debug info section.
- if (!Section.hasAttribute(MachO::S_ATTR_DEBUG) && Log2Size != 3)
+ if (!CanUseLocalRelocation)
Asm.getContext().FatalError(
Fixup.getLoc(),
"unsupported relocation of local symbol '" + Symbol->getName() +
@@ -328,7 +344,6 @@ void AArch64MachObjectWriter::RecordRelocation(
const MCSectionData &SymSD =
Asm.getSectionData(SD.getSymbol().getSection());
Index = SymSD.getOrdinal() + 1;
- IsExtern = 0;
Value += Writer->getSymbolAddress(&SD, Layout);
if (IsPCRel)
@@ -361,16 +376,16 @@ void AArch64MachObjectWriter::RecordRelocation(
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
- (IsExtern << 27) | (Type << 28));
- Writer->addRelocation(Fragment->getParent(), MRE);
+ MRE.r_word1 =
+ (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
// Now set up the Addend relocation.
Type = MachO::ARM64_RELOC_ADDEND;
Index = Value;
+ RelSymbol = nullptr;
IsPCRel = 0;
Log2Size = 2;
- IsExtern = 0;
// Put zero into the instruction itself. The addend is in the relocation.
Value = 0;
@@ -382,9 +397,9 @@ void AArch64MachObjectWriter::RecordRelocation(
// struct relocation_info (8 bytes)
MachO::any_relocation_info MRE;
MRE.r_word0 = FixupOffset;
- MRE.r_word1 = ((Index << 0) | (IsPCRel << 24) | (Log2Size << 25) |
- (IsExtern << 27) | (Type << 28));
- Writer->addRelocation(Fragment->getParent(), MRE);
+ MRE.r_word1 =
+ (Index << 0) | (IsPCRel << 24) | (Log2Size << 25) | (Type << 28);
+ Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
}
MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_ostream &OS,
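Note: with this change the writer no longer packs an explicit (IsExtern << 27) term; passing a RelSymbol into addRelocation() leaves r_extern and r_symbolnum for the object writer to fill in, and canUseLocalRelocation() centralizes the old "debug section or pointer-sized" test. A rough standalone illustration of the r_word1 bit layout the hunks pack by hand (plain C++, no LLVM headers; the helper name is invented):

#include <cassert>
#include <cstdint>

// Field widths follow MachO's relocation_info: 24-bit r_symbolnum,
// 1-bit r_pcrel, 2-bit r_length (log2 of the access size),
// 1-bit r_extern, 4-bit r_type.
static uint32_t packRelocWord1(uint32_t Index, bool IsPCRel,
                               uint32_t Log2Size, uint32_t Type) {
  assert(Index < (1u << 24) && Log2Size < 4 && Type < 16);
  return (Index << 0) | ((IsPCRel ? 1u : 0u) << 24) | (Log2Size << 25) |
         (Type << 28);
}

int main() {
  // A pointer-sized (Log2Size == 3), non-PC-relative ARM64_RELOC_UNSIGNED
  // (type 0) against section ordinal 2:
  assert(packRelocWord1(2, false, 3, 0) == ((2u << 0) | (3u << 25)));
  return 0;
}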
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index dcc1a3c..e3112fa 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -39,3 +39,5 @@ void AArch64TargetStreamer::emitCurrentConstantPool() {
// finish() - write out any non-empty assembler constant pools.
void AArch64TargetStreamer::finish() { ConstantPools->emitAll(Streamer); }
+
+void AArch64TargetStreamer::emitInst(uint32_t Inst) {}
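Note: emitInst() lands here as a no-op default on the common target streamer, the usual hook pattern: the shared base does nothing and binary-emitting subclasses override it. A standalone sketch of that pattern (illustrative names, not LLVM's classes):

#include <cstdint>
#include <cstdio>

struct TargetStreamerBase {
  virtual ~TargetStreamerBase() = default;
  virtual void emitInst(uint32_t Inst) {} // default: ignore
};

// An object-emitting subclass writes the 32-bit encoding out
// little-endian, as a little-endian AArch64 streamer would.
struct ObjectStreamerSketch : TargetStreamerBase {
  void emitInst(uint32_t Inst) override {
    for (int i = 0; i < 4; ++i)
      std::putchar(static_cast<int>((Inst >> (8 * i)) & 0xff));
  }
};

int main() {
  ObjectStreamerSketch S;
  S.emitInst(0xd503201f); // AArch64 NOP encoding
  return 0;
}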
diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
index 3a382c1..f42ecb1 100644
--- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
+++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -14,18 +14,19 @@ using namespace llvm;
namespace llvm {
Target TheAArch64leTarget;
Target TheAArch64beTarget;
-Target TheARM64leTarget;
-Target TheARM64beTarget;
+Target TheARM64Target;
} // end namespace llvm
extern "C" void LLVMInitializeAArch64TargetInfo() {
- RegisterTarget<Triple::arm64, /*HasJIT=*/true> X(TheARM64leTarget, "arm64",
- "AArch64 (little endian)");
- RegisterTarget<Triple::arm64_be, /*HasJIT=*/true> Y(TheARM64beTarget, "arm64_be",
- "AArch64 (big endian)");
+ // Now register the "arm64" name for use with "-march". We don't want it to
+ // take possession of the Triple::aarch64 tag though.
+ TargetRegistry::RegisterTarget(TheARM64Target, "arm64",
+ "ARM64 (little endian)",
+ [](Triple::ArchType) { return false; }, true);
+
RegisterTarget<Triple::aarch64, /*HasJIT=*/true> Z(
TheAArch64leTarget, "aarch64", "AArch64 (little endian)");
RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> W(
TheAArch64beTarget, "aarch64_be", "AArch64 (big endian)");
}
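Note: the always-false ArchType predicate keeps "arm64" selectable by name (e.g. -march=arm64) while guaranteeing it never wins triple-based lookup, so Triple::aarch64 still resolves to the entries registered below it. A standalone sketch of that two-path lookup (invented types, not LLVM's TargetRegistry):

#include <functional>
#include <string>
#include <vector>

enum class Arch { aarch64, aarch64_be };

struct TargetEntry {
  std::string Name;                      // matched for -march=<name>
  std::function<bool(Arch)> MatchesArch; // matched for triple lookup
};

// An entry whose predicate always returns false (like "arm64" above)
// can never win here; it stays reachable only by name.
const TargetEntry *lookupByArch(const std::vector<TargetEntry> &Ts, Arch A) {
  for (const auto &T : Ts)
    if (T.MatchesArch(A))
      return &T;
  return nullptr;
}

int main() {
  std::vector<TargetEntry> Ts = {
      {"arm64", [](Arch) { return false; }},
      {"aarch64", [](Arch A) { return A == Arch::aarch64; }},
  };
  // Triple lookup resolves to "aarch64", never "arm64".
  return lookupByArch(Ts, Arch::aarch64)->Name == "aarch64" ? 0 : 1;
}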
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index 3c24bb3..bc6c7a9 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -791,22 +791,22 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const {
}
}
- // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name, where the bits
- // are: 11 xxx 1x11 xxxx xxx
- Regex GenericRegPattern("^s3_([0-7])_c(1[15])_c([0-9]|1[0-5])_([0-7])$");
+ // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name
+ Regex GenericRegPattern("^s([0-3])_([0-7])_c([0-9]|1[0-5])_c([0-9]|1[0-5])_([0-7])$");
- SmallVector<StringRef, 4> Ops;
+ SmallVector<StringRef, 5> Ops;
if (!GenericRegPattern.match(NameLower, &Ops)) {
Valid = false;
return -1;
}
- uint32_t Op0 = 3, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0;
+ uint32_t Op0 = 0, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0;
uint32_t Bits;
- Ops[1].getAsInteger(10, Op1);
- Ops[2].getAsInteger(10, CRn);
- Ops[3].getAsInteger(10, CRm);
- Ops[4].getAsInteger(10, Op2);
+ Ops[1].getAsInteger(10, Op0);
+ Ops[2].getAsInteger(10, Op1);
+ Ops[3].getAsInteger(10, CRn);
+ Ops[4].getAsInteger(10, CRm);
+ Ops[5].getAsInteger(10, Op2);
Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
Valid = true;
@@ -814,11 +814,10 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const {
}
std::string
-AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
+AArch64SysReg::SysRegMapper::toString(uint32_t Bits) const {
// First search the registers shared by all
for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) {
if (SysRegPairs[i].Value == Bits) {
- Valid = true;
return SysRegPairs[i].Name;
}
}
@@ -827,7 +826,6 @@ AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
if (FeatureBits & AArch64::ProcCyclone) {
for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) {
if (CycloneSysRegPairs[i].Value == Bits) {
- Valid = true;
return CycloneSysRegPairs[i].Name;
}
}
@@ -837,28 +835,18 @@ AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
// write-only).
for (unsigned i = 0; i < NumInstPairs; ++i) {
if (InstPairs[i].Value == Bits) {
- Valid = true;
return InstPairs[i].Name;
}
}
+ assert(Bits < 0x10000);
uint32_t Op0 = (Bits >> 14) & 0x3;
uint32_t Op1 = (Bits >> 11) & 0x7;
uint32_t CRn = (Bits >> 7) & 0xf;
uint32_t CRm = (Bits >> 3) & 0xf;
uint32_t Op2 = Bits & 0x7;
- // Only combinations matching: 11 xxx 1x11 xxxx xxx are valid for a generic
- // name.
- if (Op0 != 3 || (CRn != 11 && CRn != 15)) {
- Valid = false;
- return "";
- }
-
- assert(Op0 == 3 && (CRn == 11 || CRn == 15) && "Invalid generic sysreg");
-
- Valid = true;
- return "s3_" + utostr(Op1) + "_c" + utostr(CRn)
+ return "s" + utostr(Op0) + "_" + utostr(Op1) + "_c" + utostr(CRn)
+ "_c" + utostr(CRm) + "_" + utostr(Op2);
}
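Note: the loosened pattern accepts any s<op0>_<op1>_c<n>_c<m>_<op2> name, packing the five fields as (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2; the new toString() simply reverses that split, so every 16-bit encoding now has a printable generic name and the Valid out-parameter becomes unnecessary. A self-contained round-trip check (plain C++, helper names invented):

#include <cassert>
#include <cstdint>
#include <string>

static uint32_t encodeSysReg(uint32_t Op0, uint32_t Op1, uint32_t CRn,
                             uint32_t CRm, uint32_t Op2) {
  return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
}

static std::string sysRegToString(uint32_t Bits) {
  assert(Bits < 0x10000);
  uint32_t Op0 = (Bits >> 14) & 0x3, Op1 = (Bits >> 11) & 0x7;
  uint32_t CRn = (Bits >> 7) & 0xf, CRm = (Bits >> 3) & 0xf;
  uint32_t Op2 = Bits & 0x7;
  return "s" + std::to_string(Op0) + "_" + std::to_string(Op1) + "_c" +
         std::to_string(CRn) + "_c" + std::to_string(CRm) + "_" +
         std::to_string(Op2);
}

int main() {
  // s3_0_c11_c5_1 survives a round trip through the 16-bit encoding.
  assert(sysRegToString(encodeSysReg(3, 0, 11, 5, 1)) == "s3_0_c11_c5_1");
  return 0;
}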
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 9d2ce21..c60b09a 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -14,8 +14,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef AArch64BASEINFO_H
-#define AArch64BASEINFO_H
+#ifndef LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H
+#define LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H
// FIXME: Is it easiest to fix this layering violation by moving the .inc
// #includes from AArch64MCTargetDesc.h to here?
@@ -1143,7 +1143,7 @@ namespace AArch64SysReg {
SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { }
uint32_t fromString(StringRef Name, bool &Valid) const;
- std::string toString(uint32_t Bits, bool &Valid) const;
+ std::string toString(uint32_t Bits) const;
};
struct MSRMapper : SysRegMapper {
@@ -1271,7 +1271,12 @@ namespace AArch64II {
/// thread-local symbol. On Darwin, only one type of thread-local access
/// exists (pre linker-relaxation), but on ELF the TLSModel used for the
/// referee will affect interpretation.
- MO_TLS = 0x20
+ MO_TLS = 0x20,
+
+ /// MO_CONSTPOOL - This flag indicates that a symbol operand represents
+ /// the address of a constant pool entry for the symbol, rather than the
+ /// address of the symbol itself.
+ MO_CONSTPOOL = 0x40
};
} // end namespace AArch64II
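Note: MO_CONSTPOOL joins the other MO_* operand flags as a fresh single-bit mask (0x40), OR-ed into and tested against an operand's target flags. A minimal sketch of that flag usage (the operand type is a stand-in, not LLVM's MachineOperand):

// Flag values copied from the AArch64II enum above.
enum : unsigned { MO_TLS = 0x20, MO_CONSTPOOL = 0x40 };

struct OperandSketch {
  unsigned TargetFlags = 0;
};

int main() {
  OperandSketch Op;
  Op.TargetFlags |= MO_CONSTPOOL; // refer via the constant-pool entry
  return (Op.TargetFlags & MO_CONSTPOOL) ? 0 : 1;
}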