Diffstat (limited to 'contrib/llvm/lib/Target/AArch64')
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64.h | 14
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64.td | 125
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp | 49
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp | 484
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 14
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp | 184
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h | 27
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td | 7
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp | 339
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp | 8
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp | 59
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp | 53
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp | 183
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp | 790
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp | 136
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 161
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def | 419
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 132
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1185
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h | 57
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td | 59
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td | 69
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 595
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h | 99
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td | 171
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp | 365
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h | 49
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp | 143
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h | 7
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp | 112
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp | 15
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp | 166
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h | 24
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h | 7
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp | 359
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 296
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h | 84
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td | 20
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp | 22
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td | 6
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td | 92
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td | 1288
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td | 68
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td | 343
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td | 352
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td | 1745
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td | 852
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp | 14
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 134
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h | 66
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td | 134
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 144
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h | 5
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp | 10
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h | 6
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 266
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 38
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp | 25
-rw-r--r--  contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 584
-rw-r--r--  contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp | 4
-rw-r--r--  contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp | 288
-rw-r--r--  contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h | 10
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp | 186
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp | 203
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp | 29
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h | 38
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 20
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h | 5
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp | 3
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp | 1
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h | 3
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 29
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h | 2
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp | 32
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp | 104
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp | 37
-rw-r--r--  contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h | 43
-rw-r--r--  contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h | 78
82 files changed, 10118 insertions, 4265 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.h b/contrib/llvm/lib/Target/AArch64/AArch64.h
index fd106a8..1dda746 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.h
@@ -22,12 +22,16 @@
namespace llvm {
+class AArch64RegisterBankInfo;
+class AArch64Subtarget;
class AArch64TargetMachine;
class FunctionPass;
+class InstructionSelector;
class MachineFunctionPass;
FunctionPass *createAArch64DeadRegisterDefinitions();
FunctionPass *createAArch64RedundantCopyEliminationPass();
+FunctionPass *createAArch64CondBrTuning();
FunctionPass *createAArch64ConditionalCompares();
FunctionPass *createAArch64AdvSIMDScalar();
FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
@@ -38,19 +42,23 @@ FunctionPass *createAArch64LoadStoreOptimizationPass();
FunctionPass *createAArch64VectorByElementOptPass();
ModulePass *createAArch64PromoteConstantPass();
FunctionPass *createAArch64ConditionOptimizerPass();
-FunctionPass *createAArch64AddressTypePromotionPass();
FunctionPass *createAArch64A57FPLoadBalancing();
FunctionPass *createAArch64A53Fix835769();
+FunctionPass *createFalkorHWPFFixPass();
+FunctionPass *createFalkorMarkStridedAccessesPass();
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
+InstructionSelector *
+createAArch64InstructionSelector(const AArch64TargetMachine &,
+ AArch64Subtarget &, AArch64RegisterBankInfo &);
void initializeAArch64A53Fix835769Pass(PassRegistry&);
void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
-void initializeAArch64AddressTypePromotionPass(PassRegistry&);
void initializeAArch64AdvSIMDScalarPass(PassRegistry&);
void initializeAArch64CollectLOHPass(PassRegistry&);
+void initializeAArch64CondBrTuningPass(PassRegistry &);
void initializeAArch64ConditionalComparesPass(PassRegistry&);
void initializeAArch64ConditionOptimizerPass(PassRegistry&);
void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
@@ -60,6 +68,8 @@ void initializeAArch64VectorByElementOptPass(PassRegistry&);
void initializeAArch64PromoteConstantPass(PassRegistry&);
void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
void initializeAArch64StorePairSuppressPass(PassRegistry&);
+void initializeFalkorHWPFFixPass(PassRegistry&);
+void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
void initializeLDTLSCleanupPass(PassRegistry&);
} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td
index 91c335f..436bf11 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64.td
@@ -27,7 +27,7 @@ def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
"Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
- "Enable cryptographic instructions">;
+ "Enable cryptographic instructions", [FeatureNEON]>;
def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
"Enable ARMv8 CRC-32 checksum instructions">;
@@ -38,6 +38,9 @@ def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true",
def FeatureLSE : SubtargetFeature<"lse", "HasLSE", "true",
"Enable ARMv8.1 Large System Extension (LSE) atomic instructions">;
+def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true",
+ "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions">;
+
def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
"Enable ARMv8 PMUv3 Performance Monitors extension">;
@@ -47,6 +50,9 @@ def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true",
def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true",
"Enable Statistical Profiling extension">;
+def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true",
+ "Enable Scalable Vector Extension (SVE) instructions">;
+
/// Cyclone has register move instructions which are "free".
def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
"Has zero-cycle register moves">;
@@ -100,6 +106,14 @@ def FeatureArithmeticCbzFusion : SubtargetFeature<
"arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
"CPU fuses arithmetic + cbz/cbnz operations">;
+def FeatureFuseAES : SubtargetFeature<
+ "fuse-aes", "HasFuseAES", "true",
+ "CPU fuses AES crypto operations">;
+
+def FeatureFuseLiterals : SubtargetFeature<
+ "fuse-literals", "HasFuseLiterals", "true",
+ "CPU fuses literal generation operations">;
+
def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
"disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
"Disable latency scheduling heuristic">;
@@ -108,12 +122,22 @@ def FeatureUseRSqrt : SubtargetFeature<
"use-reciprocal-square-root", "UseRSqrt", "true",
"Use the reciprocal square root approximation">;
+def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
+ "NegativeImmediates", "false",
+ "Convert immediates and instructions "
+ "to their negated or complemented "
+ "equivalent when the immediate does "
+ "not fit in the encoding.">;
+
+def FeatureLSLFast : SubtargetFeature<
+ "lsl-fast", "HasLSLFast", "true",
+ "CPU has a fastpath logical shift of up to 3 places">;
//===----------------------------------------------------------------------===//
// Architectures.
//
def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
- "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE]>;
+ "Support ARM v8.1a instructions", [FeatureCRC, FeatureLSE, FeatureRDM]>;
def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
"Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>;
@@ -123,6 +147,7 @@ def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
//===----------------------------------------------------------------------===//
include "AArch64RegisterInfo.td"
+include "AArch64RegisterBanks.td"
include "AArch64CallingConvention.td"
//===----------------------------------------------------------------------===//
@@ -149,7 +174,8 @@ include "AArch64SchedCyclone.td"
include "AArch64SchedFalkor.td"
include "AArch64SchedKryo.td"
include "AArch64SchedM1.td"
-include "AArch64SchedVulcan.td"
+include "AArch64SchedThunderX.td"
+include "AArch64SchedThunderX2T99.td"
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors", [
@@ -167,6 +193,7 @@ def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
@@ -180,6 +207,8 @@ def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
+ FeatureFuseAES,
+ FeatureFuseLiterals,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
@@ -191,6 +220,7 @@ def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon
]>;
@@ -200,6 +230,7 @@ def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
FeatureCRC,
FeatureCrypto,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon
]>;
@@ -226,6 +257,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
@@ -240,6 +272,7 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
@@ -256,7 +289,8 @@ def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureZCZeroing
+ FeatureZCZeroing,
+ FeatureLSLFast
]>;
def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
@@ -269,33 +303,80 @@ def ProcFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
- FeatureZCZeroing
+ FeatureRDM,
+ FeatureZCZeroing,
+ FeatureLSLFast
]>;
-def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
- "Broadcom Vulcan processors", [
- FeatureCRC,
- FeatureCrypto,
- FeatureFPARMv8,
- FeatureArithmeticBccFusion,
- FeatureNEON,
- FeaturePostRAScheduler,
- FeaturePredictableSelectIsExpensive,
- HasV8_1aOps]>;
+def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
+ "ThunderX2T99",
+ "Cavium ThunderX2 processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureArithmeticBccFusion,
+ FeatureNEON,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureLSE,
+ HasV8_1aOps]>;
+
+def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
+ "Cavium ThunderX processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureNEON]>;
+
+def ProcThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily",
+ "ThunderXT88",
+ "Cavium ThunderX processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureNEON]>;
+
+def ProcThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily",
+ "ThunderXT81",
+ "Cavium ThunderX processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureNEON]>;
+
+def ProcThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
+ "ThunderXT83",
+ "Cavium ThunderX processors", [
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureNEON]>;
def : ProcessorModel<"generic", NoSchedModel, [
- FeatureCRC,
FeatureFPARMv8,
+ FeatureFuseAES,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler
]>;
-// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
+// FIXME: Cortex-A35 is currently modeled as a Cortex-A53.
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
-// FIXME: Cortex-A72 and Cortex-A73 are currently modelled as an Cortex-A57.
+// FIXME: Cortex-A72 and Cortex-A73 are currently modeled as a Cortex-A57.
def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>;
def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
@@ -304,7 +385,13 @@ def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
def : ProcessorModel<"exynos-m3", ExynosM1Model, [ProcExynosM2]>;
def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
-def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>;
+// Cavium ThunderX/ThunderX T8X Processors
+def : ProcessorModel<"thunderx", ThunderXT8XModel, [ProcThunderX]>;
+def : ProcessorModel<"thunderxt88", ThunderXT8XModel, [ProcThunderXT88]>;
+def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>;
+def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>;
+// Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan.
+def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
//===----------------------------------------------------------------------===//
// Assembly parser
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
index 0aa597b..db1fbe0 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
@@ -493,43 +493,30 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
MachineBasicBlock &MBB) {
- RegScavenger RS;
- RS.enterBasicBlock(MBB);
- RS.forward(MachineBasicBlock::iterator(G->getStart()));
-
// Can we find an appropriate register that is available throughout the life
- // of the chain?
- unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass;
- BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID));
- for (MachineBasicBlock::iterator I = G->begin(), E = G->end(); I != E; ++I) {
- RS.forward(I);
- AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID));
-
- // Remove any registers clobbered by a regmask or any def register that is
- // immediately dead.
- for (auto J : I->operands()) {
- if (J.isRegMask())
- AvailableRegs.clearBitsNotInMask(J.getRegMask());
-
- if (J.isReg() && J.isDef()) {
- MCRegAliasIterator AI(J.getReg(), TRI, /*IncludeSelf=*/true);
- if (J.isDead())
- for (; AI.isValid(); ++AI)
- AvailableRegs.reset(*AI);
-#ifndef NDEBUG
- else
- for (; AI.isValid(); ++AI)
- assert(!AvailableRegs[*AI] &&
- "Non-dead def should have been removed by now!");
-#endif
- }
- }
+ // of the chain? Simulate liveness backwards until the end of the chain.
+ LiveRegUnits Units(*TRI);
+ Units.addLiveOuts(MBB);
+ MachineBasicBlock::iterator I = MBB.end();
+ MachineBasicBlock::iterator ChainEnd = G->end();
+ while (I != ChainEnd) {
+ --I;
+ Units.stepBackward(*I);
}
+ // Check which register units are alive throughout the chain.
+ MachineBasicBlock::iterator ChainBegin = G->begin();
+ assert(ChainBegin != ChainEnd && "Chain should contain instructions");
+ do {
+ --I;
+ Units.accumulate(*I);
+ } while (I != ChainBegin);
+
// Make sure we allocate in-order, to get the cheapest registers first.
+ unsigned RegClassID = ChainBegin->getDesc().OpInfo[0].RegClass;
auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID));
for (auto Reg : Ord) {
- if (!AvailableRegs[Reg])
+ if (!Units.available(Reg))
continue;
if (C == getColor(Reg))
return Reg;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
deleted file mode 100644
index 0cbb2db..0000000
--- a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
+++ /dev/null
@@ -1,484 +0,0 @@
-//===-- AArch64AddressTypePromotion.cpp --- Promote type for addr accesses -==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass tries to promote the computations use to obtained a sign extended
-// value used into memory accesses.
-// E.g.
-// a = add nsw i32 b, 3
-// d = sext i32 a to i64
-// e = getelementptr ..., i64 d
-//
-// =>
-// f = sext i32 b to i64
-// a = add nsw i64 f, 3
-// e = getelementptr ..., i64 a
-//
-// This is legal to do if the computations are marked with either nsw or nuw
-// markers. Moreover, the current heuristic is simple: it does not create new
-// sext operations, i.e., it gives up when a sext would have forked (e.g., if a
-// = add i32 b, c, two sexts are required to promote the computation).
-//
-// FIXME: This pass may be useful for other targets too.
-// ===---------------------------------------------------------------------===//
-
-#include "AArch64.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "aarch64-type-promotion"
-
-static cl::opt<bool>
-EnableMerge("aarch64-type-promotion-merge", cl::Hidden,
- cl::desc("Enable merging of redundant sexts when one is dominating"
- " the other."),
- cl::init(true));
-
-#define AARCH64_TYPE_PROMO_NAME "AArch64 Address Type Promotion"
-
-//===----------------------------------------------------------------------===//
-// AArch64AddressTypePromotion
-//===----------------------------------------------------------------------===//
-
-namespace {
-class AArch64AddressTypePromotion : public FunctionPass {
-
-public:
- static char ID;
- AArch64AddressTypePromotion()
- : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) {
- initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return AARCH64_TYPE_PROMO_NAME; }
-
- /// Iterate over the functions and promote the computation of interesting
- // sext instructions.
- bool runOnFunction(Function &F) override;
-
-private:
- /// The current function.
- Function *Func;
- /// Filter out all sexts that does not have this type.
- /// Currently initialized with Int64Ty.
- Type *ConsideredSExtType;
-
- // This transformation requires dominator info.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- FunctionPass::getAnalysisUsage(AU);
- }
-
- typedef SmallPtrSet<Instruction *, 32> SetOfInstructions;
- typedef SmallVector<Instruction *, 16> Instructions;
- typedef DenseMap<Value *, Instructions> ValueToInsts;
-
- /// Check if it is profitable to move a sext through this instruction.
- /// Currently, we consider it is profitable if:
- /// - Inst is used only once (no need to insert truncate).
- /// - Inst has only one operand that will require a sext operation (we do
- /// do not create new sext operation).
- bool shouldGetThrough(const Instruction *Inst);
-
- /// Check if it is possible and legal to move a sext through this
- /// instruction.
- /// Current heuristic considers that we can get through:
- /// - Arithmetic operation marked with the nsw or nuw flag.
- /// - Other sext operation.
- /// - Truncate operation if it was just dropping sign extended bits.
- bool canGetThrough(const Instruction *Inst);
-
- /// Move sext operations through safe to sext instructions.
- bool propagateSignExtension(Instructions &SExtInsts);
-
- /// Is this sext should be considered for code motion.
- /// We look for sext with ConsideredSExtType and uses in at least one
- // GetElementPtrInst.
- bool shouldConsiderSExt(const Instruction *SExt) const;
-
- /// Collect all interesting sext operations, i.e., the ones with the right
- /// type and used in memory accesses.
- /// More precisely, a sext instruction is considered as interesting if it
- /// is used in a "complex" getelementptr or it exits at least another
- /// sext instruction that sign extended the same initial value.
- /// A getelementptr is considered as "complex" if it has more than 2
- // operands.
- void analyzeSExtension(Instructions &SExtInsts);
-
- /// Merge redundant sign extension operations in common dominator.
- void mergeSExts(ValueToInsts &ValToSExtendedUses,
- SetOfInstructions &ToRemove);
-};
-} // end anonymous namespace.
-
-char AArch64AddressTypePromotion::ID = 0;
-
-INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion",
- AARCH64_TYPE_PROMO_NAME, false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion",
- AARCH64_TYPE_PROMO_NAME, false, false)
-
-FunctionPass *llvm::createAArch64AddressTypePromotionPass() {
- return new AArch64AddressTypePromotion();
-}
-
-bool AArch64AddressTypePromotion::canGetThrough(const Instruction *Inst) {
- if (isa<SExtInst>(Inst))
- return true;
-
- const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
- if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
- (BinOp->hasNoUnsignedWrap() || BinOp->hasNoSignedWrap()))
- return true;
-
- // sext(trunc(sext)) --> sext
- if (isa<TruncInst>(Inst) && isa<SExtInst>(Inst->getOperand(0))) {
- const Instruction *Opnd = cast<Instruction>(Inst->getOperand(0));
- // Check that the truncate just drop sign extended bits.
- if (Inst->getType()->getIntegerBitWidth() >=
- Opnd->getOperand(0)->getType()->getIntegerBitWidth() &&
- Inst->getOperand(0)->getType()->getIntegerBitWidth() <=
- ConsideredSExtType->getIntegerBitWidth())
- return true;
- }
-
- return false;
-}
-
-bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) {
- // If the type of the sext is the same as the considered one, this sext
- // will become useless.
- // Otherwise, we will have to do something to preserve the original value,
- // unless it is used once.
- if (isa<SExtInst>(Inst) &&
- (Inst->getType() == ConsideredSExtType || Inst->hasOneUse()))
- return true;
-
- // If the Inst is used more that once, we may need to insert truncate
- // operations and we don't do that at the moment.
- if (!Inst->hasOneUse())
- return false;
-
- // This truncate is used only once, thus if we can get thourgh, it will become
- // useless.
- if (isa<TruncInst>(Inst))
- return true;
-
- // If both operands are not constant, a new sext will be created here.
- // Current heuristic is: each step should be profitable.
- // Therefore we don't allow to increase the number of sext even if it may
- // be profitable later on.
- if (isa<BinaryOperator>(Inst) && isa<ConstantInt>(Inst->getOperand(1)))
- return true;
-
- return false;
-}
-
-static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) {
- return !(isa<SelectInst>(Inst) && OpIdx == 0);
-}
-
-bool
-AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
- if (SExt->getType() != ConsideredSExtType)
- return false;
-
- for (const User *U : SExt->users()) {
- if (isa<GetElementPtrInst>(U))
- return true;
- }
-
- return false;
-}
-
-// Input:
-// - SExtInsts contains all the sext instructions that are used directly in
-// GetElementPtrInst, i.e., access to memory.
-// Algorithm:
-// - For each sext operation in SExtInsts:
-// Let var be the operand of sext.
-// while it is profitable (see shouldGetThrough), legal, and safe
-// (see canGetThrough) to move sext through var's definition:
-// * promote the type of var's definition.
-// * fold var into sext uses.
-// * move sext above var's definition.
-// * update sext operand to use the operand of var that should be sign
-// extended (by construction there is only one).
-//
-// E.g.,
-// a = ... i32 c, 3
-// b = sext i32 a to i64 <- is it legal/safe/profitable to get through 'a'
-// ...
-// = b
-// => Yes, update the code
-// b = sext i32 c to i64
-// a = ... i64 b, 3
-// ...
-// = a
-// Iterate on 'c'.
-bool
-AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
- DEBUG(dbgs() << "*** Propagate Sign Extension ***\n");
-
- bool LocalChange = false;
- SetOfInstructions ToRemove;
- ValueToInsts ValToSExtendedUses;
- while (!SExtInsts.empty()) {
- // Get through simple chain.
- Instruction *SExt = SExtInsts.pop_back_val();
-
- DEBUG(dbgs() << "Consider:\n" << *SExt << '\n');
-
- // If this SExt has already been merged continue.
- if (SExt->use_empty() && ToRemove.count(SExt)) {
- DEBUG(dbgs() << "No uses => marked as delete\n");
- continue;
- }
-
- // Now try to get through the chain of definitions.
- while (auto *Inst = dyn_cast<Instruction>(SExt->getOperand(0))) {
- DEBUG(dbgs() << "Try to get through:\n" << *Inst << '\n');
- if (!canGetThrough(Inst) || !shouldGetThrough(Inst)) {
- // We cannot get through something that is not an Instruction
- // or not safe to SExt.
- DEBUG(dbgs() << "Cannot get through\n");
- break;
- }
-
- LocalChange = true;
- // If this is a sign extend, it becomes useless.
- if (isa<SExtInst>(Inst) || isa<TruncInst>(Inst)) {
- DEBUG(dbgs() << "SExt or trunc, mark it as to remove\n");
- // We cannot use replaceAllUsesWith here because we may trigger some
- // assertion on the type as all involved sext operation may have not
- // been moved yet.
- while (!Inst->use_empty()) {
- Use &U = *Inst->use_begin();
- Instruction *User = dyn_cast<Instruction>(U.getUser());
- assert(User && "User of sext is not an Instruction!");
- User->setOperand(U.getOperandNo(), SExt);
- }
- ToRemove.insert(Inst);
- SExt->setOperand(0, Inst->getOperand(0));
- SExt->moveBefore(Inst);
- continue;
- }
-
- // Get through the Instruction:
- // 1. Update its type.
- // 2. Replace the uses of SExt by Inst.
- // 3. Sign extend each operand that needs to be sign extended.
-
- // Step #1.
- Inst->mutateType(SExt->getType());
- // Step #2.
- SExt->replaceAllUsesWith(Inst);
- // Step #3.
- Instruction *SExtForOpnd = SExt;
-
- DEBUG(dbgs() << "Propagate SExt to operands\n");
- for (int OpIdx = 0, EndOpIdx = Inst->getNumOperands(); OpIdx != EndOpIdx;
- ++OpIdx) {
- DEBUG(dbgs() << "Operand:\n" << *(Inst->getOperand(OpIdx)) << '\n');
- if (Inst->getOperand(OpIdx)->getType() == SExt->getType() ||
- !shouldSExtOperand(Inst, OpIdx)) {
- DEBUG(dbgs() << "No need to propagate\n");
- continue;
- }
- // Check if we can statically sign extend the operand.
- Value *Opnd = Inst->getOperand(OpIdx);
- if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
- DEBUG(dbgs() << "Statically sign extend\n");
- Inst->setOperand(OpIdx, ConstantInt::getSigned(SExt->getType(),
- Cst->getSExtValue()));
- continue;
- }
- // UndefValue are typed, so we have to statically sign extend them.
- if (isa<UndefValue>(Opnd)) {
- DEBUG(dbgs() << "Statically sign extend\n");
- Inst->setOperand(OpIdx, UndefValue::get(SExt->getType()));
- continue;
- }
-
- // Otherwise we have to explicity sign extend it.
- assert(SExtForOpnd &&
- "Only one operand should have been sign extended");
-
- SExtForOpnd->setOperand(0, Opnd);
-
- DEBUG(dbgs() << "Move before:\n" << *Inst << "\nSign extend\n");
- // Move the sign extension before the insertion point.
- SExtForOpnd->moveBefore(Inst);
- Inst->setOperand(OpIdx, SExtForOpnd);
- // If more sext are required, new instructions will have to be created.
- SExtForOpnd = nullptr;
- }
- if (SExtForOpnd == SExt) {
- DEBUG(dbgs() << "Sign extension is useless now\n");
- ToRemove.insert(SExt);
- break;
- }
- }
-
- // If the use is already of the right type, connect its uses to its argument
- // and delete it.
- // This can happen for an Instruction all uses of which are sign extended.
- if (!ToRemove.count(SExt) &&
- SExt->getType() == SExt->getOperand(0)->getType()) {
- DEBUG(dbgs() << "Sign extension is useless, attach its use to "
- "its argument\n");
- SExt->replaceAllUsesWith(SExt->getOperand(0));
- ToRemove.insert(SExt);
- } else
- ValToSExtendedUses[SExt->getOperand(0)].push_back(SExt);
- }
-
- if (EnableMerge)
- mergeSExts(ValToSExtendedUses, ToRemove);
-
- // Remove all instructions marked as ToRemove.
- for (Instruction *I: ToRemove)
- I->eraseFromParent();
- return LocalChange;
-}
-
-void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses,
- SetOfInstructions &ToRemove) {
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- for (auto &Entry : ValToSExtendedUses) {
- Instructions &Insts = Entry.second;
- Instructions CurPts;
- for (Instruction *Inst : Insts) {
- if (ToRemove.count(Inst))
- continue;
- bool inserted = false;
- for (auto &Pt : CurPts) {
- if (DT.dominates(Inst, Pt)) {
- DEBUG(dbgs() << "Replace all uses of:\n" << *Pt << "\nwith:\n"
- << *Inst << '\n');
- Pt->replaceAllUsesWith(Inst);
- ToRemove.insert(Pt);
- Pt = Inst;
- inserted = true;
- break;
- }
- if (!DT.dominates(Pt, Inst))
- // Give up if we need to merge in a common dominator as the
- // expermients show it is not profitable.
- continue;
-
- DEBUG(dbgs() << "Replace all uses of:\n" << *Inst << "\nwith:\n"
- << *Pt << '\n');
- Inst->replaceAllUsesWith(Pt);
- ToRemove.insert(Inst);
- inserted = true;
- break;
- }
- if (!inserted)
- CurPts.push_back(Inst);
- }
- }
-}
-
-void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
- DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n");
-
- DenseMap<Value *, Instruction *> SeenChains;
-
- for (auto &BB : *Func) {
- for (auto &II : BB) {
- Instruction *SExt = &II;
-
- // Collect all sext operation per type.
- if (!isa<SExtInst>(SExt) || !shouldConsiderSExt(SExt))
- continue;
-
- DEBUG(dbgs() << "Found:\n" << (*SExt) << '\n');
-
- // Cases where we actually perform the optimization:
- // 1. SExt is used in a getelementptr with more than 2 operand =>
- // likely we can merge some computation if they are done on 64 bits.
- // 2. The beginning of the SExt chain is SExt several time. =>
- // code sharing is possible.
-
- bool insert = false;
- // #1.
- for (const User *U : SExt->users()) {
- const Instruction *Inst = dyn_cast<GetElementPtrInst>(U);
- if (Inst && Inst->getNumOperands() > 2) {
- DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst
- << '\n');
- insert = true;
- break;
- }
- }
-
- // #2.
- // Check the head of the chain.
- Instruction *Inst = SExt;
- Value *Last;
- do {
- int OpdIdx = 0;
- const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
- if (BinOp && isa<ConstantInt>(BinOp->getOperand(0)))
- OpdIdx = 1;
- Last = Inst->getOperand(OpdIdx);
- Inst = dyn_cast<Instruction>(Last);
- } while (Inst && canGetThrough(Inst) && shouldGetThrough(Inst));
-
- DEBUG(dbgs() << "Head of the chain:\n" << *Last << '\n');
- DenseMap<Value *, Instruction *>::iterator AlreadySeen =
- SeenChains.find(Last);
- if (insert || AlreadySeen != SeenChains.end()) {
- DEBUG(dbgs() << "Insert\n");
- SExtInsts.push_back(SExt);
- if (AlreadySeen != SeenChains.end() && AlreadySeen->second != nullptr) {
- DEBUG(dbgs() << "Insert chain member\n");
- SExtInsts.push_back(AlreadySeen->second);
- SeenChains[Last] = nullptr;
- }
- } else {
- DEBUG(dbgs() << "Record its chain membership\n");
- SeenChains[Last] = SExt;
- }
- }
- }
-}
-
-bool AArch64AddressTypePromotion::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- if (F.isDeclaration())
- return false;
- Func = &F;
- ConsideredSExtType = Type::getInt64Ty(Func->getContext());
-
- DEBUG(dbgs() << "*** " << getPassName() << ": " << Func->getName() << '\n');
-
- Instructions SExtInsts;
- analyzeSExtension(SExtInsts);
- return propagateSignExtension(SExtInsts);
-}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index efc2218..5ce5792 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -12,13 +12,13 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/AArch64AddressingModes.h"
#include "AArch64.h"
#include "AArch64MCInstLower.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "InstPrinter/AArch64InstPrinter.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
@@ -35,11 +35,11 @@
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCLinkerOptimizationHint.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCSectionMachO.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -320,6 +320,9 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
switch (ExtraCode[0]) {
default:
return true; // Unknown modifier.
+ case 'a': // Print 'a' modifier
+ PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+ return false;
case 'w': // Print W register
case 'x': // Print X register
if (MO.isReg())
@@ -388,7 +391,7 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &O) {
- if (ExtraCode && ExtraCode[0])
+ if (ExtraCode && ExtraCode[0] && ExtraCode[0] != 'a')
return true; // Unknown modifier.
const MachineOperand &MO = MI->getOperand(OpNum);
@@ -580,8 +583,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MachineOperand &MO_Sym = MI->getOperand(0);
MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym);
MCOperand Sym, SymTLSDescLo12, SymTLSDesc;
- MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF |
- AArch64II::MO_NC);
+ MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE);
MCInstLowering.lowerOperand(MO_Sym, Sym);
MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12);
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
index a4950af..29f6d57 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/lib/Target/AArch64/AArch64CallLowering.cpp - Call lowering ---===//
+//===--- AArch64CallLowering.cpp - Call lowering --------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,15 +15,36 @@
#include "AArch64CallLowering.h"
#include "AArch64ISelLowering.h"
-
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+
using namespace llvm;
#ifndef LLVM_BUILD_GLOBAL_ISEL
@@ -31,12 +52,12 @@ using namespace llvm;
#endif
AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
- : CallLowering(&TLI) {
-}
+ : CallLowering(&TLI) {}
struct IncomingArgHandler : public CallLowering::ValueHandler {
- IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
- : ValueHandler(MIRBuilder, MRI) {}
+ IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {}
unsigned getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -45,6 +66,7 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64));
MIRBuilder.buildFrameIndex(AddrReg, FI);
+ StackUsed = std::max(StackUsed, Size + Offset);
return AddrReg;
}
@@ -67,11 +89,14 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
/// parameters (it's a basic-block live-in), and a call instruction
/// (it's an implicit-def of the BL).
virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+
+ uint64_t StackUsed;
};
struct FormalArgHandler : public IncomingArgHandler {
- FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI)
- : IncomingArgHandler(MIRBuilder, MRI) {}
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : IncomingArgHandler(MIRBuilder, MRI, AssignFn) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIRBuilder.getMBB().addLiveIn(PhysReg);
@@ -80,8 +105,8 @@ struct FormalArgHandler : public IncomingArgHandler {
struct CallReturnHandler : public IncomingArgHandler {
CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB)
- : IncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+ : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
void markPhysRegUsed(unsigned PhysReg) override {
MIB.addDef(PhysReg, RegState::Implicit);
@@ -92,8 +117,10 @@ struct CallReturnHandler : public IncomingArgHandler {
struct OutgoingArgHandler : public CallLowering::ValueHandler {
OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB)
- : ValueHandler(MIRBuilder, MRI), MIB(MIB) {}
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn,
+ CCAssignFn *AssignFnVarArg)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
+ AssignFnVarArg(AssignFnVarArg), StackSize(0) {}
unsigned getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
@@ -126,14 +153,29 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
MIRBuilder.buildStore(ValVReg, Addr, *MMO);
}
+ bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info,
+ CCState &State) override {
+ bool Res;
+ if (Info.IsFixed)
+ Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ else
+ Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+
+ StackSize = State.getNextStackOffset();
+ return Res;
+ }
+
MachineInstrBuilder MIB;
+ CCAssignFn *AssignFnVarArg;
+ uint64_t StackSize;
};
-void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
- SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL,
- MachineRegisterInfo &MRI,
- SplitArgTy PerformArgSplit) const {
+void AArch64CallLowering::splitToValueTypes(
+ const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, MachineRegisterInfo &MRI,
+ const SplitArgTy &PerformArgSplit) const {
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
LLVMContext &Ctx = OrigArg.Ty->getContext();
@@ -145,7 +187,7 @@ void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
// No splitting to do, but we want to replace the original type (e.g. [1 x
// double] -> double).
SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx),
- OrigArg.Flags);
+ OrigArg.Flags, OrigArg.IsFixed);
return;
}
@@ -154,19 +196,12 @@ void AArch64CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
// FIXME: set split flags if they're actually used (e.g. i128 on AAPCS).
Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
SplitArgs.push_back(
- ArgInfo{MRI.createGenericVirtualRegister(LLT{*SplitTy, DL}), SplitTy,
- OrigArg.Flags});
+ ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)),
+ SplitTy, OrigArg.Flags, OrigArg.IsFixed});
}
- SmallVector<uint64_t, 4> BitOffsets;
- for (auto Offset : Offsets)
- BitOffsets.push_back(Offset * 8);
-
- SmallVector<unsigned, 8> SplitRegs;
- for (auto I = &SplitArgs[FirstRegIdx]; I != SplitArgs.end(); ++I)
- SplitRegs.push_back(I->Reg);
-
- PerformArgSplit(SplitRegs, BitOffsets);
+ for (unsigned i = 0; i < Offsets.size(); ++i)
+ PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8);
}
bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
@@ -184,16 +219,16 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
auto &DL = F.getParent()->getDataLayout();
ArgInfo OrigArg{VReg, Val->getType()};
- setArgFlags(OrigArg, AttributeSet::ReturnIndex, DL, F);
+ setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
SmallVector<ArgInfo, 8> SplitArgs;
splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
- MIRBuilder.buildExtract(Regs, Offsets, VReg);
+ [&](unsigned Reg, uint64_t Offset) {
+ MIRBuilder.buildExtract(Reg, VReg, Offset);
});
- OutgoingArgHandler Handler(MIRBuilder, MRI, MIB);
- Success = handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler);
+ OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn);
+ Success = handleAssignments(MIRBuilder, SplitArgs, Handler);
}
MIRBuilder.insertInstr(MIB);
@@ -203,7 +238,6 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<unsigned> VRegs) const {
- auto &Args = F.getArgumentList();
MachineFunction &MF = MIRBuilder.getMF();
MachineBasicBlock &MBB = MIRBuilder.getMBB();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -211,13 +245,27 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
SmallVector<ArgInfo, 8> SplitArgs;
unsigned i = 0;
- for (auto &Arg : Args) {
+ for (auto &Arg : F.args()) {
ArgInfo OrigArg{VRegs[i], Arg.getType()};
- setArgFlags(OrigArg, i + 1, DL, F);
+ setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, F);
+ bool Split = false;
+ LLT Ty = MRI.getType(VRegs[i]);
+ unsigned Dst = VRegs[i];
+
splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
- MIRBuilder.buildSequence(VRegs[i], Regs, Offsets);
+ [&](unsigned Reg, uint64_t Offset) {
+ if (!Split) {
+ Split = true;
+ Dst = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildUndef(Dst);
+ }
+ unsigned Tmp = MRI.createGenericVirtualRegister(Ty);
+ MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset);
+ Dst = Tmp;
});
+
+ if (Dst != VRegs[i])
+ MIRBuilder.buildCopy(VRegs[i], Dst);
++i;
}
@@ -228,10 +276,25 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
CCAssignFn *AssignFn =
TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
- FormalArgHandler Handler(MIRBuilder, MRI);
- if (!handleAssignments(MIRBuilder, AssignFn, SplitArgs, Handler))
+ FormalArgHandler Handler(MIRBuilder, MRI, AssignFn);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
+ if (F.isVarArg()) {
+ if (!MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) {
+ // FIXME: we need to reimplement saveVarArgsRegisters from
+ // AArch64ISelLowering.
+ return false;
+ }
+
+ // We currently pass all varargs at 8-byte alignment.
+ uint64_t StackOffset = alignTo(Handler.StackUsed, 8);
+
+ auto &MFI = MIRBuilder.getMF().getFrameInfo();
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
+ }
+
// Move back to the end of the basic block.
MIRBuilder.setMBB(MBB);
@@ -239,6 +302,7 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
}
bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallingConv::ID CallConv,
const MachineOperand &Callee,
const ArgInfo &OrigRet,
ArrayRef<ArgInfo> OrigArgs) const {
@@ -250,21 +314,25 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
SmallVector<ArgInfo, 8> SplitArgs;
for (auto &OrigArg : OrigArgs) {
splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
- MIRBuilder.buildExtract(Regs, Offsets, OrigArg.Reg);
+ [&](unsigned Reg, uint64_t Offset) {
+ MIRBuilder.buildExtract(Reg, OrigArg.Reg, Offset);
});
}
// Find out which ABI gets to decide where things go.
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
- CCAssignFn *CallAssignFn =
- TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
+ CCAssignFn *AssignFnFixed =
+ TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+ CCAssignFn *AssignFnVarArg =
+ TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/true);
+
+ auto CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
auto MIB = MIRBuilder.buildInstrNoInsert(Callee.isReg() ? AArch64::BLR
: AArch64::BL);
- MIB.addOperand(Callee);
+ MIB.add(Callee);
// Tell the call which registers are clobbered.
auto TRI = MF.getSubtarget().getRegisterInfo();
@@ -272,8 +340,9 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Do the actual argument marshalling.
SmallVector<unsigned, 8> PhysRegs;
- OutgoingArgHandler Handler(MIRBuilder, MRI, MIB);
- if (!handleAssignments(MIRBuilder, CallAssignFn, SplitArgs, Handler))
+ OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
+ AssignFnVarArg);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
// Now we can add the actual call instruction to the correct basic block.
@@ -298,20 +367,23 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
SmallVector<uint64_t, 8> RegOffsets;
SmallVector<unsigned, 8> SplitRegs;
splitToValueTypes(OrigRet, SplitArgs, DL, MRI,
- [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
- std::copy(Offsets.begin(), Offsets.end(),
- std::back_inserter(RegOffsets));
- std::copy(Regs.begin(), Regs.end(),
- std::back_inserter(SplitRegs));
+ [&](unsigned Reg, uint64_t Offset) {
+ RegOffsets.push_back(Offset);
+ SplitRegs.push_back(Reg);
});
- CallReturnHandler Handler(MIRBuilder, MRI, MIB);
- if (!handleAssignments(MIRBuilder, RetAssignFn, SplitArgs, Handler))
+ CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
if (!RegOffsets.empty())
MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets);
}
+ CallSeqStart.addImm(Handler.StackSize).addImm(0);
+ MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP)
+ .addImm(Handler.StackSize)
+ .addImm(0);
+
return true;
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h
index ce66762..d96ce95 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallLowering.h
@@ -1,4 +1,4 @@
-//===-- llvm/lib/Target/AArch64/AArch64CallLowering.h - Call lowering -----===//
+//===--- AArch64CallLowering.h - Call lowering ------------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,18 +12,20 @@
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
-#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
-#include "llvm/CodeGen/ValueTypes.h"
+#include <cstdint>
+#include <functional>
namespace llvm {
class AArch64TargetLowering;
class AArch64CallLowering: public CallLowering {
- public:
+public:
AArch64CallLowering(const AArch64TargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
@@ -32,8 +34,8 @@ class AArch64CallLowering: public CallLowering {
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
- bool lowerCall(MachineIRBuilder &MIRBuilder, const MachineOperand &Callee,
- const ArgInfo &OrigRet,
+ bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
+ const MachineOperand &Callee, const ArgInfo &OrigRet,
ArrayRef<ArgInfo> OrigArgs) const override;
private:
@@ -44,13 +46,14 @@ private:
typedef std::function<void(MachineIRBuilder &, int, CCValAssign &)>
MemHandler;
- typedef std::function<void(ArrayRef<unsigned>, ArrayRef<uint64_t>)>
- SplitArgTy;
+ typedef std::function<void(unsigned, uint64_t)> SplitArgTy;
void splitToValueTypes(const ArgInfo &OrigArgInfo,
SmallVectorImpl<ArgInfo> &SplitArgs,
const DataLayout &DL, MachineRegisterInfo &MRI,
- SplitArgTy SplitArg) const;
+ const SplitArgTy &SplitArg) const;
};
-} // End of namespace llvm;
-#endif
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index 938779d..291bc5e 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -118,6 +118,13 @@ def RetCC_AArch64_AAPCS : CallingConv<[
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
]>;
+// Vararg functions on windows pass floats in integer registers
+def CC_AArch64_Win64_VarArg : CallingConv<[
+ CCIfType<[f16, f32], CCPromoteToType<f64>>,
+ CCIfType<[f64], CCBitConvertToType<i64>>,
+ CCDelegateTo<CC_AArch64_AAPCS>
+]>;
+
// Darwin uses a calling convention which differs in only two ways
// from the standard one at this level:
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index 6f8dd3e..b3b7385 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -113,7 +113,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
return Copy;
}
- // Create a virtal register in *TLSBaseAddrReg, and populate it by
+ // Create a virtual register in *TLSBaseAddrReg, and populate it by
// inserting a copy instruction after I. Returns the new instruction.
MachineInstr *setRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
MachineFunction *MF = I.getParent()->getParent();
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
new file mode 100644
index 0000000..51700f9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -0,0 +1,339 @@
+//===-- AArch64CondBrTuning.cpp --- Conditional branch tuning for AArch64 -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file contains a pass that transforms CBZ/CBNZ/TBZ/TBNZ instructions
+/// into a conditional branch (B.cond), when the NZCV flags can be set for
+/// "free". This is preferred on targets that have more flexibility when
+/// scheduling B.cond instructions as compared to CBZ/CBNZ/TBZ/TBNZ (assuming
+/// all other variables are equal). This can also reduce register pressure.
+///
+/// A few examples:
+///
+/// 1) add w8, w0, w1 -> cmn w0, w1 ; CMN is an alias of ADDS.
+/// cbz w8, .LBB_2 -> b.eq .LBB0_2
+///
+/// 2) add w8, w0, w1 -> adds w8, w0, w1 ; w8 has multiple uses.
+/// cbz w8, .LBB1_2 -> b.eq .LBB1_2
+///
+/// 3) sub w8, w0, w1 -> subs w8, w0, w1 ; w8 has multiple uses.
+/// tbz w8, #31, .LBB6_2 -> b.pl .LBB6_2
+///
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-cond-br-tuning"
+#define AARCH64_CONDBR_TUNING_NAME "AArch64 Conditional Branch Tuning"
+
+namespace {
+class AArch64CondBrTuning : public MachineFunctionPass {
+ const AArch64InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ MachineRegisterInfo *MRI;
+
+public:
+ static char ID;
+ AArch64CondBrTuning() : MachineFunctionPass(ID) {
+ initializeAArch64CondBrTuningPass(*PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override { return AARCH64_CONDBR_TUNING_NAME; }
+
+private:
+ MachineInstr *getOperandDef(const MachineOperand &MO);
+ MachineInstr *convertToFlagSetting(MachineInstr &MI, bool IsFlagSetting);
+ MachineInstr *convertToCondBr(MachineInstr &MI);
+ bool tryToTuneBranch(MachineInstr &MI, MachineInstr &DefMI);
+};
+} // end anonymous namespace
+
+char AArch64CondBrTuning::ID = 0;
+
+INITIALIZE_PASS(AArch64CondBrTuning, "aarch64-cond-br-tuning",
+ AARCH64_CONDBR_TUNING_NAME, false, false)
+
+void AArch64CondBrTuning::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineInstr *AArch64CondBrTuning::getOperandDef(const MachineOperand &MO) {
+ if (!TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ return nullptr;
+ return MRI->getUniqueVRegDef(MO.getReg());
+}
+
+MachineInstr *AArch64CondBrTuning::convertToFlagSetting(MachineInstr &MI,
+ bool IsFlagSetting) {
+ // If this is already the flag-setting version of the instruction (e.g., SUBS),
+ // just make sure the implicit-def of NZCV isn't marked dead.
+ if (IsFlagSetting) {
+ for (unsigned I = MI.getNumExplicitOperands(), E = MI.getNumOperands();
+ I != E; ++I) {
+ MachineOperand &MO = MI.getOperand(I);
+ if (MO.isReg() && MO.isDead() && MO.getReg() == AArch64::NZCV)
+ MO.setIsDead(false);
+ }
+ return &MI;
+ }
+ bool Is64Bit;
+ unsigned NewOpc = TII->convertToFlagSettingOpc(MI.getOpcode(), Is64Bit);
+ unsigned NewDestReg = MI.getOperand(0).getReg();
+ if (MRI->hasOneNonDBGUse(MI.getOperand(0).getReg()))
+ NewDestReg = Is64Bit ? AArch64::XZR : AArch64::WZR;
+
+ MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(NewOpc), NewDestReg);
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
+
+ return MIB;
+}
+
+MachineInstr *AArch64CondBrTuning::convertToCondBr(MachineInstr &MI) {
+ AArch64CC::CondCode CC;
+ MachineBasicBlock *TargetMBB = TII->getBranchDestBlock(MI);
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ CC = AArch64CC::EQ;
+ break;
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ CC = AArch64CC::NE;
+ break;
+ case AArch64::TBZW:
+ case AArch64::TBZX:
+ CC = AArch64CC::PL;
+ break;
+ case AArch64::TBNZW:
+ case AArch64::TBNZX:
+ CC = AArch64CC::MI;
+ break;
+ }
+ return BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AArch64::Bcc))
+ .addImm(CC)
+ .addMBB(TargetMBB);
+}
+
+bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
+ MachineInstr &DefMI) {
+ // We don't want NZCV bits live across blocks.
+ if (MI.getParent() != DefMI.getParent())
+ return false;
+
+ bool IsFlagSetting = true;
+ unsigned MIOpc = MI.getOpcode();
+ MachineInstr *NewCmp = nullptr, *NewBr = nullptr;
+ switch (DefMI.getOpcode()) {
+ default:
+ return false;
+ case AArch64::ADDWri:
+ case AArch64::ADDWrr:
+ case AArch64::ADDWrs:
+ case AArch64::ADDWrx:
+ case AArch64::ANDWri:
+ case AArch64::ANDWrr:
+ case AArch64::ANDWrs:
+ case AArch64::BICWrr:
+ case AArch64::BICWrs:
+ case AArch64::SUBWri:
+ case AArch64::SUBWrr:
+ case AArch64::SUBWrs:
+ case AArch64::SUBWrx:
+ IsFlagSetting = false;
+ LLVM_FALLTHROUGH;
+ case AArch64::ADDSWri:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSWrs:
+ case AArch64::ADDSWrx:
+ case AArch64::ANDSWri:
+ case AArch64::ANDSWrr:
+ case AArch64::ANDSWrs:
+ case AArch64::BICSWrr:
+ case AArch64::BICSWrs:
+ case AArch64::SUBSWri:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSWrs:
+ case AArch64::SUBSWrx:
+ switch (MIOpc) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+
+ case AArch64::CBZW:
+ case AArch64::CBNZW:
+ case AArch64::TBZW:
+ case AArch64::TBNZW:
+ // Check to see if the TBZ/TBNZ is checking the sign bit.
+ if ((MIOpc == AArch64::TBZW || MIOpc == AArch64::TBNZW) &&
+ MI.getOperand(1).getImm() != 31)
+ return false;
+
+ // There must not be any instruction between DefMI and MI that clobbers or
+ // reads NZCV.
+ MachineBasicBlock::iterator I(DefMI), E(MI);
+ for (I = std::next(I); I != E; ++I) {
+ if (I->modifiesRegister(AArch64::NZCV, TRI) ||
+ I->readsRegister(AArch64::NZCV, TRI))
+ return false;
+ }
+ DEBUG(dbgs() << " Replacing instructions:\n ");
+ DEBUG(DefMI.print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(MI.print(dbgs()));
+
+ NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
+ NewBr = convertToCondBr(MI);
+ break;
+ }
+ break;
+
+ case AArch64::ADDXri:
+ case AArch64::ADDXrr:
+ case AArch64::ADDXrs:
+ case AArch64::ADDXrx:
+ case AArch64::ANDXri:
+ case AArch64::ANDXrr:
+ case AArch64::ANDXrs:
+ case AArch64::BICXrr:
+ case AArch64::BICXrs:
+ case AArch64::SUBXri:
+ case AArch64::SUBXrr:
+ case AArch64::SUBXrs:
+ case AArch64::SUBXrx:
+ IsFlagSetting = false;
+ LLVM_FALLTHROUGH;
+ case AArch64::ADDSXri:
+ case AArch64::ADDSXrr:
+ case AArch64::ADDSXrs:
+ case AArch64::ADDSXrx:
+ case AArch64::ANDSXri:
+ case AArch64::ANDSXrr:
+ case AArch64::ANDSXrs:
+ case AArch64::BICSXrr:
+ case AArch64::BICSXrs:
+ case AArch64::SUBSXri:
+ case AArch64::SUBSXrr:
+ case AArch64::SUBSXrs:
+ case AArch64::SUBSXrx:
+ switch (MIOpc) {
+ default:
+ llvm_unreachable("Unexpected opcode!");
+
+ case AArch64::CBZX:
+ case AArch64::CBNZX:
+ case AArch64::TBZX:
+ case AArch64::TBNZX: {
+ // Check to see if the TBZ/TBNZ is checking the sign bit.
+ if ((MIOpc == AArch64::TBZX || MIOpc == AArch64::TBNZX) &&
+ MI.getOperand(1).getImm() != 63)
+ return false;
+ // There must not be any instruction between DefMI and MI that clobbers or
+ // reads NZCV.
+ MachineBasicBlock::iterator I(DefMI), E(MI);
+ for (I = std::next(I); I != E; ++I) {
+ if (I->modifiesRegister(AArch64::NZCV, TRI) ||
+ I->readsRegister(AArch64::NZCV, TRI))
+ return false;
+ }
+ DEBUG(dbgs() << " Replacing instructions:\n ");
+ DEBUG(DefMI.print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(MI.print(dbgs()));
+
+ NewCmp = convertToFlagSetting(DefMI, IsFlagSetting);
+ NewBr = convertToCondBr(MI);
+ break;
+ }
+ }
+ break;
+ }
+ (void)NewCmp; (void)NewBr;
+ assert(NewCmp && NewBr && "Expected new instructions.");
+
+ DEBUG(dbgs() << " with instruction:\n ");
+ DEBUG(NewCmp->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(NewBr->print(dbgs()));
+
+ // If this was already a flag-setting version of the instruction, we reuse the
+ // original instruction and just clear the dead flag on the implicit-def of
+ // NZCV. Therefore, we should not erase this instruction.
+ if (!IsFlagSetting)
+ DefMI.eraseFromParent();
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AArch64CondBrTuning::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ DEBUG(dbgs() << "********** AArch64 Conditional Branch Tuning **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+
+ TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ bool LocalChange = false;
+ for (MachineBasicBlock::iterator I = MBB.getFirstTerminator(),
+ E = MBB.end();
+ I != E; ++I) {
+ MachineInstr &MI = *I;
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case AArch64::CBZW:
+ case AArch64::CBZX:
+ case AArch64::CBNZW:
+ case AArch64::CBNZX:
+ case AArch64::TBZW:
+ case AArch64::TBZX:
+ case AArch64::TBNZW:
+ case AArch64::TBNZX:
+ MachineInstr *DefMI = getOperandDef(MI.getOperand(0));
+ LocalChange = (DefMI && tryToTuneBranch(MI, *DefMI));
+ break;
+ }
+ // If the optimization was successful, we can't optimize any other
+ // branches because doing so would clobber the NZCV flags.
+ if (LocalChange) {
+ Changed = true;
+ break;
+ }
+ }
+ }
+ return Changed;
+}
+
+FunctionPass *llvm::createAArch64CondBrTuning() {
+ return new AArch64CondBrTuning();
+}
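A source-level illustration of the first case in the header comment (a hand-written sketch; the function is hypothetical): when the sum is only needed for the comparison, this pass lets the test fold into a flag-setting compare.

// With the pass enabled, the add feeding the branch can become 'cmn w0, w1'
// followed by 'b.eq', instead of materializing the sum into a register and
// testing it with cbz.
int only_compared(int a, int b) {
  if (a + b == 0)
    return 1;
  return 2;
}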
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index 8b18632..2dfcd2d 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -265,10 +265,10 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
// Change immediate in comparison instruction (ADDS or SUBS).
BuildMI(*MBB, CmpMI, CmpMI->getDebugLoc(), TII->get(Opc))
- .addOperand(CmpMI->getOperand(0))
- .addOperand(CmpMI->getOperand(1))
+ .add(CmpMI->getOperand(0))
+ .add(CmpMI->getOperand(1))
.addImm(Imm)
- .addOperand(CmpMI->getOperand(3));
+ .add(CmpMI->getOperand(3));
CmpMI->eraseFromParent();
// The fact that this comparison was picked ensures that it's related to the
@@ -278,7 +278,7 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
// Change condition in branch instruction.
BuildMI(*MBB, BrMI, BrMI.getDebugLoc(), TII->get(AArch64::Bcc))
.addImm(Cmp)
- .addOperand(BrMI.getOperand(1));
+ .add(BrMI.getOperand(1));
BrMI.eraseFromParent();
MBB->updateTerminator();
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index da09b36..9eda56c 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -139,6 +140,7 @@ class SSACCmpConv {
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
MachineRegisterInfo *MRI;
+ const MachineBranchProbabilityInfo *MBPI;
public:
/// The first block containing a conditional branch, dominating everything
@@ -186,8 +188,10 @@ private:
public:
/// runOnMachineFunction - Initialize per-function data structures.
- void runOnMachineFunction(MachineFunction &MF) {
+ void runOnMachineFunction(MachineFunction &MF,
+ const MachineBranchProbabilityInfo *MBPI) {
this->MF = &MF;
+ this->MBPI = MBPI;
TII = MF.getSubtarget().getInstrInfo();
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
@@ -564,8 +568,40 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
// All CmpBB instructions are moved into Head, and CmpBB is deleted.
// Update the CFG first.
updateTailPHIs();
- Head->removeSuccessor(CmpBB, true);
- CmpBB->removeSuccessor(Tail, true);
+
+ // Save successor probabilities before removing CmpBB and Tail from their
+ // parents.
+ BranchProbability Head2CmpBB = MBPI->getEdgeProbability(Head, CmpBB);
+ BranchProbability CmpBB2Tail = MBPI->getEdgeProbability(CmpBB, Tail);
+
+ Head->removeSuccessor(CmpBB);
+ CmpBB->removeSuccessor(Tail);
+
+ // If Head and CmpBB had successor probabilities, update the probabilities to
+ // reflect the ccmp-conversion.
+ if (Head->hasSuccessorProbabilities() && CmpBB->hasSuccessorProbabilities()) {
+
+ // Head is allowed two successors. We've removed CmpBB, so the remaining
+ // successor is Tail. We need to increase the successor probability for
+ // Tail to account for the CmpBB path we removed.
+ //
+ // Pr(Tail|Head) += Pr(CmpBB|Head) * Pr(Tail|CmpBB).
+ assert(*Head->succ_begin() == Tail && "Head successor is not Tail");
+ BranchProbability Head2Tail = MBPI->getEdgeProbability(Head, Tail);
+ Head->setSuccProbability(Head->succ_begin(),
+ Head2Tail + Head2CmpBB * CmpBB2Tail);
+
+ // We will transfer successors of CmpBB to Head in a moment without
+ // normalizing the successor probabilities. Set the successor probabilities
+ // before doing so.
+ //
+ // Pr(I|Head) = Pr(CmpBB|Head) * Pr(I|CmpBB).
+ for (auto I = CmpBB->succ_begin(), E = CmpBB->succ_end(); I != E; ++I) {
+ BranchProbability CmpBB2I = MBPI->getEdgeProbability(CmpBB, *I);
+ CmpBB->setSuccProbability(I, Head2CmpBB * CmpBB2I);
+ }
+ }
+
Head->transferSuccessorsAndUpdatePHIs(CmpBB);
DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc();
TII->removeBranch(*Head);
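A worked example of the probability update (numbers invented for illustration): suppose Pr(CmpBB|Head) = 0.4, Pr(Tail|Head) = 0.6, and CmpBB splits evenly between Tail and one other successor I, so Pr(Tail|CmpBB) = Pr(I|CmpBB) = 0.5. After the conversion the Head->Tail edge becomes 0.6 + 0.4 * 0.5 = 0.8, and the transferred Head->I edge gets 0.4 * 0.5 = 0.2, so Head's outgoing probabilities still sum to 1.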
@@ -594,7 +630,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
// Insert a SUBS Rn, #0 instruction instead of the cbz / cbnz.
BuildMI(*Head, Head->end(), TermDL, MCID)
.addReg(DestReg, RegState::Define | RegState::Dead)
- .addOperand(HeadCond[2])
+ .add(HeadCond[2])
.addImm(0)
.addImm(0);
// SUBS uses the GPR*sp register classes.
@@ -650,13 +686,12 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
if (CmpMI->getOperand(FirstOp + 1).isReg())
MRI->constrainRegClass(CmpMI->getOperand(FirstOp + 1).getReg(),
TII->getRegClass(MCID, 1, TRI, *MF));
- MachineInstrBuilder MIB =
- BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
- .addOperand(CmpMI->getOperand(FirstOp)); // Register Rn
+ MachineInstrBuilder MIB = BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), MCID)
+ .add(CmpMI->getOperand(FirstOp)); // Register Rn
if (isZBranch)
MIB.addImm(0); // cbz/cbnz Rn -> ccmp Rn, #0
else
- MIB.addOperand(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
+ MIB.add(CmpMI->getOperand(FirstOp + 1)); // Register Rm / Immediate
MIB.addImm(NZCV).addImm(HeadCmpBBCC);
// If CmpMI was a terminator, we need a new conditional branch to replace it.
@@ -666,7 +701,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
CmpMI->getOpcode() == AArch64::CBNZX;
BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc))
.addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ)
- .addOperand(CmpMI->getOperand(1)); // Branch target.
+ .add(CmpMI->getOperand(1)); // Branch target.
}
CmpMI->eraseFromParent();
Head->updateTerminator();
@@ -718,6 +753,7 @@ int SSACCmpConv::expectedCodeSizeDelta() const {
namespace {
class AArch64ConditionalCompares : public MachineFunctionPass {
+ const MachineBranchProbabilityInfo *MBPI;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
MCSchedModel SchedModel;
@@ -754,6 +790,7 @@ char AArch64ConditionalCompares::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
"AArch64 CCMP Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
@@ -764,6 +801,7 @@ FunctionPass *llvm::createAArch64ConditionalCompares() {
}
void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
@@ -893,12 +931,13 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
DomTree = &getAnalysis<MachineDominatorTree>();
Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
Traces = &getAnalysis<MachineTraceMetrics>();
MinInstr = nullptr;
MinSize = MF.getFunction()->optForMinSize();
bool Changed = false;
- CmpConv.runOnMachineFunction(MF);
+ CmpConv.runOnMachineFunction(MF, MBPI);
// Visit blocks in dominator tree pre-order. The pre-order enables multiple
// cmp-conversions from the same head block.
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
index 30e2b23..b72f23b 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -13,15 +13,17 @@
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
#define DEBUG_TYPE "aarch64-dead-defs"
@@ -84,6 +86,55 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock(
DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n");
continue;
}
+ if (MF.getSubtarget<AArch64Subtarget>().hasLSE()) {
+ // XZR/WZR can only be used with LSE atomics when acquire semantics are not
+ // used; LDOPAL WZ is an invalid opcode.
+ switch (MI.getOpcode()) {
+ case AArch64::CASALb:
+ case AArch64::CASALh:
+ case AArch64::CASALs:
+ case AArch64::CASALd:
+ case AArch64::SWPALb:
+ case AArch64::SWPALh:
+ case AArch64::SWPALs:
+ case AArch64::SWPALd:
+ case AArch64::LDADDALb:
+ case AArch64::LDADDALh:
+ case AArch64::LDADDALs:
+ case AArch64::LDADDALd:
+ case AArch64::LDCLRALb:
+ case AArch64::LDCLRALh:
+ case AArch64::LDCLRALs:
+ case AArch64::LDCLRALd:
+ case AArch64::LDEORALb:
+ case AArch64::LDEORALh:
+ case AArch64::LDEORALs:
+ case AArch64::LDEORALd:
+ case AArch64::LDSETALb:
+ case AArch64::LDSETALh:
+ case AArch64::LDSETALs:
+ case AArch64::LDSETALd:
+ case AArch64::LDSMINALb:
+ case AArch64::LDSMINALh:
+ case AArch64::LDSMINALs:
+ case AArch64::LDSMINALd:
+ case AArch64::LDSMAXALb:
+ case AArch64::LDSMAXALh:
+ case AArch64::LDSMAXALs:
+ case AArch64::LDSMAXALd:
+ case AArch64::LDUMINALb:
+ case AArch64::LDUMINALh:
+ case AArch64::LDUMINALs:
+ case AArch64::LDUMINALd:
+ case AArch64::LDUMAXALb:
+ case AArch64::LDUMAXALh:
+ case AArch64::LDUMAXALs:
+ case AArch64::LDUMAXALd:
+ continue;
+ default:
+ break;
+ }
+ }
const MCInstrDesc &Desc = MI.getDesc();
for (int I = 0, E = Desc.getNumDefs(); I != E; ++I) {
MachineOperand &MO = MI.getOperand(I);
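A hedged source-level example of why these acquire forms are skipped (the function is hypothetical, and the mapping to an LDADDAL-style instruction is an assumption about instruction selection with +lse):

#include <atomic>

// The result of the fetch_add is dead, which is the pattern this pass would
// normally rewrite to use WZR/XZR. For the acquire (AL) LSE forms listed
// above, the zero-register destination cannot be used, so the instruction is
// skipped and keeps its real destination register.
void bump(std::atomic<unsigned> &counter) {
  counter.fetch_add(1, std::memory_order_seq_cst); // dead definition
}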
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index fe1c0be..d52cd84 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -14,9 +14,10 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/AArch64AddressingModes.h"
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -70,9 +71,9 @@ static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
const MachineOperand &MO = OldMI.getOperand(i);
assert(MO.isReg() && MO.getReg());
if (MO.isUse())
- UseMI.addOperand(MO);
+ UseMI.add(MO);
else
- DefMI.addOperand(MO);
+ DefMI.add(MO);
}
}
@@ -112,7 +113,7 @@ static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
// Create the ORR-immediate instruction.
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(AArch64::XZR)
.addImm(Encoding);
@@ -179,7 +180,7 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
// Create the ORR-immediate instruction.
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(AArch64::XZR)
.addImm(Encoding);
@@ -362,7 +363,7 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(AArch64::XZR)
.addImm(Encoding);
@@ -425,7 +426,7 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
.addImm(Encoding);
transferImpOps(MI, MIB, MIB);
@@ -539,15 +540,15 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
if (Imm != 0) {
unsigned LZ = countLeadingZeros(Imm);
unsigned TZ = countTrailingZeros(Imm);
- Shift = ((63 - LZ) / 16) * 16;
- LastShift = (TZ / 16) * 16;
+ Shift = (TZ / 16) * 16;
+ LastShift = ((63 - LZ) / 16) * 16;
}
unsigned Imm16 = (Imm >> Shift) & Mask;
bool DstIsDead = MI.getOperand(0).isDead();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
.addReg(DstReg, RegState::Define |
- getDeadRegState(DstIsDead && Shift == LastShift))
+ getDeadRegState(DstIsDead && Shift == LastShift))
.addImm(Imm16)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
@@ -564,15 +565,15 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
MachineInstrBuilder MIB2;
unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
- while (Shift != LastShift) {
- Shift -= 16;
+ while (Shift < LastShift) {
+ Shift += 16;
Imm16 = (Imm >> Shift) & Mask;
if (Imm16 == (isNeg ? Mask : 0))
continue; // This 16-bit portion is already set correctly.
MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
.addReg(DstReg,
RegState::Define |
- getDeadRegState(DstIsDead && Shift == LastShift))
+ getDeadRegState(DstIsDead && Shift == LastShift))
.addReg(DstReg)
.addImm(Imm16)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
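A standalone sketch of the chunk ordering after this change, for a made-up positive constant (it mirrors the Shift/LastShift logic above but is not the pass itself; the negated-immediate path is omitted):

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Imm = 0x0012340000005678ULL;                         // made-up value
  unsigned Shift = (__builtin_ctzll(Imm) / 16) * 16;            // lowest chunk: 0
  unsigned LastShift = ((63 - __builtin_clzll(Imm)) / 16) * 16; // highest: 48
  printf("movz #0x%llx, lsl #%u\n",
         (unsigned long long)((Imm >> Shift) & 0xffff), Shift);
  while (Shift < LastShift) {
    Shift += 16;
    uint64_t Imm16 = (Imm >> Shift) & 0xffff;
    if (Imm16 == 0)
      continue; // this 16-bit chunk is already zero
    printf("movk #0x%llx, lsl #%u\n", (unsigned long long)Imm16, Shift);
  }
  return 0;
}

For this constant the sketch prints movz #0x5678, lsl #0, then movk #0x3400, lsl #32 and movk #0x12, lsl #48, i.e. the expansion now starts at the lowest non-zero chunk and works upward.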
@@ -583,27 +584,21 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
return true;
}
-static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) {
- for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
- MBB->addLiveIn(*I);
-}
-
bool AArch64ExpandPseudo::expandCMP_SWAP(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp,
unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg,
MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
- MachineOperand &Dest = MI.getOperand(0);
+ const MachineOperand &Dest = MI.getOperand(0);
unsigned StatusReg = MI.getOperand(1).getReg();
- MachineOperand &Addr = MI.getOperand(2);
- MachineOperand &Desired = MI.getOperand(3);
- MachineOperand &New = MI.getOperand(4);
-
- LivePhysRegs LiveRegs(&TII->getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- for (auto I = std::prev(MBB.end()); I != MBBI; --I)
- LiveRegs.stepBackward(*I);
+ bool StatusDead = MI.getOperand(1).isDead();
+ // Duplicating undef operands into 2 instructions does not guarantee the same
+ // value on both; however, undef should be replaced by xzr anyway.
+ assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
+ unsigned AddrReg = MI.getOperand(2).getReg();
+ unsigned DesiredReg = MI.getOperand(3).getReg();
+ unsigned NewReg = MI.getOperand(4).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -615,19 +610,18 @@ bool AArch64ExpandPseudo::expandCMP_SWAP(
MF->insert(++StoreBB->getIterator(), DoneBB);
// .Lloadcmp:
+ // mov wStatus, 0
// ldaxr xDest, [xAddr]
// cmp xDest, xDesired
// b.ne .Ldone
- LoadCmpBB->addLiveIn(Addr.getReg());
- LoadCmpBB->addLiveIn(Dest.getReg());
- LoadCmpBB->addLiveIn(Desired.getReg());
- addPostLoopLiveIns(LoadCmpBB, LiveRegs);
-
+ if (!StatusDead)
+ BuildMI(LoadCmpBB, DL, TII->get(AArch64::MOVZWi), StatusReg)
+ .addImm(0).addImm(0);
BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg())
- .addReg(Addr.getReg());
+ .addReg(AddrReg);
BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg)
.addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
- .addOperand(Desired)
+ .addReg(DesiredReg)
.addImm(ExtendImm);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
.addImm(AArch64CC::NE)
@@ -639,27 +633,35 @@ bool AArch64ExpandPseudo::expandCMP_SWAP(
// .Lstore:
// stlxr wStatus, xNew, [xAddr]
// cbnz wStatus, .Lloadcmp
- StoreBB->addLiveIn(Addr.getReg());
- StoreBB->addLiveIn(New.getReg());
- addPostLoopLiveIns(StoreBB, LiveRegs);
-
BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg)
- .addOperand(New)
- .addOperand(Addr);
+ .addReg(NewReg)
+ .addReg(AddrReg);
BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
- .addReg(StatusReg, RegState::Kill)
+ .addReg(StatusReg, getKillRegState(StatusDead))
.addMBB(LoadCmpBB);
StoreBB->addSuccessor(LoadCmpBB);
StoreBB->addSuccessor(DoneBB);
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
- addPostLoopLiveIns(DoneBB, LiveRegs);
MBB.addSuccessor(LoadCmpBB);
NextMBBI = MBB.end();
MI.eraseFromParent();
+
+ // Recompute livein lists.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ LivePhysRegs LiveRegs;
+ computeLiveIns(LiveRegs, MRI, *DoneBB);
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+ // Do an extra pass around the loop to get loop carried registers right.
+ StoreBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ LoadCmpBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+
return true;
}
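For reference, a source-level operation that lowers through this pseudo, with the expected shape of the expansion summarized in comments (labels and register names are illustrative, taken from the block comments above):

#include <atomic>

// Expected expansion shape (illustrative):
//   .Lloadcmp: mov   wStatus, #0        ; only emitted if wStatus is not dead
//              ldaxr wDest, [xAddr]
//              cmp   wDest, wDesired
//              b.ne  .Ldone
//   .Lstore:   stlxr wStatus, wNew, [xAddr]
//              cbnz  wStatus, .Lloadcmp
//   .Ldone:
// Block live-ins are now recomputed with computeLiveIns instead of being
// carried forward by the removed addPostLoopLiveIns helper.
bool cas(std::atomic<unsigned> &v, unsigned expected, unsigned desired) {
  return v.compare_exchange_strong(expected, desired);
}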
@@ -672,16 +674,15 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
MachineOperand &DestLo = MI.getOperand(0);
MachineOperand &DestHi = MI.getOperand(1);
unsigned StatusReg = MI.getOperand(2).getReg();
- MachineOperand &Addr = MI.getOperand(3);
- MachineOperand &DesiredLo = MI.getOperand(4);
- MachineOperand &DesiredHi = MI.getOperand(5);
- MachineOperand &NewLo = MI.getOperand(6);
- MachineOperand &NewHi = MI.getOperand(7);
-
- LivePhysRegs LiveRegs(&TII->getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- for (auto I = std::prev(MBB.end()); I != MBBI; --I)
- LiveRegs.stepBackward(*I);
+ bool StatusDead = MI.getOperand(2).isDead();
+ // Duplicating undef operands into 2 instructions does not guarantee the same
+ // value on both; however, undef should be replaced by xzr anyway.
+ assert(!MI.getOperand(3).isUndef() && "cannot handle undef");
+ unsigned AddrReg = MI.getOperand(3).getReg();
+ unsigned DesiredLoReg = MI.getOperand(4).getReg();
+ unsigned DesiredHiReg = MI.getOperand(5).getReg();
+ unsigned NewLoReg = MI.getOperand(6).getReg();
+ unsigned NewHiReg = MI.getOperand(7).getReg();
MachineFunction *MF = MBB.getParent();
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -697,20 +698,13 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
// cmp xDestLo, xDesiredLo
// sbcs xDestHi, xDesiredHi
// b.ne .Ldone
- LoadCmpBB->addLiveIn(Addr.getReg());
- LoadCmpBB->addLiveIn(DestLo.getReg());
- LoadCmpBB->addLiveIn(DestHi.getReg());
- LoadCmpBB->addLiveIn(DesiredLo.getReg());
- LoadCmpBB->addLiveIn(DesiredHi.getReg());
- addPostLoopLiveIns(LoadCmpBB, LiveRegs);
-
BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
.addReg(DestLo.getReg(), RegState::Define)
.addReg(DestHi.getReg(), RegState::Define)
- .addReg(Addr.getReg());
+ .addReg(AddrReg);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
.addReg(DestLo.getReg(), getKillRegState(DestLo.isDead()))
- .addOperand(DesiredLo)
+ .addReg(DesiredLoReg)
.addImm(0);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
.addUse(AArch64::WZR)
@@ -718,14 +712,14 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
.addImm(AArch64CC::EQ);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
.addReg(DestHi.getReg(), getKillRegState(DestHi.isDead()))
- .addOperand(DesiredHi)
+ .addReg(DesiredHiReg)
.addImm(0);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
.addUse(StatusReg, RegState::Kill)
.addUse(StatusReg, RegState::Kill)
.addImm(AArch64CC::EQ);
BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW))
- .addUse(StatusReg, RegState::Kill)
+ .addUse(StatusReg, getKillRegState(StatusDead))
.addMBB(DoneBB);
LoadCmpBB->addSuccessor(DoneBB);
LoadCmpBB->addSuccessor(StoreBB);
@@ -733,28 +727,36 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
// .Lstore:
// stlxp wStatus, xNewLo, xNewHi, [xAddr]
// cbnz wStatus, .Lloadcmp
- StoreBB->addLiveIn(Addr.getReg());
- StoreBB->addLiveIn(NewLo.getReg());
- StoreBB->addLiveIn(NewHi.getReg());
- addPostLoopLiveIns(StoreBB, LiveRegs);
BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
- .addOperand(NewLo)
- .addOperand(NewHi)
- .addOperand(Addr);
+ .addReg(NewLoReg)
+ .addReg(NewHiReg)
+ .addReg(AddrReg);
BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
- .addReg(StatusReg, RegState::Kill)
+ .addReg(StatusReg, getKillRegState(StatusDead))
.addMBB(LoadCmpBB);
StoreBB->addSuccessor(LoadCmpBB);
StoreBB->addSuccessor(DoneBB);
DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
DoneBB->transferSuccessors(&MBB);
- addPostLoopLiveIns(DoneBB, LiveRegs);
MBB.addSuccessor(LoadCmpBB);
NextMBBI = MBB.end();
MI.eraseFromParent();
+
+ // Recompute liveness bottom up.
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ LivePhysRegs LiveRegs;
+ computeLiveIns(LiveRegs, MRI, *DoneBB);
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+ // Do an extra pass in the loop to get the loop carried dependencies right.
+ StoreBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *StoreBB);
+ LoadCmpBB->clearLiveIns();
+ computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
+
return true;
}
@@ -825,8 +827,8 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
MI.getOperand(0).getReg())
- .addOperand(MI.getOperand(1))
- .addOperand(MI.getOperand(2))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
transferImpOps(MI, MIB1, MIB1);
MI.eraseFromParent();
@@ -842,7 +844,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
MachineInstrBuilder MIB2 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(DstReg);
if (MO1.isGlobal()) {
@@ -878,19 +880,31 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
unsigned DstReg = MI.getOperand(0).getReg();
MachineInstrBuilder MIB1 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
- .addOperand(MI.getOperand(1));
+ .add(MI.getOperand(1));
MachineInstrBuilder MIB2 =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
- .addOperand(MI.getOperand(0))
+ .add(MI.getOperand(0))
.addReg(DstReg)
- .addOperand(MI.getOperand(2))
+ .add(MI.getOperand(2))
.addImm(0);
transferImpOps(MI, MIB1, MIB2);
MI.eraseFromParent();
return true;
}
+ case AArch64::MOVbaseTLS: {
+ unsigned DstReg = MI.getOperand(0).getReg();
+ auto SysReg = AArch64SysReg::TPIDR_EL0;
+ MachineFunction *MF = MBB.getParent();
+ if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
+ MF->getTarget().getCodeModel() == CodeModel::Kernel)
+ SysReg = AArch64SysReg::TPIDR_EL1;
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MRS), DstReg)
+ .addImm(SysReg);
+ MI.eraseFromParent();
+ return true;
+ }
case AArch64::MOVi32imm:
return expandMOVImm(MBB, MBBI, 32);
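A minimal source-level trigger for the MOVbaseTLS expansion added above (hypothetical translation unit): any thread_local access needs the thread pointer, which is now read from TPIDR_EL0, or from TPIDR_EL1 when targeting the Fuchsia kernel code model.

// Reading or writing a thread_local forces an MRS of the thread-pointer
// system register on AArch64.
thread_local int tls_counter = 0;

int next_id() { return ++tls_counter; }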
@@ -931,6 +945,19 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
AArch64::XZR, NextMBBI);
case AArch64::CMP_SWAP_128:
return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);
+
+ case AArch64::AESMCrrTied:
+ case AArch64::AESIMCrrTied: {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::AESMCrrTied ? AArch64::AESMCrr :
+ AArch64::AESIMCrr))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1));
+ transferImpOps(MI, MIB, MIB);
+ MI.eraseFromParent();
+ return true;
+ }
}
return false;
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
new file mode 100644
index 0000000..c0e2235
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -0,0 +1,790 @@
+//===-- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions
+/// that may inhibit the HW prefetching. This is done in two steps. Before
+/// ISel, we mark strided loads (i.e. those that will likely benefit from
+/// prefetching) with metadata. Then, after opcodes have been finalized, we
+/// insert MOVs and re-write loads to prevent unintentional tag collisions.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "falkor-hwpf-fix"
+
+STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
+STATISTIC(NumCollisionsAvoided,
+ "Number of HW prefetch tag collisions avoided");
+STATISTIC(NumCollisionsNotAvoided,
+ "Number of HW prefetch tag collisions not avoided due to lack of regsiters");
+
+namespace {
+
+class FalkorMarkStridedAccesses {
+public:
+ FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE)
+ : LI(LI), SE(SE) {}
+
+ bool run();
+
+private:
+ bool runOnLoop(Loop &L);
+
+ LoopInfo &LI;
+ ScalarEvolution &SE;
+};
+
+class FalkorMarkStridedAccessesLegacy : public FunctionPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {
+ initializeFalkorMarkStridedAccessesLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ // FIXME: For some reason, preserving SE here breaks LSR (even if
+ // this pass changes nothing).
+ // AU.addPreserved<ScalarEvolutionWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
+char FalkorMarkStridedAccessesLegacy::ID = 0;
+INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
+ "Falkor HW Prefetch Fix", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
+ "Falkor HW Prefetch Fix", false, false)
+
+FunctionPass *llvm::createFalkorMarkStridedAccessesPass() {
+ return new FalkorMarkStridedAccessesLegacy();
+}
+
+bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {
+ TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ const AArch64Subtarget *ST =
+ TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F);
+ if (ST->getProcFamily() != AArch64Subtarget::Falkor)
+ return false;
+
+ if (skipFunction(F))
+ return false;
+
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+ FalkorMarkStridedAccesses LDP(LI, SE);
+ return LDP.run();
+}
+
+bool FalkorMarkStridedAccesses::run() {
+ bool MadeChange = false;
+
+ for (Loop *L : LI)
+ for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt)
+ MadeChange |= runOnLoop(**LIt);
+
+ return MadeChange;
+}
+
+bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {
+ // Only mark strided loads in the innermost loop.
+ if (!L.empty())
+ return false;
+
+ bool MadeChange = false;
+
+ for (BasicBlock *BB : L.blocks()) {
+ for (Instruction &I : *BB) {
+ LoadInst *LoadI = dyn_cast<LoadInst>(&I);
+ if (!LoadI)
+ continue;
+
+ Value *PtrValue = LoadI->getPointerOperand();
+ if (L.isLoopInvariant(PtrValue))
+ continue;
+
+ const SCEV *LSCEV = SE.getSCEV(PtrValue);
+ const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+ if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
+ continue;
+
+ LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,
+ MDNode::get(LoadI->getContext(), {}));
+ ++NumStridedLoadsMarked;
+ DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
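A hedged C-level sketch of the kind of load this marks (the function is made up): the address p + i * stride is an affine SCEVAddRecExpr in the innermost loop and is not loop-invariant, so the load receives the strided-access metadata.

int sum_strided(const int *p, int n, int stride) {
  int s = 0;
  for (int i = 0; i < n; ++i)
    s += p[(long)i * stride]; // candidate for FALKOR_STRIDED_ACCESS_MD
  return s;
}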
+
+namespace {
+
+class FalkorHWPFFix : public MachineFunctionPass {
+public:
+ static char ID;
+
+ FalkorHWPFFix() : MachineFunctionPass(ID) {
+ initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+private:
+ void runOnLoop(MachineLoop &L, MachineFunction &Fn);
+
+ const AArch64InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ DenseMap<unsigned, SmallVector<MachineInstr *, 4>> TagMap;
+ bool Modified;
+};
+
+/// Bits from load opcodes used to compute HW prefetcher instruction tags.
+struct LoadInfo {
+ LoadInfo()
+ : DestReg(0), BaseReg(0), BaseRegIdx(-1), OffsetOpnd(nullptr),
+ IsPrePost(false) {}
+ unsigned DestReg;
+ unsigned BaseReg;
+ int BaseRegIdx;
+ const MachineOperand *OffsetOpnd;
+ bool IsPrePost;
+};
+
+} // namespace
+
+char FalkorHWPFFix::ID = 0;
+
+INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late",
+ "Falkor HW Prefetch Fix Late Phase", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late",
+ "Falkor HW Prefetch Fix Late Phase", false, false)
+
+static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
+ return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
+}
+
+static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
+ int DestRegIdx;
+ int BaseRegIdx;
+ int OffsetIdx;
+ bool IsPrePost;
+
+ switch (MI.getOpcode()) {
+ default:
+ return None;
+
+ case AArch64::LD1i8:
+ case AArch64::LD1i16:
+ case AArch64::LD1i32:
+ case AArch64::LD1i64:
+ case AArch64::LD2i8:
+ case AArch64::LD2i16:
+ case AArch64::LD2i32:
+ case AArch64::LD2i64:
+ case AArch64::LD3i8:
+ case AArch64::LD3i16:
+ case AArch64::LD3i32:
+ case AArch64::LD4i8:
+ case AArch64::LD4i16:
+ case AArch64::LD4i32:
+ DestRegIdx = 0;
+ BaseRegIdx = 3;
+ OffsetIdx = -1;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD3i64:
+ case AArch64::LD4i64:
+ DestRegIdx = -1;
+ BaseRegIdx = 3;
+ OffsetIdx = -1;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD1Onev1d:
+ case AArch64::LD1Onev2s:
+ case AArch64::LD1Onev4h:
+ case AArch64::LD1Onev8b:
+ case AArch64::LD1Onev2d:
+ case AArch64::LD1Onev4s:
+ case AArch64::LD1Onev8h:
+ case AArch64::LD1Onev16b:
+ case AArch64::LD1Rv1d:
+ case AArch64::LD1Rv2s:
+ case AArch64::LD1Rv4h:
+ case AArch64::LD1Rv8b:
+ case AArch64::LD1Rv2d:
+ case AArch64::LD1Rv4s:
+ case AArch64::LD1Rv8h:
+ case AArch64::LD1Rv16b:
+ case AArch64::LD1Twov1d:
+ case AArch64::LD1Twov2s:
+ case AArch64::LD1Twov4h:
+ case AArch64::LD1Twov8b:
+ case AArch64::LD2Twov2s:
+ case AArch64::LD2Twov4s:
+ case AArch64::LD2Twov8b:
+ case AArch64::LD2Rv1d:
+ case AArch64::LD2Rv2s:
+ case AArch64::LD2Rv4s:
+ case AArch64::LD2Rv8b:
+ DestRegIdx = 0;
+ BaseRegIdx = 1;
+ OffsetIdx = -1;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD1Twov2d:
+ case AArch64::LD1Twov4s:
+ case AArch64::LD1Twov8h:
+ case AArch64::LD1Twov16b:
+ case AArch64::LD1Threev1d:
+ case AArch64::LD1Threev2s:
+ case AArch64::LD1Threev4h:
+ case AArch64::LD1Threev8b:
+ case AArch64::LD1Threev2d:
+ case AArch64::LD1Threev4s:
+ case AArch64::LD1Threev8h:
+ case AArch64::LD1Threev16b:
+ case AArch64::LD1Fourv1d:
+ case AArch64::LD1Fourv2s:
+ case AArch64::LD1Fourv4h:
+ case AArch64::LD1Fourv8b:
+ case AArch64::LD1Fourv2d:
+ case AArch64::LD1Fourv4s:
+ case AArch64::LD1Fourv8h:
+ case AArch64::LD1Fourv16b:
+ case AArch64::LD2Twov2d:
+ case AArch64::LD2Twov4h:
+ case AArch64::LD2Twov8h:
+ case AArch64::LD2Twov16b:
+ case AArch64::LD2Rv2d:
+ case AArch64::LD2Rv4h:
+ case AArch64::LD2Rv8h:
+ case AArch64::LD2Rv16b:
+ case AArch64::LD3Threev2s:
+ case AArch64::LD3Threev4h:
+ case AArch64::LD3Threev8b:
+ case AArch64::LD3Threev2d:
+ case AArch64::LD3Threev4s:
+ case AArch64::LD3Threev8h:
+ case AArch64::LD3Threev16b:
+ case AArch64::LD3Rv1d:
+ case AArch64::LD3Rv2s:
+ case AArch64::LD3Rv4h:
+ case AArch64::LD3Rv8b:
+ case AArch64::LD3Rv2d:
+ case AArch64::LD3Rv4s:
+ case AArch64::LD3Rv8h:
+ case AArch64::LD3Rv16b:
+ case AArch64::LD4Fourv2s:
+ case AArch64::LD4Fourv4h:
+ case AArch64::LD4Fourv8b:
+ case AArch64::LD4Fourv2d:
+ case AArch64::LD4Fourv4s:
+ case AArch64::LD4Fourv8h:
+ case AArch64::LD4Fourv16b:
+ case AArch64::LD4Rv1d:
+ case AArch64::LD4Rv2s:
+ case AArch64::LD4Rv4h:
+ case AArch64::LD4Rv8b:
+ case AArch64::LD4Rv2d:
+ case AArch64::LD4Rv4s:
+ case AArch64::LD4Rv8h:
+ case AArch64::LD4Rv16b:
+ DestRegIdx = -1;
+ BaseRegIdx = 1;
+ OffsetIdx = -1;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD1i8_POST:
+ case AArch64::LD1i16_POST:
+ case AArch64::LD1i32_POST:
+ case AArch64::LD1i64_POST:
+ case AArch64::LD2i8_POST:
+ case AArch64::LD2i16_POST:
+ case AArch64::LD2i32_POST:
+ case AArch64::LD2i64_POST:
+ case AArch64::LD3i8_POST:
+ case AArch64::LD3i16_POST:
+ case AArch64::LD3i32_POST:
+ case AArch64::LD4i8_POST:
+ case AArch64::LD4i16_POST:
+ case AArch64::LD4i32_POST:
+ DestRegIdx = 1;
+ BaseRegIdx = 4;
+ OffsetIdx = 5;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD3i64_POST:
+ case AArch64::LD4i64_POST:
+ DestRegIdx = -1;
+ BaseRegIdx = 4;
+ OffsetIdx = 5;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD1Onev1d_POST:
+ case AArch64::LD1Onev2s_POST:
+ case AArch64::LD1Onev4h_POST:
+ case AArch64::LD1Onev8b_POST:
+ case AArch64::LD1Onev2d_POST:
+ case AArch64::LD1Onev4s_POST:
+ case AArch64::LD1Onev8h_POST:
+ case AArch64::LD1Onev16b_POST:
+ case AArch64::LD1Rv1d_POST:
+ case AArch64::LD1Rv2s_POST:
+ case AArch64::LD1Rv4h_POST:
+ case AArch64::LD1Rv8b_POST:
+ case AArch64::LD1Rv2d_POST:
+ case AArch64::LD1Rv4s_POST:
+ case AArch64::LD1Rv8h_POST:
+ case AArch64::LD1Rv16b_POST:
+ case AArch64::LD1Twov1d_POST:
+ case AArch64::LD1Twov2s_POST:
+ case AArch64::LD1Twov4h_POST:
+ case AArch64::LD1Twov8b_POST:
+ case AArch64::LD2Twov2s_POST:
+ case AArch64::LD2Twov4s_POST:
+ case AArch64::LD2Twov8b_POST:
+ case AArch64::LD2Rv1d_POST:
+ case AArch64::LD2Rv2s_POST:
+ case AArch64::LD2Rv4s_POST:
+ case AArch64::LD2Rv8b_POST:
+ DestRegIdx = 1;
+ BaseRegIdx = 2;
+ OffsetIdx = 3;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LD1Twov2d_POST:
+ case AArch64::LD1Twov4s_POST:
+ case AArch64::LD1Twov8h_POST:
+ case AArch64::LD1Twov16b_POST:
+ case AArch64::LD1Threev1d_POST:
+ case AArch64::LD1Threev2s_POST:
+ case AArch64::LD1Threev4h_POST:
+ case AArch64::LD1Threev8b_POST:
+ case AArch64::LD1Threev2d_POST:
+ case AArch64::LD1Threev4s_POST:
+ case AArch64::LD1Threev8h_POST:
+ case AArch64::LD1Threev16b_POST:
+ case AArch64::LD1Fourv1d_POST:
+ case AArch64::LD1Fourv2s_POST:
+ case AArch64::LD1Fourv4h_POST:
+ case AArch64::LD1Fourv8b_POST:
+ case AArch64::LD1Fourv2d_POST:
+ case AArch64::LD1Fourv4s_POST:
+ case AArch64::LD1Fourv8h_POST:
+ case AArch64::LD1Fourv16b_POST:
+ case AArch64::LD2Twov2d_POST:
+ case AArch64::LD2Twov4h_POST:
+ case AArch64::LD2Twov8h_POST:
+ case AArch64::LD2Twov16b_POST:
+ case AArch64::LD2Rv2d_POST:
+ case AArch64::LD2Rv4h_POST:
+ case AArch64::LD2Rv8h_POST:
+ case AArch64::LD2Rv16b_POST:
+ case AArch64::LD3Threev2s_POST:
+ case AArch64::LD3Threev4h_POST:
+ case AArch64::LD3Threev8b_POST:
+ case AArch64::LD3Threev2d_POST:
+ case AArch64::LD3Threev4s_POST:
+ case AArch64::LD3Threev8h_POST:
+ case AArch64::LD3Threev16b_POST:
+ case AArch64::LD3Rv1d_POST:
+ case AArch64::LD3Rv2s_POST:
+ case AArch64::LD3Rv4h_POST:
+ case AArch64::LD3Rv8b_POST:
+ case AArch64::LD3Rv2d_POST:
+ case AArch64::LD3Rv4s_POST:
+ case AArch64::LD3Rv8h_POST:
+ case AArch64::LD3Rv16b_POST:
+ case AArch64::LD4Fourv2s_POST:
+ case AArch64::LD4Fourv4h_POST:
+ case AArch64::LD4Fourv8b_POST:
+ case AArch64::LD4Fourv2d_POST:
+ case AArch64::LD4Fourv4s_POST:
+ case AArch64::LD4Fourv8h_POST:
+ case AArch64::LD4Fourv16b_POST:
+ case AArch64::LD4Rv1d_POST:
+ case AArch64::LD4Rv2s_POST:
+ case AArch64::LD4Rv4h_POST:
+ case AArch64::LD4Rv8b_POST:
+ case AArch64::LD4Rv2d_POST:
+ case AArch64::LD4Rv4s_POST:
+ case AArch64::LD4Rv8h_POST:
+ case AArch64::LD4Rv16b_POST:
+ DestRegIdx = -1;
+ BaseRegIdx = 2;
+ OffsetIdx = 3;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LDRBBroW:
+ case AArch64::LDRBBroX:
+ case AArch64::LDRBBui:
+ case AArch64::LDRBroW:
+ case AArch64::LDRBroX:
+ case AArch64::LDRBui:
+ case AArch64::LDRDl:
+ case AArch64::LDRDroW:
+ case AArch64::LDRDroX:
+ case AArch64::LDRDui:
+ case AArch64::LDRHHroW:
+ case AArch64::LDRHHroX:
+ case AArch64::LDRHHui:
+ case AArch64::LDRHroW:
+ case AArch64::LDRHroX:
+ case AArch64::LDRHui:
+ case AArch64::LDRQl:
+ case AArch64::LDRQroW:
+ case AArch64::LDRQroX:
+ case AArch64::LDRQui:
+ case AArch64::LDRSBWroW:
+ case AArch64::LDRSBWroX:
+ case AArch64::LDRSBWui:
+ case AArch64::LDRSBXroW:
+ case AArch64::LDRSBXroX:
+ case AArch64::LDRSBXui:
+ case AArch64::LDRSHWroW:
+ case AArch64::LDRSHWroX:
+ case AArch64::LDRSHWui:
+ case AArch64::LDRSHXroW:
+ case AArch64::LDRSHXroX:
+ case AArch64::LDRSHXui:
+ case AArch64::LDRSWl:
+ case AArch64::LDRSWroW:
+ case AArch64::LDRSWroX:
+ case AArch64::LDRSWui:
+ case AArch64::LDRSl:
+ case AArch64::LDRSroW:
+ case AArch64::LDRSroX:
+ case AArch64::LDRSui:
+ case AArch64::LDRWl:
+ case AArch64::LDRWroW:
+ case AArch64::LDRWroX:
+ case AArch64::LDRWui:
+ case AArch64::LDRXl:
+ case AArch64::LDRXroW:
+ case AArch64::LDRXroX:
+ case AArch64::LDRXui:
+ case AArch64::LDURBBi:
+ case AArch64::LDURBi:
+ case AArch64::LDURDi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURHi:
+ case AArch64::LDURQi:
+ case AArch64::LDURSBWi:
+ case AArch64::LDURSBXi:
+ case AArch64::LDURSHWi:
+ case AArch64::LDURSHXi:
+ case AArch64::LDURSWi:
+ case AArch64::LDURSi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi:
+ DestRegIdx = 0;
+ BaseRegIdx = 1;
+ OffsetIdx = 2;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LDRBBpost:
+ case AArch64::LDRBBpre:
+ case AArch64::LDRBpost:
+ case AArch64::LDRBpre:
+ case AArch64::LDRDpost:
+ case AArch64::LDRDpre:
+ case AArch64::LDRHHpost:
+ case AArch64::LDRHHpre:
+ case AArch64::LDRHpost:
+ case AArch64::LDRHpre:
+ case AArch64::LDRQpost:
+ case AArch64::LDRQpre:
+ case AArch64::LDRSBWpost:
+ case AArch64::LDRSBWpre:
+ case AArch64::LDRSBXpost:
+ case AArch64::LDRSBXpre:
+ case AArch64::LDRSHWpost:
+ case AArch64::LDRSHWpre:
+ case AArch64::LDRSHXpost:
+ case AArch64::LDRSHXpre:
+ case AArch64::LDRSWpost:
+ case AArch64::LDRSWpre:
+ case AArch64::LDRSpost:
+ case AArch64::LDRSpre:
+ case AArch64::LDRWpost:
+ case AArch64::LDRWpre:
+ case AArch64::LDRXpost:
+ case AArch64::LDRXpre:
+ DestRegIdx = 1;
+ BaseRegIdx = 2;
+ OffsetIdx = 3;
+ IsPrePost = true;
+ break;
+
+ case AArch64::LDPDi:
+ case AArch64::LDPQi:
+ DestRegIdx = -1;
+ BaseRegIdx = 2;
+ OffsetIdx = 3;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LDPSWi:
+ case AArch64::LDPSi:
+ case AArch64::LDPWi:
+ case AArch64::LDPXi:
+ DestRegIdx = 0;
+ BaseRegIdx = 2;
+ OffsetIdx = 3;
+ IsPrePost = false;
+ break;
+
+ case AArch64::LDPQpost:
+ case AArch64::LDPQpre:
+ DestRegIdx = -1;
+ BaseRegIdx = 3;
+ OffsetIdx = 4;
+ IsPrePost = true;
+ break;
+
+ case AArch64::LDPDpost:
+ case AArch64::LDPDpre:
+ case AArch64::LDPSWpost:
+ case AArch64::LDPSWpre:
+ case AArch64::LDPSpost:
+ case AArch64::LDPSpre:
+ case AArch64::LDPWpost:
+ case AArch64::LDPWpre:
+ case AArch64::LDPXpost:
+ case AArch64::LDPXpre:
+ DestRegIdx = 1;
+ BaseRegIdx = 3;
+ OffsetIdx = 4;
+ IsPrePost = true;
+ break;
+ }
+
+ LoadInfo LI;
+ LI.DestReg = DestRegIdx == -1 ? 0 : MI.getOperand(DestRegIdx).getReg();
+ LI.BaseReg = MI.getOperand(BaseRegIdx).getReg();
+ LI.BaseRegIdx = BaseRegIdx;
+ LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx);
+ LI.IsPrePost = IsPrePost;
+ return LI;
+}
+
+static Optional<unsigned> getTag(const TargetRegisterInfo *TRI,
+ const MachineInstr &MI, const LoadInfo &LI) {
+ unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0;
+ unsigned Base = TRI->getEncodingValue(LI.BaseReg);
+ unsigned Off;
+ if (LI.OffsetOpnd == nullptr)
+ Off = 0;
+ else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() ||
+ LI.OffsetOpnd->isCPI())
+ return None;
+ else if (LI.OffsetOpnd->isReg())
+ Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg());
+ else
+ Off = LI.OffsetOpnd->getImm() >> 2;
+
+ return makeTag(Dest, Base, Off);
+}
+
+void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
+ // Build the initial tag map for the whole loop.
+ TagMap.clear();
+ for (MachineBasicBlock *MBB : L.getBlocks())
+ for (MachineInstr &MI : *MBB) {
+ Optional<LoadInfo> LInfo = getLoadInfo(MI);
+ if (!LInfo)
+ continue;
+ Optional<unsigned> Tag = getTag(TRI, MI, *LInfo);
+ if (!Tag)
+ continue;
+ TagMap[*Tag].push_back(&MI);
+ }
+
+ bool AnyCollisions = false;
+ for (auto &P : TagMap) {
+ auto Size = P.second.size();
+ if (Size > 1) {
+ for (auto *MI : P.second) {
+ if (TII->isStridedAccess(*MI)) {
+ AnyCollisions = true;
+ break;
+ }
+ }
+ }
+ if (AnyCollisions)
+ break;
+ }
+ // Nothing to fix.
+ if (!AnyCollisions)
+ return;
+
+ MachineRegisterInfo &MRI = Fn.getRegInfo();
+
+ // Go through all the basic blocks in the current loop and fix any strided
+ // loads to avoid collisions with any other loads.
+ LiveRegUnits LR(*TRI);
+ for (MachineBasicBlock *MBB : L.getBlocks()) {
+ LR.clear();
+ LR.addLiveOuts(*MBB);
+ for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) {
+ MachineInstr &MI = *I;
+ if (!TII->isStridedAccess(MI))
+ continue;
+
+ LoadInfo LdI = *getLoadInfo(MI);
+ unsigned OldTag = *getTag(TRI, MI, LdI);
+ auto &OldCollisions = TagMap[OldTag];
+ if (OldCollisions.size() <= 1)
+ continue;
+
+ bool Fixed = false;
+ DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);
+
+ for (unsigned ScratchReg : AArch64::GPR64RegClass) {
+ if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg))
+ continue;
+
+ LoadInfo NewLdI(LdI);
+ NewLdI.BaseReg = ScratchReg;
+ unsigned NewTag = *getTag(TRI, MI, NewLdI);
+ // Scratch reg tag would collide too, so don't use it.
+ if (TagMap.count(NewTag))
+ continue;
+
+ DEBUG(dbgs() << "Changing base reg to: " << PrintReg(ScratchReg, TRI)
+ << '\n');
+
+ // Rewrite:
+ // Xd = LOAD Xb, off
+ // to:
+ // Xc = MOV Xb
+ // Xd = LOAD Xc, off
+ DebugLoc DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg)
+ .addReg(AArch64::XZR)
+ .addReg(LdI.BaseReg)
+ .addImm(0);
+ MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx);
+ BaseOpnd.setReg(ScratchReg);
+
+ // If the load does a pre/post increment, then insert a MOV after as
+ // well to update the real base register.
+ if (LdI.IsPrePost) {
+ DEBUG(dbgs() << "Doing post MOV of incremented reg: "
+ << PrintReg(ScratchReg, TRI) << '\n');
+ MI.getOperand(0).setReg(
+ ScratchReg); // Change tied operand pre/post update dest.
+ BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL,
+ TII->get(AArch64::ORRXrs), LdI.BaseReg)
+ .addReg(AArch64::XZR)
+ .addReg(ScratchReg)
+ .addImm(0);
+ }
+
+ for (int I = 0, E = OldCollisions.size(); I != E; ++I)
+ if (OldCollisions[I] == &MI) {
+ std::swap(OldCollisions[I], OldCollisions[E - 1]);
+ OldCollisions.pop_back();
+ break;
+ }
+
+ // Update TagMap to reflect instruction changes to reduce the number
+ // of later MOVs to be inserted. This needs to be done after
+ // OldCollisions is updated since it may be relocated by this
+ // insertion.
+ TagMap[NewTag].push_back(&MI);
+ ++NumCollisionsAvoided;
+ Fixed = true;
+ Modified = true;
+ break;
+ }
+ if (!Fixed)
+ ++NumCollisionsNotAvoided;
+ }
+ }
+}
+
+bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
+ auto &ST = static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
+ if (ST.getProcFamily() != AArch64Subtarget::Falkor)
+ return false;
+
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
+ TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
+ TRI = ST.getRegisterInfo();
+
+ assert(TRI->trackLivenessAfterRegAlloc(Fn) &&
+ "Register liveness not available!");
+
+ MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
+
+ Modified = false;
+
+ for (MachineLoop *I : LI)
+ for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
+ // Only process innermost loops.
+ if (L->empty())
+ runOnLoop(**L, Fn);
+
+ return Modified;
+}
+
+FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); }
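A worked example of the tag computation (the load, register encodings, and offset are assumptions for illustration): for ldr x1, [x2, #16], Dest encodes as 1, Base as 2, and the immediate contributes 16 >> 2 = 4, giving makeTag(1, 2, 4) = 0x421. Two strided loads in the same loop that hash to the same value are what the pass breaks apart by copying one base register into a free GPR with an ORR-from-XZR move.

#include <cstdio>

// Mirrors makeTag() above for the hand-picked load ldr x1, [x2, #16].
int main() {
  unsigned Dest = 1, Base = 2, Off = 16 >> 2;
  unsigned Tag = (Dest & 0xf) | ((Base & 0xf) << 4) | ((Off & 0x3f) << 8);
  printf("tag = 0x%x\n", Tag); // prints tag = 0x421
  return 0;
}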
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index fe2c2d4..9739605 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -15,28 +15,62 @@
#include "AArch64.h"
#include "AArch64CallingConvention.h"
+#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
-#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
using namespace llvm;
namespace {
@@ -50,48 +84,55 @@ class AArch64FastISel final : public FastISel {
} BaseKind;
private:
- BaseKind Kind;
- AArch64_AM::ShiftExtendType ExtType;
+ BaseKind Kind = RegBase;
+ AArch64_AM::ShiftExtendType ExtType = AArch64_AM::InvalidShiftExtend;
union {
unsigned Reg;
int FI;
} Base;
- unsigned OffsetReg;
- unsigned Shift;
- int64_t Offset;
- const GlobalValue *GV;
+ unsigned OffsetReg = 0;
+ unsigned Shift = 0;
+ int64_t Offset = 0;
+ const GlobalValue *GV = nullptr;
public:
- Address() : Kind(RegBase), ExtType(AArch64_AM::InvalidShiftExtend),
- OffsetReg(0), Shift(0), Offset(0), GV(nullptr) { Base.Reg = 0; }
+ Address() { Base.Reg = 0; }
+
void setKind(BaseKind K) { Kind = K; }
BaseKind getKind() const { return Kind; }
void setExtendType(AArch64_AM::ShiftExtendType E) { ExtType = E; }
AArch64_AM::ShiftExtendType getExtendType() const { return ExtType; }
bool isRegBase() const { return Kind == RegBase; }
bool isFIBase() const { return Kind == FrameIndexBase; }
+
void setReg(unsigned Reg) {
assert(isRegBase() && "Invalid base register access!");
Base.Reg = Reg;
}
+
unsigned getReg() const {
assert(isRegBase() && "Invalid base register access!");
return Base.Reg;
}
+
void setOffsetReg(unsigned Reg) {
OffsetReg = Reg;
}
+
unsigned getOffsetReg() const {
return OffsetReg;
}
+
void setFI(unsigned FI) {
assert(isFIBase() && "Invalid base frame index access!");
Base.FI = FI;
}
+
unsigned getFI() const {
assert(isFIBase() && "Invalid base frame index access!");
return Base.FI;
}
+
void setOffset(int64_t O) { Offset = O; }
int64_t getOffset() { return Offset; }
void setShift(unsigned S) { Shift = S; }
@@ -417,7 +458,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
// MachO still uses GOT for large code-model accesses, but ELF requires
// movz/movk sequences, which FastISel doesn't handle yet.
- if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO())
+ if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO())
return 0;
unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
@@ -531,23 +572,23 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
switch (Opcode) {
default:
break;
- case Instruction::BitCast: {
+ case Instruction::BitCast:
// Look through bitcasts.
return computeAddress(U->getOperand(0), Addr, Ty);
- }
- case Instruction::IntToPtr: {
+
+ case Instruction::IntToPtr:
// Look past no-op inttoptrs.
if (TLI.getValueType(DL, U->getOperand(0)->getType()) ==
TLI.getPointerTy(DL))
return computeAddress(U->getOperand(0), Addr, Ty);
break;
- }
- case Instruction::PtrToInt: {
+
+ case Instruction::PtrToInt:
// Look past no-op ptrtoints.
if (TLI.getValueType(DL, U->getType()) == TLI.getPointerTy(DL))
return computeAddress(U->getOperand(0), Addr, Ty);
break;
- }
+
case Instruction::GetElementPtr: {
Address SavedAddr = Addr;
uint64_t TmpOffset = Addr.getOffset();
@@ -563,7 +604,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
TmpOffset += SL->getElementOffset(Idx);
} else {
uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
- for (;;) {
+ while (true) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
// Constant-offset addressing.
TmpOffset += CI->getSExtValue() * S;
@@ -1241,6 +1282,10 @@ unsigned AArch64FastISel::emitAddSub_rr(bool UseAdd, MVT RetVT, unsigned LHSReg,
bool WantResult) {
assert(LHSReg && RHSReg && "Invalid register number.");
+ if (LHSReg == AArch64::SP || LHSReg == AArch64::WSP ||
+ RHSReg == AArch64::SP || RHSReg == AArch64::WSP)
+ return 0;
+
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return 0;
@@ -1321,6 +1366,8 @@ unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg,
uint64_t ShiftImm, bool SetFlags,
bool WantResult) {
assert(LHSReg && RHSReg && "Invalid register number.");
+ assert(LHSReg != AArch64::SP && LHSReg != AArch64::WSP &&
+ RHSReg != AArch64::SP && RHSReg != AArch64::WSP);
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return 0;
@@ -1362,6 +1409,8 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg,
uint64_t ShiftImm, bool SetFlags,
bool WantResult) {
assert(LHSReg && RHSReg && "Invalid register number.");
+ assert(LHSReg != AArch64::XZR && LHSReg != AArch64::WZR &&
+ RHSReg != AArch64::XZR && RHSReg != AArch64::WZR);
if (RetVT != MVT::i32 && RetVT != MVT::i64)
return 0;
@@ -2065,7 +2114,7 @@ bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr,
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type.");
- case MVT::i1: VTIsi1 = true;
+ case MVT::i1: VTIsi1 = true; LLVM_FALLTHROUGH;
case MVT::i8: Opc = OpcTable[Idx][0]; break;
case MVT::i16: Opc = OpcTable[Idx][1]; break;
case MVT::i32: Opc = OpcTable[Idx][2]; break;
@@ -2786,7 +2835,7 @@ bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) {
return false;
EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType(), true);
- if (SrcVT == MVT::f128)
+ if (SrcVT == MVT::f128 || SrcVT == MVT::f16)
return false;
unsigned Opc;
@@ -2813,8 +2862,12 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) {
MVT DestVT;
if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
return false;
- assert ((DestVT == MVT::f32 || DestVT == MVT::f64) &&
- "Unexpected value type.");
+ // Let regular ISEL handle FP16
+ if (DestVT == MVT::f16)
+ return false;
+
+ assert((DestVT == MVT::f32 || DestVT == MVT::f64) &&
+ "Unexpected value type.");
unsigned SrcReg = getRegForValue(I->getOperand(0));
if (!SrcReg)
@@ -2866,16 +2919,13 @@ bool AArch64FastISel::fastLowerArguments() {
// Only handle simple cases of up to 8 GPR and FPR each.
unsigned GPRCnt = 0;
unsigned FPRCnt = 0;
- unsigned Idx = 0;
for (auto const &Arg : F->args()) {
- // The first argument is at index 1.
- ++Idx;
- if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
- F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
- F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
- F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) ||
- F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) ||
- F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+ if (Arg.hasAttribute(Attribute::ByVal) ||
+ Arg.hasAttribute(Attribute::InReg) ||
+ Arg.hasAttribute(Attribute::StructRet) ||
+ Arg.hasAttribute(Attribute::SwiftSelf) ||
+ Arg.hasAttribute(Attribute::SwiftError) ||
+ Arg.hasAttribute(Attribute::Nest))
return false;
Type *ArgTy = Arg.getType();
@@ -2976,7 +3026,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
// Issue CALLSEQ_START
unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
- .addImm(NumBytes);
+ .addImm(NumBytes).addImm(0);
// Process the args.
for (CCValAssign &VA : ArgLocs) {
@@ -3106,8 +3156,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
return false;
CodeModel::Model CM = TM.getCodeModel();
- // Only support the small and large code model.
- if (CM != CodeModel::Small && CM != CodeModel::Large)
+ // Only support the small-addressing and large code models.
+ if (CM != CodeModel::Large && !Subtarget->useSmallAddressing())
return false;
// FIXME: Add large code model support for ELF.
@@ -3158,7 +3208,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Issue the call.
MachineInstrBuilder MIB;
- if (CM == CodeModel::Small) {
+ if (Subtarget->useSmallAddressing()) {
const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II);
if (Symbol)
@@ -3369,8 +3419,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
MFI.setFrameAddressIsTaken(true);
- const AArch64RegisterInfo *RegInfo =
- static_cast<const AArch64RegisterInfo *>(Subtarget->getRegisterInfo());
+ const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -3521,11 +3570,11 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
updateValueMap(II, ResultReg);
return true;
}
- case Intrinsic::trap: {
+ case Intrinsic::trap:
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
.addImm(1);
return true;
- }
+
case Intrinsic::sqrt: {
Type *RetTy = II->getCalledFunction()->getReturnType();
@@ -5089,11 +5138,14 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) {
return selectOperator(I, I->getOpcode());
// Silence warnings.
(void)&CC_AArch64_DarwinPCS_VarArg;
+ (void)&CC_AArch64_Win64_VarArg;
}
namespace llvm {
-llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
+
+FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
const TargetLibraryInfo *LibInfo) {
return new AArch64FastISel(FuncInfo, LibInfo);
}
-}
+
+} // end namespace llvm
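The fastLowerArguments() hunk above replaces the index-based attribute queries (where argument attributes started at index 1 in the function's attribute list) with direct per-argument queries. A hedged sketch of that check, factored into a free function purely for illustration (the function name is made up):

#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include <initializer_list>

// Returns true if the argument carries an ABI attribute that the AArch64
// fast instruction selector declines to handle.
static bool hasUnsupportedArgAttr(const llvm::Argument &Arg) {
  using llvm::Attribute;
  for (Attribute::AttrKind K :
       {Attribute::ByVal, Attribute::InReg, Attribute::StructRet,
        Attribute::SwiftSelf, Attribute::SwiftError, Attribute::Nest})
    if (Arg.hasAttribute(K))
      return true;
  return false;
}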
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index f5b8c35..7c6a999 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -41,6 +41,10 @@
// | |
// |-----------------------------------|
// | |
+// | (Win64 only) varargs from reg |
+// | |
+// |-----------------------------------|
+// | |
// | prev_fp, prev_lr |
// | (a.k.a. "frame record") |
// |-----------------------------------| <- fp(=x29)
@@ -90,21 +94,42 @@
#include "AArch64FrameLowering.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
+#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <vector>
using namespace llvm;
@@ -116,6 +141,34 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+/// Look at each instruction that references stack frames and return the stack
+/// size limit beyond which some of these instructions will require a scratch
+/// register during their expansion later.
+static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
+ // FIXME: For now, just conservatively guesstimate based on unscaled indexing
+ // range. We'll end up allocating an unnecessary spill slot a lot, but
+ // realistically that's not a big deal at this stage of the game.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue() || MI.isPseudo() ||
+ MI.getOpcode() == AArch64::ADDXri ||
+ MI.getOpcode() == AArch64::ADDSXri)
+ continue;
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (!MI.getOperand(i).isFI())
+ continue;
+
+ int Offset = 0;
+ if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
+ AArch64FrameOffsetCannotUpdate)
+ return 0;
+ }
+ }
+ }
+ return 255;
+}
+
bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
if (!EnableRedZone)
return false;
@@ -245,14 +298,13 @@ static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
if (&MF->front() == MBB)
return AArch64::X9;
- const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
- LivePhysRegs LiveRegs(&TRI);
+ const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+ const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
+ LivePhysRegs LiveRegs(TRI);
LiveRegs.addLiveIns(*MBB);
// Mark callee saved registers as used so we will not choose them.
- const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
- const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF);
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
for (unsigned i = 0; CSRegs[i]; ++i)
LiveRegs.addReg(CSRegs[i]);
@@ -319,7 +371,6 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
-
unsigned NewOpc;
bool NewIsUnscaled = false;
switch (MBBI->getOpcode()) {
@@ -362,7 +413,7 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
unsigned OpndIdx = 0;
for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
++OpndIdx)
- MIB.addOperand(MBBI->getOperand(OpndIdx));
+ MIB.add(MBBI->getOperand(OpndIdx));
assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
"Unexpected immediate offset in first/last callee-save save/restore "
@@ -455,19 +506,23 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
return;
}
- auto CSStackSize = AFI->getCalleeSavedStackSize();
+ bool IsWin64 =
+ Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
+ unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+
+ auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
// All of the remaining stack allocations are for locals.
- AFI->setLocalStackSize(NumBytes - CSStackSize);
+ AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
if (CombineSPBump) {
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
MachineInstr::FrameSetup);
NumBytes = 0;
- } else if (CSStackSize != 0) {
+ } else if (PrologueSaveSize != 0) {
MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII,
- -CSStackSize);
- NumBytes -= CSStackSize;
+ -PrologueSaveSize);
+ NumBytes -= PrologueSaveSize;
}
assert(NumBytes >= 0 && "Negative stack allocation size!?");
@@ -481,8 +536,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
++MBBI;
}
if (HasFP) {
- // Only set up FP if we actually need to. Frame pointer is fp = sp - 16.
- int FPOffset = CSStackSize - 16;
+ // Only set up FP if we actually need to. Frame pointer is fp =
+ // sp - fixedobject - 16.
+ int FPOffset = AFI->getCalleeSavedStackSize() - 16;
if (CombineSPBump)
FPOffset += AFI->getLocalStackSize();
@@ -621,8 +677,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (HasFP) {
// Define the current CFA rule to use the provided FP.
unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+ nullptr, Reg, 2 * StackGrowth - FixedObject));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -708,12 +764,16 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
// it as the 2nd argument of AArch64ISD::TC_RETURN.
- auto CSStackSize = AFI->getCalleeSavedStackSize();
+ bool IsWin64 =
+ Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
+ unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+
+ auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
- if (!CombineSPBump && CSStackSize != 0)
+ if (!CombineSPBump && PrologueSaveSize != 0)
convertCalleeSaveRestoreToSPPrePostIncDec(
- MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize);
+ MBB, std::prev(MBB.getFirstTerminator()), DL, TII, PrologueSaveSize);
// Move past the restores of the callee-saved registers.
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
@@ -735,7 +795,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
return;
}
- NumBytes -= CSStackSize;
+ NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
if (!hasFP(MF)) {
@@ -745,7 +805,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
if (RedZone && ArgumentPopSize == 0)
return;
- bool NoCalleeSaveRestore = CSStackSize == 0;
+ bool NoCalleeSaveRestore = PrologueSaveSize == 0;
int StackRestoreBytes = RedZone ? 0 : NumBytes;
if (NoCalleeSaveRestore)
StackRestoreBytes += ArgumentPopSize;
@@ -764,7 +824,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// be able to save any instructions.
if (MFI.hasVarSizedObjects() || AFI->isStackRealigned())
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
- -CSStackSize + 16, TII, MachineInstr::FrameDestroy);
+ -AFI->getCalleeSavedStackSize() + 16, TII,
+ MachineInstr::FrameDestroy);
else if (NumBytes)
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
MachineInstr::FrameDestroy);
@@ -794,7 +855,11 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- int FPOffset = MFI.getObjectOffset(FI) + 16;
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ bool IsWin64 =
+ Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
+ unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+ int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16;
int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize();
bool isFixed = MFI.isFixedObjectIndex(FI);
@@ -863,22 +928,26 @@ static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
static bool produceCompactUnwindFrame(MachineFunction &MF) {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
- AttributeSet Attrs = MF.getFunction()->getAttributes();
+ AttributeList Attrs = MF.getFunction()->getAttributes();
return Subtarget.isTargetMachO() &&
!(Subtarget.getTargetLowering()->supportSwiftError() &&
Attrs.hasAttrSomewhere(Attribute::SwiftError));
}
namespace {
+
struct RegPairInfo {
- RegPairInfo() : Reg1(AArch64::NoRegister), Reg2(AArch64::NoRegister) {}
- unsigned Reg1;
- unsigned Reg2;
+ unsigned Reg1 = AArch64::NoRegister;
+ unsigned Reg2 = AArch64::NoRegister;
int FrameIdx;
int Offset;
bool IsGPR;
+
+ RegPairInfo() = default;
+
bool isPaired() const { return Reg2 != AArch64::NoRegister; }
};
+
} // end anonymous namespace
static void computeCalleeSaveRegisterPairs(
@@ -899,7 +968,7 @@ static void computeCalleeSaveRegisterPairs(
CC == CallingConv::PreserveMost ||
(Count & 1) == 0) &&
"Odd number of callee-saved regs to spill!");
- unsigned Offset = AFI->getCalleeSavedStackSize();
+ int Offset = AFI->getCalleeSavedStackSize();
for (unsigned i = 0; i < Count; ++i) {
RegPairInfo RPI;
@@ -968,6 +1037,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
SmallVector<RegPairInfo, 8> RegPairs;
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
++RPII) {
@@ -999,9 +1069,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
dbgs() << ")\n");
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
- MBB.addLiveIn(Reg1);
+ if (!MRI.isReserved(Reg1))
+ MBB.addLiveIn(Reg1);
if (RPI.isPaired()) {
- MBB.addLiveIn(Reg2);
+ if (!MRI.isReserved(Reg2))
+ MBB.addLiveIn(Reg2);
MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1),
@@ -1102,7 +1174,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
if (RegInfo->hasBasePointer(MF))
BasePointerReg = RegInfo->getBaseRegister();
- bool ExtraCSSpill = false;
+ unsigned ExtraCSSpill = 0;
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
// Figure out which callee-saved registers to save/restore.
for (unsigned i = 0; CSRegs[i]; ++i) {
@@ -1130,13 +1202,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(PairedReg);
if (AArch64::GPR64RegClass.contains(PairedReg) &&
!RegInfo->isReservedReg(MF, PairedReg))
- ExtraCSSpill = true;
+ ExtraCSSpill = PairedReg;
}
}
DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
- for (int Reg = SavedRegs.find_first(); Reg != -1;
- Reg = SavedRegs.find_next(Reg))
+ for (unsigned Reg : SavedRegs.set_bits())
dbgs() << ' ' << PrintReg(Reg, RegInfo);
dbgs() << "\n";);
@@ -1144,16 +1215,13 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
unsigned NumRegsSpilled = SavedRegs.count();
bool CanEliminateFrame = NumRegsSpilled == 0;
- // FIXME: Set BigStack if any stack slot references may be out of range.
- // For now, just conservatively guestimate based on unscaled indexing
- // range. We'll end up allocating an unnecessary spill slot a lot, but
- // realistically that's not a big deal at this stage of the game.
// The CSR spill slots have not been allocated yet, so estimateStackSize
// won't include them.
MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;
DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
- bool BigStack = (CFSize >= 256);
+ unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
+ bool BigStack = (CFSize > EstimatedStackSizeLimit);
if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
AFI->setHasStackFrame(true);
@@ -1163,8 +1231,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// register scavenging. If we already spilled an extra callee-saved register
// above to keep the number of spills even, we don't need to do anything else
// here.
- if (BigStack && !ExtraCSSpill) {
- if (UnspilledCSGPR != AArch64::NoRegister) {
+ if (BigStack) {
+ if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo)
<< " to get a scratch register.\n");
SavedRegs.set(UnspilledCSGPR);
@@ -1173,15 +1241,18 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// store the pair.
if (produceCompactUnwindFrame(MF))
SavedRegs.set(UnspilledCSGPRPaired);
- ExtraCSSpill = true;
+ ExtraCSSpill = UnspilledCSGPRPaired;
NumRegsSpilled = SavedRegs.count();
}
// If we didn't find an extra callee-saved register to spill, create
// an emergency spill slot.
- if (!ExtraCSSpill) {
- const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
- int FI = MFI.CreateStackObject(RC->getSize(), RC->getAlignment(), false);
+ if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const TargetRegisterClass &RC = AArch64::GPR64RegClass;
+ unsigned Size = TRI->getSpillSize(RC);
+ unsigned Align = TRI->getSpillAlignment(RC);
+ int FI = MFI.CreateStackObject(Size, Align, false);
RS->addScavengingFrameIndex(FI);
DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
<< " as the emergency spill slot.\n");
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index d472a54..8b1c974 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/contrib/llvm/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -16,281 +16,198 @@
#endif
namespace llvm {
-namespace AArch64 {
-
-const uint32_t GPRCoverageData[] = {
- // Classes 0-31
- (1u << AArch64::GPR32allRegClassID) | (1u << AArch64::GPR32RegClassID) |
- (1u << AArch64::GPR32spRegClassID) |
- (1u << AArch64::GPR32commonRegClassID) |
- (1u << AArch64::GPR32sponlyRegClassID) |
- (1u << AArch64::GPR64allRegClassID) | (1u << AArch64::GPR64RegClassID) |
- (1u << AArch64::GPR64spRegClassID) |
- (1u << AArch64::GPR64commonRegClassID) |
- (1u << AArch64::tcGPR64RegClassID) |
- (1u << AArch64::GPR64sponlyRegClassID),
- // Classes 32-63
- 0,
- // FIXME: The entries below this point can be safely removed once this is
- // tablegenerated. It's only needed because of the hardcoded register class
- // limit.
- // Classes 64-96
- 0,
- // Classes 97-128
- 0,
- // Classes 129-160
- 0,
- // Classes 161-192
- 0,
- // Classes 193-224
- 0,
-};
-
-const uint32_t FPRCoverageData[] = {
- // Classes 0-31
- (1u << AArch64::FPR8RegClassID) | (1u << AArch64::FPR16RegClassID) |
- (1u << AArch64::FPR32RegClassID) | (1u << AArch64::FPR64RegClassID) |
- (1u << AArch64::DDRegClassID) | (1u << AArch64::FPR128RegClassID) |
- (1u << AArch64::FPR128_loRegClassID) | (1u << AArch64::DDDRegClassID) |
- (1u << AArch64::DDDDRegClassID),
- // Classes 32-63
- (1u << (AArch64::QQRegClassID - 32)) |
- (1u << (AArch64::QQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
- (1u
- << (AArch64::
- QQQ_with_qsub1_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub2_in_FPR128_loRegClassID -
- 32)) |
- (1u << (AArch64::QQQQRegClassID - 32)) |
- (1u << (AArch64::QQQQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQQ_with_qsub2_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQQ_with_qsub3_in_FPR128_loRegClassID - 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub1_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub2_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub2_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub1_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQQQ_with_qsub0_in_FPR128_lo_and_QQQQ_with_qsub3_in_FPR128_loRegClassID -
- 32)) |
- (1u
- << (AArch64::
- QQ_with_qsub0_in_FPR128_lo_and_QQ_with_qsub1_in_FPR128_loRegClassID -
- 32)) |
- (1u << (AArch64::QQQRegClassID - 32)) |
- (1u << (AArch64::QQQ_with_qsub0_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQ_with_qsub1_in_FPR128_loRegClassID - 32)) |
- (1u << (AArch64::QQQ_with_qsub2_in_FPR128_loRegClassID - 32)) |
- (1u
- << (AArch64::
- QQQ_with_qsub0_in_FPR128_lo_and_QQQ_with_qsub1_in_FPR128_loRegClassID -
- 32)),
- // FIXME: The entries below this point can be safely removed once this
- // is tablegenerated. It's only needed because of the hardcoded register
- // class limit.
- // Classes 64-96
- 0,
- // Classes 97-128
- 0,
- // Classes 129-160
- 0,
- // Classes 161-192
- 0,
- // Classes 193-224
- 0,
-};
-
-const uint32_t CCRCoverageData[] = {
- // Classes 0-31
- 1u << AArch64::CCRRegClassID,
- // Classes 32-63
- 0,
- // FIXME: The entries below this point can be safely removed once this
- // is tablegenerated. It's only needed because of the hardcoded register
- // class limit.
- // Classes 64-96
- 0,
- // Classes 97-128
- 0,
- // Classes 129-160
- 0,
- // Classes 161-192
- 0,
- // Classes 193-224
- 0,
-};
-
-RegisterBank GPRRegBank(AArch64::GPRRegBankID, "GPR", 64, GPRCoverageData);
-RegisterBank FPRRegBank(AArch64::FPRRegBankID, "FPR", 512, FPRCoverageData);
-RegisterBank CCRRegBank(AArch64::CCRRegBankID, "CCR", 32, CCRCoverageData);
-
-RegisterBank *RegBanks[] = {&GPRRegBank, &FPRRegBank, &CCRRegBank};
-
-// PartialMappings.
-enum PartialMappingIdx {
- PMI_None = -1,
- PMI_GPR32 = 1,
- PMI_GPR64,
- PMI_FPR32,
- PMI_FPR64,
- PMI_FPR128,
- PMI_FPR256,
- PMI_FPR512,
- PMI_FirstGPR = PMI_GPR32,
- PMI_LastGPR = PMI_GPR64,
- PMI_FirstFPR = PMI_FPR32,
- PMI_LastFPR = PMI_FPR512,
- PMI_Min = PMI_FirstGPR,
-};
-
-static unsigned getRegBankBaseIdxOffset(unsigned Size) {
- assert(Size && "0-sized type!!");
- // Make anything smaller than 32 gets 32
- Size = ((Size + 31) / 32) * 32;
- // 32 is 0, 64 is 1, 128 is 2, and so on.
- return Log2_32(Size) - /*Log2_32(32)=*/ 5;
-}
-
-RegisterBankInfo::PartialMapping PartMappings[] {
- /* StartIdx, Length, RegBank */
- // 0: GPR 32-bit value.
- {0, 32, GPRRegBank},
- // 1: GPR 64-bit value.
- {0, 64, GPRRegBank},
- // 2: FPR 32-bit value.
- {0, 32, FPRRegBank},
- // 3: FPR 64-bit value.
- {0, 64, FPRRegBank},
- // 4: FPR 128-bit value.
- {0, 128, FPRRegBank},
- // 5: FPR 256-bit value.
- {0, 256, FPRRegBank},
- // 6: FPR 512-bit value.
- {0, 512, FPRRegBank}
-};
-
-enum ValueMappingIdx {
- First3OpsIdx = 0,
- Last3OpsIdx = 18,
- DistanceBetweenRegBanks = 3,
- FirstCrossRegCpyIdx = 21,
- LastCrossRegCpyIdx = 27,
- DistanceBetweenCrossRegCpy = 2
+RegisterBankInfo::PartialMapping AArch64GenRegisterBankInfo::PartMappings[]{
+ /* StartIdx, Length, RegBank */
+ // 0: FPR 32-bit value.
+ {0, 32, AArch64::FPRRegBank},
+ // 1: FPR 64-bit value.
+ {0, 64, AArch64::FPRRegBank},
+ // 2: FPR 128-bit value.
+ {0, 128, AArch64::FPRRegBank},
+ // 3: FPR 256-bit value.
+ {0, 256, AArch64::FPRRegBank},
+ // 4: FPR 512-bit value.
+ {0, 512, AArch64::FPRRegBank},
+ // 5: GPR 32-bit value.
+ {0, 32, AArch64::GPRRegBank},
+ // 6: GPR 64-bit value.
+ {0, 64, AArch64::GPRRegBank},
};
// ValueMappings.
-RegisterBankInfo::ValueMapping ValMappings[]{
+RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{
/* BreakDown, NumBreakDowns */
+ // 0: invalid
+ {nullptr, 0},
// 3-operands instructions (all binary operations should end up with one of
// those mapping).
- // 0: GPR 32-bit value. <-- This must match First3OpsIdx.
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- // 3: GPR 64-bit value.
- {&PartMappings[PMI_GPR64 - PMI_Min], 1},
- {&PartMappings[PMI_GPR64 - PMI_Min], 1},
- {&PartMappings[PMI_GPR64 - PMI_Min], 1},
- // 6: FPR 32-bit value.
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- // 9: FPR 64-bit value.
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- // 12: FPR 128-bit value.
- {&PartMappings[PMI_FPR128 - PMI_Min], 1},
- {&PartMappings[PMI_FPR128 - PMI_Min], 1},
- {&PartMappings[PMI_FPR128 - PMI_Min], 1},
- // 15: FPR 256-bit value.
- {&PartMappings[PMI_FPR256 - PMI_Min], 1},
- {&PartMappings[PMI_FPR256 - PMI_Min], 1},
- {&PartMappings[PMI_FPR256 - PMI_Min], 1},
- // 18: FPR 512-bit value. <-- This must match Last3OpsIdx.
- {&PartMappings[PMI_FPR512 - PMI_Min], 1},
- {&PartMappings[PMI_FPR512 - PMI_Min], 1},
- {&PartMappings[PMI_FPR512 - PMI_Min], 1},
+ // 1: FPR 32-bit value. <-- This must match First3OpsIdx.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ // 4: FPR 64-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ // 7: FPR 128-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
+ // 10: FPR 256-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
+ // 13: FPR 512-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
+ // 16: GPR 32-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ // 19: GPR 64-bit value. <-- This must match Last3OpsIdx.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
// Cross register bank copies.
- // 21: GPR 32-bit value to FPR 32-bit value. <-- This must match
+ // 22: FPR 32-bit value to GPR 32-bit value. <-- This must match
// FirstCrossRegCpyIdx.
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- // 23: GPR 64-bit value to FPR 64-bit value.
- {&PartMappings[PMI_GPR64 - PMI_Min], 1},
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- // 25: FPR 32-bit value to GPR 32-bit value.
- {&PartMappings[PMI_FPR32 - PMI_Min], 1},
- {&PartMappings[PMI_GPR32 - PMI_Min], 1},
- // 27: FPR 64-bit value to GPR 64-bit value. <-- This must match
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ // 24: FPR 64-bit value to GPR 64-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+ // 26: FPR 128-bit value to GPR 128-bit value (invalid)
+ {nullptr, 1},
+ {nullptr, 1},
+ // 28: FPR 256-bit value to GPR 256-bit value (invalid)
+ {nullptr, 1},
+ {nullptr, 1},
+ // 30: FPR 512-bit value to GPR 512-bit value (invalid)
+ {nullptr, 1},
+ {nullptr, 1},
+ // 32: GPR 32-bit value to FPR 32-bit value.
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
+ // 34: GPR 64-bit value to FPR 64-bit value. <-- This must match
// LastCrossRegCpyIdx.
- {&PartMappings[PMI_FPR64 - PMI_Min], 1},
- {&PartMappings[PMI_GPR64 - PMI_Min], 1}
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
+ {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
};
-/// Get the pointer to the ValueMapping representing the RegisterBank
-/// at \p RBIdx with a size of \p Size.
-///
-/// The returned mapping works for instructions with the same kind of
-/// operands for up to 3 operands.
-///
-/// \pre \p RBIdx != PartialMappingIdx::None
+bool AArch64GenRegisterBankInfo::checkPartialMap(unsigned Idx,
+ unsigned ValStartIdx,
+ unsigned ValLength,
+ const RegisterBank &RB) {
+ const PartialMapping &Map = PartMappings[Idx - PartialMappingIdx::PMI_Min];
+ return Map.StartIdx == ValStartIdx && Map.Length == ValLength &&
+ Map.RegBank == &RB;
+}
+
+bool AArch64GenRegisterBankInfo::checkValueMapImpl(unsigned Idx,
+ unsigned FirstInBank,
+ unsigned Size,
+ unsigned Offset) {
+ unsigned PartialMapBaseIdx = Idx - PartialMappingIdx::PMI_Min;
+ const ValueMapping &Map =
+ AArch64GenRegisterBankInfo::getValueMapping((PartialMappingIdx)FirstInBank, Size)[Offset];
+ return Map.BreakDown == &PartMappings[PartialMapBaseIdx] &&
+ Map.NumBreakDowns == 1;
+}
+
+bool AArch64GenRegisterBankInfo::checkPartialMappingIdx(
+ PartialMappingIdx FirstAlias, PartialMappingIdx LastAlias,
+ ArrayRef<PartialMappingIdx> Order) {
+ if (Order.front() != FirstAlias)
+ return false;
+ if (Order.back() != LastAlias)
+ return false;
+ if (Order.front() > Order.back())
+ return false;
+
+ PartialMappingIdx Previous = Order.front();
+ bool First = true;
+ for (const auto &Current : Order) {
+ if (First) {
+ First = false;
+ continue;
+ }
+ if (Previous + 1 != Current)
+ return false;
+ Previous = Current;
+ }
+ return true;
+}
+
+unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
+ unsigned Size) {
+ if (RBIdx == PMI_FirstGPR) {
+ if (Size <= 32)
+ return 0;
+ if (Size <= 64)
+ return 1;
+ return -1;
+ }
+ if (RBIdx == PMI_FirstFPR) {
+ if (Size <= 32)
+ return 0;
+ if (Size <= 64)
+ return 1;
+ if (Size <= 128)
+ return 2;
+ if (Size <= 256)
+ return 3;
+ if (Size <= 512)
+ return 4;
+ return -1;
+ }
+ return -1;
+}
+
const RegisterBankInfo::ValueMapping *
-getValueMapping(PartialMappingIdx RBIdx, unsigned Size) {
+AArch64GenRegisterBankInfo::getValueMapping(PartialMappingIdx RBIdx,
+ unsigned Size) {
assert(RBIdx != PartialMappingIdx::PMI_None && "No mapping needed for that");
- unsigned ValMappingIdx = First3OpsIdx +
- (RBIdx - AArch64::PartialMappingIdx::PMI_Min +
- getRegBankBaseIdxOffset(Size)) *
- ValueMappingIdx::DistanceBetweenRegBanks;
- assert(ValMappingIdx >= AArch64::First3OpsIdx &&
- ValMappingIdx <= AArch64::Last3OpsIdx && "Mapping out of bound");
+ unsigned BaseIdxOffset = getRegBankBaseIdxOffset(RBIdx, Size);
+ if (BaseIdxOffset == -1u)
+ return &ValMappings[InvalidIdx];
+
+ unsigned ValMappingIdx =
+ First3OpsIdx + (RBIdx - PartialMappingIdx::PMI_Min + BaseIdxOffset) *
+ ValueMappingIdx::DistanceBetweenRegBanks;
+ assert(ValMappingIdx >= First3OpsIdx && ValMappingIdx <= Last3OpsIdx &&
+ "Mapping out of bound");
return &ValMappings[ValMappingIdx];
}
-/// Get the pointer to the ValueMapping of the operands of a copy
-/// instruction from a GPR or FPR register to a GPR or FPR register
-/// with a size of \p Size.
-///
-/// If \p DstIsGPR is true, the destination of the copy is on GPR,
-/// otherwise it is on FPR. Same thing for \p SrcIsGPR.
+AArch64GenRegisterBankInfo::PartialMappingIdx
+ AArch64GenRegisterBankInfo::BankIDToCopyMapIdx[]{
+ PMI_None, // CCR
+ PMI_FirstFPR, // FPR
+ PMI_FirstGPR, // GPR
+ };
+
const RegisterBankInfo::ValueMapping *
-getCopyMapping(bool DstIsGPR, bool SrcIsGPR, unsigned Size) {
- PartialMappingIdx DstRBIdx = DstIsGPR ? PMI_FirstGPR : PMI_FirstFPR;
- PartialMappingIdx SrcRBIdx = SrcIsGPR ? PMI_FirstGPR : PMI_FirstFPR;
+AArch64GenRegisterBankInfo::getCopyMapping(unsigned DstBankID,
+ unsigned SrcBankID, unsigned Size) {
+ assert(DstBankID < AArch64::NumRegisterBanks && "Invalid bank ID");
+ assert(SrcBankID < AArch64::NumRegisterBanks && "Invalid bank ID");
+ PartialMappingIdx DstRBIdx = BankIDToCopyMapIdx[DstBankID];
+ PartialMappingIdx SrcRBIdx = BankIDToCopyMapIdx[SrcBankID];
+ assert(DstRBIdx != PMI_None && "No such mapping");
+ assert(SrcRBIdx != PMI_None && "No such mapping");
+
if (DstRBIdx == SrcRBIdx)
return getValueMapping(DstRBIdx, Size);
+
assert(Size <= 64 && "GPR cannot handle that size");
unsigned ValMappingIdx =
FirstCrossRegCpyIdx +
- (DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(Size)) *
+ (DstRBIdx - PMI_Min + getRegBankBaseIdxOffset(DstRBIdx, Size)) *
ValueMappingIdx::DistanceBetweenCrossRegCpy;
- assert(ValMappingIdx >= AArch64::FirstCrossRegCpyIdx &&
- ValMappingIdx <= AArch64::LastCrossRegCpyIdx &&
- "Mapping out of bound");
+ assert(ValMappingIdx >= FirstCrossRegCpyIdx &&
+ ValMappingIdx <= LastCrossRegCpyIdx && "Mapping out of bound");
return &ValMappings[ValMappingIdx];
}
-
-} // End AArch64 namespace.
} // End llvm namespace.
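The rewritten getValueMapping indexes ValMappings with a stride of three (one entry per operand of a binary operation) after bucketing the size with getRegBankBaseIdxOffset. A standalone restatement of the FPR bucketing, usable as a quick sanity check (the function name is illustrative):

// 32 -> 0, 64 -> 1, 128 -> 2, 256 -> 3, 512 -> 4; anything larger yields
// -1, which getValueMapping turns into the invalid mapping entry
// (ValMappings[0] above).
static int fprSizeBucket(unsigned SizeInBits) {
  if (SizeInBits <= 32)  return 0;
  if (SizeInBits <= 64)  return 1;
  if (SizeInBits <= 128) return 2;
  if (SizeInBits <= 256) return 3;
  if (SizeInBits <= 512) return 4;
  return -1;
}

Per the index comments in the table, a 64-bit GPR value then selects the triple starting at ValMappings[19] ("19: GPR 64-bit value"), i.e. three identical single-break-down GPR64 operand mappings.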
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 3099383..06005f6 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -20,6 +20,7 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -200,7 +201,7 @@ private:
bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
- void SelectCMP_SWAP(SDNode *N);
+ bool SelectCMP_SWAP(SDNode *N);
};
} // end anonymous namespace
@@ -238,10 +239,17 @@ bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
case InlineAsm::Constraint_i:
case InlineAsm::Constraint_m:
case InlineAsm::Constraint_Q:
- // Require the address to be in a register. That is safe for all AArch64
- // variants and it is hard to do anything much smarter without knowing
- // how the operand is used.
- OutOps.push_back(Op);
+ // We need to make sure that this one operand does not end up in XZR, so
+ // we require the address to be in a PointerRegClass register.
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF);
+ SDLoc dl(Op);
+ SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64);
+ SDValue NewOp =
+ SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ dl, Op.getValueType(),
+ Op, RC), 0);
+ OutOps.push_back(NewOp);
return false;
}
return true;
@@ -328,11 +336,52 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
}
}
+/// \brief Determine whether it is worth it to fold SHL into the addressing
+/// mode.
+static bool isWorthFoldingSHL(SDValue V) {
+ assert(V.getOpcode() == ISD::SHL && "invalid opcode");
+ // It is worth folding a logical shift of up to three places.
+ auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (!CSD)
+ return false;
+ unsigned ShiftVal = CSD->getZExtValue();
+ if (ShiftVal > 3)
+ return false;
+
+ // Check if this particular node is reused in any non-memory related
+ // operation. If yes, do not try to fold this node into the address
+ // computation, since the computation will be kept.
+ const SDNode *Node = V.getNode();
+ for (SDNode *UI : Node->uses())
+ if (!isa<MemSDNode>(*UI))
+ for (SDNode *UII : UI->uses())
+ if (!isa<MemSDNode>(*UII))
+ return false;
+ return true;
+}
+
/// \brief Determine whether it is worth to fold V into an extended register.
bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
- // it hurts if the value is used at least twice, unless we are optimizing
- // for code size.
- return ForCodeSize || V.hasOneUse();
+ // Trivial if we are optimizing for code size or if there is only
+ // one use of the value.
+ if (ForCodeSize || V.hasOneUse())
+ return true;
+ // If a subtarget has a fastpath LSL we can fold a logical shift into
+ // the addressing mode and save a cycle.
+ if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
+ isWorthFoldingSHL(V))
+ return true;
+ if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
+ const SDValue LHS = V.getOperand(0);
+ const SDValue RHS = V.getOperand(1);
+ if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
+ return true;
+ if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
+ return true;
+ }
+
+ // It hurts otherwise, since the value will be reused.
+ return false;
}
/// SelectShiftedRegister - Select a "shifted register" operand. If the value
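An example of the code shape the new hasLSLFast() heuristic targets: both loads below address memory through the same shift-by-3 of the index, and on subtargets with a fast LSL that shift is now considered worth folding into each access's register-offset addressing mode (an "ldr x0, [x1, x2, lsl #3]"-style form) even though it has more than one use, provided it is not also reused by non-memory operations. The function is purely illustrative:

#include <cstddef>
#include <cstdint>

std::uint64_t sumPair(const std::uint64_t *A, const std::uint64_t *B,
                      std::size_t Idx) {
  // Both element addresses are Base + (Idx << 3); the shared shift is the
  // kind of multi-use, memory-only value isWorthFoldingSHL is meant to accept.
  return A[Idx] + B[Idx];
}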
@@ -1811,20 +1860,20 @@ static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
OpUsefulBits = 1;
if (MSB >= Imm) {
- OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
+ OpUsefulBits <<= MSB - Imm + 1;
--OpUsefulBits;
// The interesting part will be in the lower part of the result
getUsefulBits(Op, OpUsefulBits, Depth + 1);
// The interesting part was starting at Imm in the argument
- OpUsefulBits = OpUsefulBits.shl(Imm);
+ OpUsefulBits <<= Imm;
} else {
- OpUsefulBits = OpUsefulBits.shl(MSB + 1);
+ OpUsefulBits <<= MSB + 1;
--OpUsefulBits;
// The interesting part will be shifted in the result
- OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm);
+ OpUsefulBits <<= OpUsefulBits.getBitWidth() - Imm;
getUsefulBits(Op, OpUsefulBits, Depth + 1);
// The interesting part was at zero in the argument
- OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm);
+ OpUsefulBits.lshrInPlace(OpUsefulBits.getBitWidth() - Imm);
}
UsefulBits &= OpUsefulBits;
@@ -1851,17 +1900,17 @@ static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) {
// Shift Left
uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
- Mask = Mask.shl(ShiftAmt);
+ Mask <<= ShiftAmt;
getUsefulBits(Op, Mask, Depth + 1);
- Mask = Mask.lshr(ShiftAmt);
+ Mask.lshrInPlace(ShiftAmt);
} else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) {
// Shift Right
// We do not handle AArch64_AM::ASR, because the sign will change the
// number of useful bits
uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
- Mask = Mask.lshr(ShiftAmt);
+ Mask.lshrInPlace(ShiftAmt);
getUsefulBits(Op, Mask, Depth + 1);
- Mask = Mask.shl(ShiftAmt);
+ Mask <<= ShiftAmt;
} else
return;
@@ -1889,13 +1938,13 @@ static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
uint64_t Width = MSB - Imm + 1;
uint64_t LSB = Imm;
- OpUsefulBits = OpUsefulBits.shl(Width);
+ OpUsefulBits <<= Width;
--OpUsefulBits;
if (Op.getOperand(1) == Orig) {
// Copy the low bits from the result to bits starting from LSB.
Mask = ResultUsefulBits & OpUsefulBits;
- Mask = Mask.shl(LSB);
+ Mask <<= LSB;
}
if (Op.getOperand(0) == Orig)
@@ -1906,14 +1955,14 @@ static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
uint64_t Width = MSB + 1;
uint64_t LSB = UsefulBits.getBitWidth() - Imm;
- OpUsefulBits = OpUsefulBits.shl(Width);
+ OpUsefulBits <<= Width;
--OpUsefulBits;
- OpUsefulBits = OpUsefulBits.shl(LSB);
+ OpUsefulBits <<= LSB;
if (Op.getOperand(1) == Orig) {
// Copy the bits from the result to the zero bits.
Mask = ResultUsefulBits & OpUsefulBits;
- Mask = Mask.lshr(LSB);
+ Mask.lshrInPlace(LSB);
}
if (Op.getOperand(0) == Orig)
@@ -2037,18 +2086,18 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
(void)BitWidth;
assert(BitWidth == 32 || BitWidth == 64);
- APInt KnownZero, KnownOne;
- CurDAG->computeKnownBits(Op, KnownZero, KnownOne);
+ KnownBits Known;
+ CurDAG->computeKnownBits(Op, Known);
// Non-zero in the sense that they're not provably zero, which is the key
// point if we want to use this value
- uint64_t NonZeroBits = (~KnownZero).getZExtValue();
+ uint64_t NonZeroBits = (~Known.Zero).getZExtValue();
// Discard a constant AND mask if present. It's safe because the node will
// already have been factored into the computeKnownBits calculation above.
uint64_t AndImm;
if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
- assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0);
+ assert((~APInt(BitWidth, AndImm) & ~Known.Zero) == 0);
Op = Op.getOperand(0);
}
@@ -2117,15 +2166,15 @@ static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) {
// Compute the Known Zero for the AND as this allows us to catch more general
// cases than just looking for AND with imm.
- APInt KnownZero, KnownOne;
- CurDAG->computeKnownBits(And, KnownZero, KnownOne);
+ KnownBits Known;
+ CurDAG->computeKnownBits(And, Known);
// Non-zero in the sense that they're not provably zero, which is the key
// point if we want to use this value.
- uint64_t NotKnownZero = (~KnownZero).getZExtValue();
+ uint64_t NotKnownZero = (~Known.Zero).getZExtValue();
// The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00).
- if (!isShiftedMask(KnownZero.getZExtValue(), VT))
+ if (!isShiftedMask(Known.Zero.getZExtValue(), VT))
return false;
// The bits being inserted must only set those bits that are known to be zero.
@@ -2259,15 +2308,15 @@ static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits,
// This allows to catch more general case than just looking for
// AND with imm. Indeed, simplify-demanded-bits may have removed
// the AND instruction because it proves it was useless.
- APInt KnownZero, KnownOne;
- CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne);
+ KnownBits Known;
+ CurDAG->computeKnownBits(OrOpd1Val, Known);
// Check if there is enough room for the second operand to appear
// in the first one
APInt BitsToBeInserted =
- APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width);
+ APInt::getBitsSet(Known.getBitWidth(), DstLSB, DstLSB + Width);
- if ((BitsToBeInserted & ~KnownZero) != 0)
+ if ((BitsToBeInserted & ~Known.Zero) != 0)
continue;
// Set the first operand
@@ -2524,7 +2573,7 @@ bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
// pstatefield for the MSR (immediate) instruction, we also require that an
// immediate value has been provided as an argument, we know that this is
// the case as it has been ensured by semantic checking.
- auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());;
+ auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());
if (PMapper) {
assert (isa<ConstantSDNode>(N->getOperand(2))
&& "Expected a constant integer expression.");
@@ -2567,9 +2616,13 @@ bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) {
}
/// We've got special pseudo-instructions for these
-void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
+bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
unsigned Opcode;
EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
+
+ // Leave this to the LSE CAS instructions if the subtarget supports them.
+ if (Subtarget->hasLSE()) return false;
+
if (MemTy == MVT::i8)
Opcode = AArch64::CMP_SWAP_8;
else if (MemTy == MVT::i16)
@@ -2595,6 +2648,8 @@ void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
CurDAG->RemoveDeadNode(N);
+
+ return true;
}
void AArch64DAGToDAGISel::Select(SDNode *Node) {
@@ -2618,8 +2673,9 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
break;
case ISD::ATOMIC_CMP_SWAP:
- SelectCMP_SWAP(Node);
- return;
+ if (SelectCMP_SWAP(Node))
+ return;
+ break;
case ISD::READ_REGISTER:
if (tryReadRegister(Node))
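SelectCMP_SWAP returning false on LSE-capable subtargets leaves ATOMIC_CMP_SWAP to be matched by the native compare-and-swap patterns rather than the CMP_SWAP_* pseudos (which expand to an exclusive-load/store loop). As a hedged illustration of the source-level effect, a strong compare-exchange like this is expected to become a single CAS-family instruction with +lse and an ldaxr/stlxr loop without it:

#include <atomic>

bool tryClaim(std::atomic<int> &Flag) {
  int Expected = 0;
  // Success order acq_rel, failure order acquire: the whole operation can
  // map onto one LSE compare-and-swap when the subtarget provides it.
  return Flag.compare_exchange_strong(Expected, 1,
                                      std::memory_order_acq_rel,
                                      std::memory_order_acquire);
}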
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 849058b..9d87988 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11,9 +11,9 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64MachineFunctionInfo.h"
-#include "AArch64ISelLowering.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
@@ -22,13 +22,14 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -50,10 +51,10 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Type.h"
@@ -66,6 +67,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetCallingConv.h"
@@ -90,6 +92,7 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
@@ -104,6 +107,12 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
cl::init(false));
+static cl::opt<bool>
+EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
+ cl::desc("Enable AArch64 logical imm instruction "
+ "optimization"),
+ cl::init(true));
+
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
@@ -372,7 +381,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
- setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
setOperationAction(ISD::FREM, MVT::v4f16, Expand);
setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
@@ -404,7 +412,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
- setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
setOperationAction(ISD::FREM, MVT::v8f16, Expand);
setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
@@ -544,7 +551,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
- setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
@@ -554,8 +560,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setSchedulingPreference(Sched::Hybrid);
- // Enable TBZ/TBNZ
- MaskAndBranchFoldingIsLegal = true;
EnableExtLdPromotion = true;
// Set required alignment.
@@ -652,6 +656,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
+ // Vector reductions
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+ }
+ for (MVT VT : MVT::fp_valuetypes()) {
+ setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+ }
+
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
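The VECREDUCE_* operations marked Custom above correspond to horizontal reductions; after vectorization, a scalar reduction loop like the one below becomes a vector-reduction node that AArch64 can typically lower to an across-lanes instruction (e.g. ADDV for integer adds). The function is only an illustration of the affected code shape:

#include <cstddef>
#include <cstdint>

std::int32_t sumAll(const std::int32_t *V, std::size_t N) {
  std::int32_t S = 0;
  for (std::size_t I = 0; I != N; ++I)
    S += V[I];            // candidate for a vectorized add reduction
  return S;
}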
@@ -707,7 +724,6 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
- setOperationAction(ISD::FPOWI, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
@@ -751,6 +767,9 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+ if (!VT.isFloatingPoint())
+ setOperationAction(ISD::ABS, VT, Legal);
+
// [SU][MIN|MAX] are available for all NEON types apart from i64.
if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
@@ -788,21 +807,157 @@ EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
return VT.changeVectorElementTypeToInteger();
}
+static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
+ const APInt &Demanded,
+ TargetLowering::TargetLoweringOpt &TLO,
+ unsigned NewOpc) {
+ uint64_t OldImm = Imm, NewImm, Enc;
+ uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
+
+ // Return if the immediate is already all zeros, all ones, a bimm32 or a
+ // bimm64.
+ if (Imm == 0 || Imm == Mask ||
+ AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
+ return false;
+
+ unsigned EltSize = Size;
+ uint64_t DemandedBits = Demanded.getZExtValue();
+
+ // Clear bits that are not demanded.
+ Imm &= DemandedBits;
+
+ while (true) {
+ // The goal here is to set the non-demanded bits in a way that minimizes
+ // the number of transitions between 0 and 1. To achieve this, we set each
+ // non-demanded bit to the value of the preceding demanded bit.
+ // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
+ // non-demanded bit), we copy bit0 (1) to the least significant 'x',
+ // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
+ // The final result is 0b11000011.
+ uint64_t NonDemandedBits = ~DemandedBits;
+ uint64_t InvertedImm = ~Imm & DemandedBits;
+ uint64_t RotatedImm =
+ ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
+ NonDemandedBits;
+ uint64_t Sum = RotatedImm + NonDemandedBits;
+ bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
+ uint64_t Ones = (Sum + Carry) & NonDemandedBits;
+ NewImm = (Imm | Ones) & Mask;
+
+ // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
+ // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
+ // we halve the element size and continue the search.
+ if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
+ break;
+
+ // We cannot shrink the element size any further if it is 2-bits.
+ if (EltSize == 2)
+ return false;
+
+ EltSize /= 2;
+ Mask >>= EltSize;
+ uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
+
+ // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
+ if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
+ return false;
+
+ // Merge the upper and lower halves of Imm and DemandedBits.
+ Imm |= Hi;
+ DemandedBits |= DemandedBitsHi;
+ }
+
+ ++NumOptimizedImms;
+
+ // Replicate the element across the register width.
+ while (EltSize < Size) {
+ NewImm |= NewImm << EltSize;
+ EltSize *= 2;
+ }
+
+ (void)OldImm;
+ assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
+ "demanded bits should never be altered");
+ assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
+
+ // Create the new constant immediate node.
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ SDValue New;
+
+ // If the new constant immediate is all-zeros or all-ones, let the target
+ // independent DAG combine optimize this node.
+ if (NewImm == 0 || NewImm == OrigMask) {
+ New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
+ TLO.DAG.getConstant(NewImm, DL, VT));
+ // Otherwise, create a machine node so that target independent DAG combine
+ // doesn't undo this optimization.
+ } else {
+ Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
+ SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
+ New = SDValue(
+ TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
+ }
+
+ return TLO.CombineTo(Op, New);
+}
+
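// A minimal standalone sketch (not part of this patch) of the non-demanded-bit
// fill described in the comment above, using its 8-bit example 0bx10xx0x1.
// Each run of non-demanded bits is filled with the value of the demanded bit
// just below it, minimizing 0/1 transitions so the result is more likely to be
// a valid bitmask immediate. Masking to 8 bits here is a simplification for
// the example; the real code works on 32/64-bit values.
#include <cstdint>
#include <cstdio>

int main() {
  const unsigned EltSize = 8;
  const uint64_t Mask = (1ULL << EltSize) - 1;
  uint64_t DemandedBits = 0x65; // 0b01100101: bits 6, 5, 2 and 0 are demanded
  uint64_t Imm = 0x41;          // 0b01000001: demanded values are 1, 0, 0, 1

  uint64_t NonDemandedBits = ~DemandedBits & Mask;
  uint64_t InvertedImm = ~Imm & DemandedBits;
  uint64_t RotatedImm =
      ((InvertedImm << 1) | ((InvertedImm >> (EltSize - 1)) & 1)) &
      NonDemandedBits;
  uint64_t Sum = RotatedImm + NonDemandedBits;
  uint64_t Carry = (NonDemandedBits & ~Sum & (1ULL << (EltSize - 1))) ? 1 : 0;
  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
  uint64_t NewImm = (Imm | Ones) & Mask;

  std::printf("0x%llx\n", (unsigned long long)NewImm); // prints 0xc3 == 0b11000011
  return 0;
}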
+bool AArch64TargetLowering::targetShrinkDemandedConstant(
+ SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
+ // Delay this optimization to as late as possible.
+ if (!TLO.LegalOps)
+ return false;
+
+ if (!EnableOptimizeLogicalImm)
+ return false;
+
+ EVT VT = Op.getValueType();
+ if (VT.isVector())
+ return false;
+
+ unsigned Size = VT.getSizeInBits();
+ assert((Size == 32 || Size == 64) &&
+ "i32 or i64 is expected after legalization.");
+
+ // Exit early if we demand all bits.
+ if (Demanded.countPopulation() == Size)
+ return false;
+
+ unsigned NewOpc;
+ switch (Op.getOpcode()) {
+ default:
+ return false;
+ case ISD::AND:
+ NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
+ break;
+ case ISD::OR:
+ NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
+ break;
+ case ISD::XOR:
+ NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
+ break;
+ }
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!C)
+ return false;
+ uint64_t Imm = C->getZExtValue();
+ return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
+}
+
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
-/// Mask are known to be either zero or one and return them in the
-/// KnownZero/KnownOne bitsets.
+/// Mask are known to be either zero or one and return them Known.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
- const SDValue Op, APInt &KnownZero, APInt &KnownOne,
- const SelectionDAG &DAG, unsigned Depth) const {
+ const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
switch (Op.getOpcode()) {
default:
break;
case AArch64ISD::CSEL: {
- APInt KnownZero2, KnownOne2;
- DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
- DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
- KnownZero &= KnownZero2;
- KnownOne &= KnownOne2;
+ KnownBits Known2;
+ DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
+ DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
+ Known.Zero &= Known2.Zero;
+ Known.One &= Known2.One;
break;
}
case ISD::INTRINSIC_W_CHAIN: {
@@ -812,10 +967,10 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
default: return;
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
- unsigned BitWidth = KnownOne.getBitWidth();
+ unsigned BitWidth = Known.getBitWidth();
EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
- KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+ Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
return;
}
}
@@ -834,15 +989,15 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
// bits larger than the element datatype. 32-bit or larger doesn't need
// this as those are legal types and will be handled by isel directly.
MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
- unsigned BitWidth = KnownZero.getBitWidth();
+ unsigned BitWidth = Known.getBitWidth();
if (VT == MVT::v8i8 || VT == MVT::v16i8) {
assert(BitWidth >= 8 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
- KnownZero |= Mask;
+ Known.Zero |= Mask;
} else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
assert(BitWidth >= 16 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
- KnownZero |= Mask;
+ Known.Zero |= Mask;
}
break;
} break;
@@ -2113,8 +2268,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
Entry.Node = Arg;
Entry.Ty = ArgTy;
- Entry.isSExt = false;
- Entry.isZExt = false;
+ Entry.IsSExt = false;
+ Entry.IsZExt = false;
Args.push_back(Entry);
const char *LibcallName =
@@ -2122,10 +2277,11 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
SDValue Callee =
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
- StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
+ StructType *RetTy = StructType::get(ArgTy, ArgTy);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
- .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
+ CLI.setDebugLoc(dl)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
@@ -2231,19 +2387,13 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
}
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
- if (N->getOpcode() == ISD::SIGN_EXTEND)
- return true;
- if (isExtendedBUILD_VECTOR(N, DAG, true))
- return true;
- return false;
+ return N->getOpcode() == ISD::SIGN_EXTEND ||
+ isExtendedBUILD_VECTOR(N, DAG, true);
}
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
- if (N->getOpcode() == ISD::ZERO_EXTEND)
- return true;
- if (isExtendedBUILD_VECTOR(N, DAG, false))
- return true;
- return false;
+ return N->getOpcode() == ISD::ZERO_EXTEND ||
+ isExtendedBUILD_VECTOR(N, DAG, false);
}
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
@@ -2347,6 +2497,9 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
}
+ case Intrinsic::aarch64_neon_abs:
+ return DAG.getNode(ISD::ABS, dl, Op.getValueType(),
+ Op.getOperand(1));
case Intrinsic::aarch64_neon_smax:
return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
@@ -2465,6 +2618,14 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerMUL(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ return LowerVECREDUCE(Op, DAG);
}
}
@@ -2489,9 +2650,13 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::PreserveMost:
case CallingConv::CXX_FAST_TLS:
case CallingConv::Swift:
+ if (Subtarget->isTargetWindows() && IsVarArg)
+ return CC_AArch64_Win64_VarArg;
if (!Subtarget->isTargetDarwin())
return CC_AArch64_AAPCS;
return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
+ case CallingConv::Win64:
+ return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
}
}
@@ -2507,6 +2672,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv());
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -2663,10 +2829,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// varargs
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
if (isVarArg) {
- if (!Subtarget->isTargetDarwin()) {
+ if (!Subtarget->isTargetDarwin() || IsWin64) {
// The AAPCS variadic function ABI is identical to the non-variadic
// one. As a result there may be more arguments in registers and we should
// save them for future reference.
+ // Win64 variadic functions also pass arguments in registers, but all float
+ // arguments are passed in integer registers.
saveVarArgRegisters(CCInfo, DAG, DL, Chain);
}
@@ -2708,6 +2876,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
MachineFrameInfo &MFI = MF.getFrameInfo();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
+ bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv());
SmallVector<SDValue, 8> MemOps;
@@ -2720,7 +2889,13 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
int GPRIdx = 0;
if (GPRSaveSize != 0) {
- GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
+ if (IsWin64) {
+ GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
+ if (GPRSaveSize & 15)
+ // The extra size here, if triggered, will always be 8.
+ MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
+ } else
+ GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
@@ -2729,7 +2904,11 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
+ IsWin64
+ ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
+ GPRIdx,
+ (i - FirstVariadicGPR) * 8)
+ : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
MemOps.push_back(Store);
FIN =
DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
@@ -2738,7 +2917,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
FuncInfo->setVarArgsGPRIndex(GPRIdx);
FuncInfo->setVarArgsGPRSize(GPRSaveSize);
- if (Subtarget->hasFPARMv8()) {
+ if (Subtarget->hasFPARMv8() && !IsWin64) {
static const MCPhysReg FPRArgRegs[] = {
AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
@@ -3108,9 +3287,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL,
- true),
- DL);
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
getPointerTy(DAG.getDataLayout()));
@@ -3245,30 +3422,26 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
- if (getTargetMachine().getCodeModel() == CodeModel::Large &&
- Subtarget->isTargetMachO()) {
- if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ auto GV = G->getGlobal();
+ if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
+ AArch64II::MO_GOT) {
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
+ Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
+ } else {
const GlobalValue *GV = G->getGlobal();
- bool InternalLinkage = GV->hasInternalLinkage();
- if (InternalLinkage)
- Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
- else {
- Callee =
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
- Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
- }
- } else if (ExternalSymbolSDNode *S =
- dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
+ }
+ } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+ Subtarget->isTargetMachO()) {
const char *Sym = S->getSymbol();
Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
+ } else {
+ const char *Sym = S->getSymbol();
+ Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
}
- } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
- const GlobalValue *GV = G->getGlobal();
- Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
- } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- const char *Sym = S->getSymbol();
- Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
}
// We don't usually want to end the call-sequence here because we would tidy
@@ -3428,11 +3601,75 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Other Lowering Code
//===----------------------------------------------------------------------===//
+SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const {
+ return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag);
+}
+
+SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const {
+ return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
+}
+
+SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const {
+ return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+ N->getOffset(), Flag);
+}
+
+SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flag) const {
+ return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
+}
+
+// (loadGOT sym)
+template <class NodeTy>
+SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG) const {
+ DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
+ SDLoc DL(N);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT);
+ // FIXME: Once remat is capable of dealing with instructions with register
+ // operands, expand this into two nodes instead of using a wrapper node.
+ return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
+}
+
+// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
+template <class NodeTy>
+SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG)
+ const {
+ DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
+ SDLoc DL(N);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ const unsigned char MO_NC = AArch64II::MO_NC;
+ return DAG.getNode(
+ AArch64ISD::WrapperLarge, DL, Ty,
+ getTargetNode(N, Ty, DAG, AArch64II::MO_G3),
+ getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC),
+ getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC),
+ getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC));
+}
+
+// (addlow (adrp %hi(sym)) %lo(sym))
+template <class NodeTy>
+SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG) const {
+ DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
+ SDLoc DL(N);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE);
+ SDValue Lo = getTargetNode(N, Ty, DAG,
+ AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
+ return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
+}
+
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDLoc DL(Op);
- const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
+ GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GN->getGlobal();
unsigned char OpFlags =
Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
@@ -3440,32 +3677,15 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
"unexpected offset in global node");
- // This also catched the large code model case for Darwin.
+ // This also catches the large code model case for Darwin.
if ((OpFlags & AArch64II::MO_GOT) != 0) {
- SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
- // FIXME: Once remat is capable of dealing with instructions with register
- // operands, expand this into two nodes instead of using a wrapper node.
- return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
+ return getGOT(GN, DAG);
}
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
- const unsigned char MO_NC = AArch64II::MO_NC;
- return DAG.getNode(
- AArch64ISD::WrapperLarge, DL, PtrVT,
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
- DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
+ return getAddrLarge(GN, DAG);
} else {
- // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
- // the only correct model on Darwin.
- SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
- OpFlags | AArch64II::MO_PAGE);
- unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
- SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
-
- SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
- return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ return getAddr(GN, DAG);
}
}
@@ -3578,7 +3798,7 @@ SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetELF() && "This function expects an ELF target");
- assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+ assert(Subtarget->useSmallAddressing() &&
"ELF TLS only supported in small memory model");
// Different choices can be made for the maximum size of the TLS area for a
// module. For the small address model, the default TLS size is 16MiB and the
@@ -3679,7 +3899,7 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
if (Subtarget->isTargetDarwin())
return LowerDarwinGlobalTLSAddress(Op, DAG);
- else if (Subtarget->isTargetELF())
+ if (Subtarget->isTargetELF())
return LowerELFGlobalTLSAddress(Op, DAG);
llvm_unreachable("Unexpected platform trying to use TLS");
@@ -4242,90 +4462,37 @@ SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
// Jump table entries are PC-relative offsets. No additional tweaking
// is necessary here. Just get the address of the jump table.
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDLoc DL(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
- const unsigned char MO_NC = AArch64II::MO_NC;
- return DAG.getNode(
- AArch64ISD::WrapperLarge, DL, PtrVT,
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3),
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC),
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC),
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
- AArch64II::MO_G0 | MO_NC));
+ return getAddrLarge(JT, DAG);
}
-
- SDValue Hi =
- DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE);
- SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
- AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
- SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
- return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ return getAddr(JT, DAG);
}
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDLoc DL(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
// Use the GOT for the large code model on iOS.
if (Subtarget->isTargetMachO()) {
- SDValue GotAddr = DAG.getTargetConstantPool(
- CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
- AArch64II::MO_GOT);
- return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
+ return getGOT(CP, DAG);
}
-
- const unsigned char MO_NC = AArch64II::MO_NC;
- return DAG.getNode(
- AArch64ISD::WrapperLarge, DL, PtrVT,
- DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
- CP->getOffset(), AArch64II::MO_G3),
- DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
- CP->getOffset(), AArch64II::MO_G2 | MO_NC),
- DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
- CP->getOffset(), AArch64II::MO_G1 | MO_NC),
- DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
- CP->getOffset(), AArch64II::MO_G0 | MO_NC));
+ return getAddrLarge(CP, DAG);
} else {
- // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
- // ELF, the only valid one on Darwin.
- SDValue Hi =
- DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
- CP->getOffset(), AArch64II::MO_PAGE);
- SDValue Lo = DAG.getTargetConstantPool(
- CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
- AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
-
- SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
- return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ return getAddr(CP, DAG);
}
}
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
- const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- EVT PtrVT = getPointerTy(DAG.getDataLayout());
- SDLoc DL(Op);
+ BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
- const unsigned char MO_NC = AArch64II::MO_NC;
- return DAG.getNode(
- AArch64ISD::WrapperLarge, DL, PtrVT,
- DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3),
- DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
- DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
- DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
+ return getAddrLarge(BA, DAG);
} else {
- SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE);
- SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF |
- AArch64II::MO_NC);
- SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
- return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+ return getAddr(BA, DAG);
}
}
@@ -4342,6 +4509,21 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
MachinePointerInfo(SV));
}
+SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
+ SelectionDAG &DAG) const {
+ AArch64FunctionInfo *FuncInfo =
+ DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
+
+ SDLoc DL(Op);
+ SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
+ ? FuncInfo->getVarArgsGPRIndex()
+ : FuncInfo->getVarArgsStackIndex(),
+ getPointerTy(DAG.getDataLayout()));
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+ MachinePointerInfo(SV));
+}
+
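// Hedged illustration only (not part of the patch): a plain variadic function.
// For callees using the Win64 AArch64 calling convention, the lowering above
// makes va_start point at the GPR save area created in saveVarArgRegisters
// (falling back to the stack index when no GPRs were saved), and va_list is a
// single pointer.
#include <cstdarg>

int sum_ints(int Count, ...) {
  va_list Args;
  va_start(Args, Count); // handled by LowerWin64_VASTART on such targets
  int Total = 0;
  for (int I = 0; I < Count; ++I)
    Total += va_arg(Args, int);
  va_end(Args);
  return Total;
}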
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
SelectionDAG &DAG) const {
// The layout of the va_list struct is specified in the AArch64 Procedure Call
@@ -4413,8 +4595,14 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
SelectionDAG &DAG) const {
- return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG)
- : LowerAAPCS_VASTART(Op, DAG);
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()))
+ return LowerWin64_VASTART(Op, DAG);
+ else if (Subtarget->isTargetDarwin())
+ return LowerDarwin_VASTART(Op, DAG);
+ else
+ return LowerAAPCS_VASTART(Op, DAG);
}
SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
@@ -4422,7 +4610,8 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
// AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
// pointer.
SDLoc DL(Op);
- unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32;
+ unsigned VaListSize =
+ Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32;
const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
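// Hedged sketch of the AAPCS64 va_list layout implied by the 32-byte size used
// above for non-Darwin, non-Windows targets (Darwin and Windows use a plain
// pointer instead): three pointers plus two ints = 32 bytes. Field names are
// illustrative; the ABI spells them __stack, __gr_top, __vr_top, __gr_offs and
// __vr_offs.
struct AAPCS64VaList {
  void *Stack;  // next stacked argument
  void *GrTop;  // end of the general-purpose register save area
  void *VrTop;  // end of the FP/SIMD register save area
  int GrOffs;   // negative offset from GrTop to the next GP register argument
  int VrOffs;   // negative offset from VrTop to the next FP/SIMD argument
};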
@@ -4516,7 +4705,12 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("sp", AArch64::SP)
+ .Case("x18", AArch64::X18)
+ .Case("w18", AArch64::W18)
.Default(0);
+ if ((Reg == AArch64::X18 || Reg == AArch64::W18) &&
+ !Subtarget->isX18Reserved())
+ Reg = 0;
if (Reg)
return Reg;
report_fatal_error(Twine("Invalid register name \""
@@ -4717,9 +4911,9 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
- &Flags);
- Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, &Flags);
- Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags);
+ Flags);
+ Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
+ Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
if (!Reciprocal) {
@@ -4728,7 +4922,7 @@ SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);
- Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, &Flags);
+ Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
// Correct the result if the operand is 0.0.
Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
VT, Eq, Operand, Estimate);
@@ -4757,8 +4951,8 @@ SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
// AArch64 reciprocal iteration instruction: (2 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
- Estimate, &Flags);
- Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags);
+ Estimate, Flags);
+ Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
}
ExtraSteps = 0;
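// Scalar sketches (illustrative only, not target code) of the Newton-Raphson
// steps encoded above with FRSQRTS ("0.5 * (3 - M * N)") and FRECPS
// ("2 - M * N"): each extra step roughly doubles the number of accurate bits
// of the initial hardware estimate.
double refineRSqrt(double M, double E, int Steps) {
  for (int i = 0; i < Steps; ++i)
    E = E * 0.5 * (3.0 - M * E * E); // FRSQRTS-style iteration
  return E;                          // converges towards 1/sqrt(M)
}

double refineRecip(double M, double E, int Steps) {
  for (int i = 0; i < Steps; ++i)
    E = E * (2.0 - M * E);           // FRECPS-style iteration
  return E;                          // converges towards 1/M
}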
@@ -6591,21 +6785,20 @@ FailedModImm:
if (!isConstant && !usesOnlyOneValue) {
SDValue Vec = DAG.getUNDEF(VT);
SDValue Op0 = Op.getOperand(0);
- unsigned ElemSize = VT.getScalarSizeInBits();
unsigned i = 0;
- // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
+
+ // Use SCALAR_TO_VECTOR for lane zero to
// a) Avoid a RMW dependency on the full vector register, and
// b) Allow the register coalescer to fold away the copy if the
- // value is already in an S or D register.
- // Do not do this for UNDEF/LOAD nodes because we have better patterns
- // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR.
- if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD &&
- (ElemSize == 32 || ElemSize == 64)) {
- unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
- MachineSDNode *N =
- DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
- DAG.getTargetConstant(SubIdx, dl, MVT::i32));
- Vec = SDValue(N, 0);
+ // value is already in an S or D register, and we're forced to emit an
+ // INSERT_SUBREG that we can't fold anywhere.
+ //
+ // We also allow types like i8 and i16 which are illegal scalar but legal
+ // vector element types. After type-legalization the inserted value is
+ // extended (i32) and it is safe to cast them to the vector type by ignoring
+ // the upper bits of the lowest lane (e.g. v8i8, v4i16).
+ if (!Op0.isUndef()) {
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
++i;
}
for (; i < NumElts; ++i) {
@@ -6995,6 +7188,47 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
return Cmp;
}
+static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
+ SelectionDAG &DAG) {
+ SDValue VecOp = ScalarOp.getOperand(0);
+ auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
+ DAG.getConstant(0, DL, MVT::i64));
+}
+
+SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ switch (Op.getOpcode()) {
+ case ISD::VECREDUCE_ADD:
+ return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
+ case ISD::VECREDUCE_SMAX:
+ return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
+ case ISD::VECREDUCE_SMIN:
+ return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
+ case ISD::VECREDUCE_UMAX:
+ return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
+ case ISD::VECREDUCE_UMIN:
+ return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
+ case ISD::VECREDUCE_FMAX: {
+ assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
+ DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
+ Op.getOperand(0));
+ }
+ case ISD::VECREDUCE_FMIN: {
+ assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
+ DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
+ Op.getOperand(0));
+ }
+ default:
+ llvm_unreachable("Unhandled reduction");
+ }
+}
+
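// Hedged scalar reference (not target code) for what the lowering above
// produces for an integer reduction: the vector is reduced by a single
// across-lane node (e.g. AArch64ISD::UADDV for VECREDUCE_ADD) and the scalar
// result is then read out of lane 0 via EXTRACT_VECTOR_ELT.
#include <cstdint>

uint32_t vecreduce_add_v4i32(const uint32_t Lane[4]) {
  uint32_t Acc = 0;          // the across-lane op folds all lanes together...
  for (int i = 0; i < 4; ++i)
    Acc += Lane[i];
  return Acc;                // ...and the extract at index 0 returns the result
}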
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
@@ -7132,7 +7366,7 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
if (I->getOpcode() != Instruction::FMul)
return true;
- if (I->getNumUses() != 1)
+ if (!I->hasOneUse())
return true;
Instruction *User = I->user_back();
@@ -7249,6 +7483,41 @@ bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
return NumBits == 32 || NumBits == 64;
}
+/// A helper function for determining the number of interleaved accesses we
+/// will generate when lowering accesses of the given type.
+unsigned
+AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
+ const DataLayout &DL) const {
+ return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+}
+
+MachineMemOperand::Flags
+AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
+ if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
+ I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
+ return MOStridedAccess;
+ return MachineMemOperand::MONone;
+}
+
+bool AArch64TargetLowering::isLegalInterleavedAccessType(
+ VectorType *VecTy, const DataLayout &DL) const {
+
+ unsigned VecSize = DL.getTypeSizeInBits(VecTy);
+ unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+
+ // Ensure the number of vector elements is greater than 1.
+ if (VecTy->getNumElements() < 2)
+ return false;
+
+ // Ensure the element type is legal.
+ if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
+ return false;
+
+ // Ensure the total vector size is 64 bits or a multiple of 128 bits. Types
+ // larger than 128 bits will be split into multiple interleaved accesses.
+ return VecSize == 64 || VecSize % 128 == 0;
+}
+
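// A small illustration (hypothetical values) of the two helpers above: an
// interleaved group typed as <16 x i32> is 512 bits, which is legal (a
// multiple of 128) and is split into 512 / 128 = 4 ldN/stN accesses.
#include <cstdio>

int main() {
  unsigned VecBits = 16 * 32;                    // <16 x i32>
  bool Legal = VecBits == 64 || VecBits % 128 == 0;
  unsigned NumAccesses = (VecBits + 127) / 128;  // getNumInterleavedAccesses
  std::printf("legal=%d accesses=%u\n", Legal, NumAccesses); // legal=1 accesses=4
  return 0;
}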
/// \brief Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
@@ -7272,12 +7541,15 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
const DataLayout &DL = LI->getModule()->getDataLayout();
VectorType *VecTy = Shuffles[0]->getType();
- unsigned VecSize = DL.getTypeSizeInBits(VecTy);
- // Skip if we do not have NEON and skip illegal vector types.
- if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128))
+ // Skip if we do not have NEON, and skip illegal vector types. We can
+ // "legalize" wide vector types into multiple interleaved accesses as long
+ // as the vector size is divisible by 128 bits.
+ if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
return false;
+ unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
Type *EltTy = VecTy->getVectorElementType();
@@ -7285,6 +7557,25 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
VecTy =
VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+ IRBuilder<> Builder(LI);
+
+ // The base address of the load.
+ Value *BaseAddr = LI->getPointerOperand();
+
+ if (NumLoads > 1) {
+ // If we're going to generate more than one load, reset the sub-vector type
+ // to something legal.
+ VecTy = VectorType::get(VecTy->getVectorElementType(),
+ VecTy->getVectorNumElements() / NumLoads);
+
+ // We will compute the pointer operand of each load from the original base
+ // address using GEPs. Cast the base address to a pointer to the scalar
+ // element type.
+ BaseAddr = Builder.CreateBitCast(
+ BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+ LI->getPointerAddressSpace()));
+ }
+
Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
Type *Tys[2] = {VecTy, PtrTy};
static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
@@ -7293,39 +7584,50 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
Function *LdNFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
- IRBuilder<> Builder(LI);
- Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
+ // Holds sub-vectors extracted from the load intrinsic return values. The
+ // sub-vectors are associated with the shufflevector instructions they will
+ // replace.
+ DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
- CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
+ for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
- // Replace uses of each shufflevector with the corresponding vector loaded
- // by ldN.
- for (unsigned i = 0; i < Shuffles.size(); i++) {
- ShuffleVectorInst *SVI = Shuffles[i];
- unsigned Index = Indices[i];
+ // If we're generating more than one load, compute the base address of
+ // subsequent loads as an offset from the previous.
+ if (LoadCount > 0)
+ BaseAddr = Builder.CreateConstGEP1_32(
+ BaseAddr, VecTy->getVectorNumElements() * Factor);
- Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+ CallInst *LdN = Builder.CreateCall(
+ LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
- // Convert the integer vector to pointer vector if the element is pointer.
- if (EltTy->isPointerTy())
- SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+ // Extract and store the sub-vectors returned by the load intrinsic.
+ for (unsigned i = 0; i < Shuffles.size(); i++) {
+ ShuffleVectorInst *SVI = Shuffles[i];
+ unsigned Index = Indices[i];
- SVI->replaceAllUsesWith(SubVec);
- }
+ Value *SubVec = Builder.CreateExtractValue(LdN, Index);
- return true;
-}
+ // Convert the integer vector to pointer vector if the element is pointer.
+ if (EltTy->isPointerTy())
+ SubVec = Builder.CreateIntToPtr(
+ SubVec, VectorType::get(SVI->getType()->getVectorElementType(),
+ VecTy->getVectorNumElements()));
+ SubVecs[SVI].push_back(SubVec);
+ }
+ }
-/// \brief Get a mask consisting of sequential integers starting from \p Start.
-///
-/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
-static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
- unsigned NumElts) {
- SmallVector<Constant *, 16> Mask;
- for (unsigned i = 0; i < NumElts; i++)
- Mask.push_back(Builder.getInt32(Start + i));
+ // Replace uses of the shufflevector instructions with the sub-vectors
+ // returned by the load intrinsic. If a shufflevector instruction is
+ // associated with more than one sub-vector, those sub-vectors will be
+ // concatenated into a single wide vector.
+ for (ShuffleVectorInst *SVI : Shuffles) {
+ auto &SubVec = SubVecs[SVI];
+ auto *WideVec =
+ SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+ SVI->replaceAllUsesWith(WideVec);
+ }
- return ConstantVector::get(Mask);
+ return true;
}
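// A hypothetical model (names are illustrative) of the addressing used when a
// wide interleaved load is split: each ldN call consumes Factor * NumElts
// scalar elements, so load k starts k * Factor * NumElts elements past the
// original base pointer, which is what the incremental CreateConstGEP1_32
// above computes.
unsigned ldNElementOffset(unsigned LoadIndex, unsigned Factor,
                          unsigned NumEltsPerSubVec) {
  return LoadIndex * Factor * NumEltsPerSubVec;
}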
/// \brief Lower an interleaved store into a stN intrinsic.
@@ -7369,12 +7671,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
- unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
- // Skip if we do not have NEON and skip illegal vector types.
- if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
+ // Skip if we do not have NEON, and skip illegal vector types. We can
+ // "legalize" wide vector types into multiple interleaved accesses as long
+ // as the vector size is divisible by 128 bits.
+ if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
return false;
+ unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+
Value *Op0 = SVI->getOperand(0);
Value *Op1 = SVI->getOperand(1);
IRBuilder<> Builder(SI);
@@ -7394,6 +7699,25 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
SubVecTy = VectorType::get(IntTy, LaneLen);
}
+ // The base address of the store.
+ Value *BaseAddr = SI->getPointerOperand();
+
+ if (NumStores > 1) {
+ // If we're going to generate more than one store, reset the lane length
+ // and sub-vector type to something legal.
+ LaneLen /= NumStores;
+ SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+ // We will compute the pointer operand of each store from the original base
+ // address using GEPs. Cast the base address to a pointer to the scalar
+ // element type.
+ BaseAddr = Builder.CreateBitCast(
+ BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+ SI->getPointerAddressSpace()));
+ }
+
+ auto Mask = SVI->getShuffleMask();
+
Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
Type *Tys[2] = {SubVecTy, PtrTy};
static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
@@ -7402,34 +7726,43 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
Function *StNFunc =
Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
- SmallVector<Value *, 5> Ops;
+ for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
- // Split the shufflevector operands into sub vectors for the new stN call.
- auto Mask = SVI->getShuffleMask();
- for (unsigned i = 0; i < Factor; i++) {
- if (Mask[i] >= 0) {
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
- } else {
- unsigned StartMask = 0;
- for (unsigned j = 1; j < LaneLen; j++) {
- if (Mask[j*Factor + i] >= 0) {
- StartMask = Mask[j*Factor + i] - j;
- break;
+ SmallVector<Value *, 5> Ops;
+
+ // Split the shufflevector operands into sub vectors for the new stN call.
+ for (unsigned i = 0; i < Factor; i++) {
+ unsigned IdxI = StoreCount * LaneLen * Factor + i;
+ if (Mask[IdxI] >= 0) {
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+ } else {
+ unsigned StartMask = 0;
+ for (unsigned j = 1; j < LaneLen; j++) {
+ unsigned IdxJ = StoreCount * LaneLen * Factor + j;
+ if (Mask[IdxJ * Factor + IdxI] >= 0) {
+ StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
+ break;
+ }
}
+ // Note: Filling undef gaps with random elements is ok, since
+ // those elements were being written anyway (with undefs).
+ // In the case of all undefs, we default to using elements from 0.
+ // Note: StartMask cannot be negative, it's checked in
+ // isReInterleaveMask
+ Ops.push_back(Builder.CreateShuffleVector(
+ Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
}
- // Note: If all elements in a chunk are undefs, StartMask=0!
- // Note: Filling undef gaps with random elements is ok, since
- // those elements were being written anyway (with undefs).
- // In the case of all undefs we're defaulting to using elems from 0
- // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
- Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
}
- }
- Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
- Builder.CreateCall(StNFunc, Ops);
+ // If we're generating more than one store, compute the base address of
+ // subsequent stores as an offset from the previous.
+ if (StoreCount > 0)
+ BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
+
+ Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+ Builder.CreateCall(StNFunc, Ops);
+ }
return true;
}
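// Sketch of the sequential shuffle mask the stores above feed to
// createSequentialMask: the same <Start, Start + 1, ..., Start + LaneLen - 1>
// pattern the removed local getSequentialMask helper used to build.
#include <vector>

std::vector<int> sequentialMask(int Start, int LaneLen) {
  std::vector<int> Mask;
  for (int i = 0; i < LaneLen; ++i)
    Mask.push_back(Start + i);
  return Mask;
}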
@@ -7690,7 +8023,7 @@ SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
std::vector<SDNode *> *Created) const {
- AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
if (isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
@@ -8079,9 +8412,9 @@ static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
/// EXTR instruction extracts a contiguous chunk of bits from two existing
/// registers viewed as a high/low pair. This function looks for the pattern:
-/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
-/// EXTR. Can't quite be done in TableGen because the two immediates aren't
-/// independent.
+/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
+/// with an EXTR. Can't quite be done in TableGen because the two immediates
+/// aren't independent.
static SDValue tryCombineToEXTR(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
@@ -8935,16 +9268,26 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
// instructions (stp).
SDLoc DL(&St);
SDValue BasePtr = St.getBasePtr();
+ uint64_t BaseOffset = 0;
+
const MachinePointerInfo &PtrInfo = St.getPointerInfo();
SDValue NewST1 =
DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
OrigAlignment, St.getMemOperand()->getFlags());
+ // As this is in ISel, we will not merge this add, which may degrade results.
+ if (BasePtr->getOpcode() == ISD::ADD &&
+ isa<ConstantSDNode>(BasePtr->getOperand(1))) {
+ BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
+ BasePtr = BasePtr->getOperand(0);
+ }
+
unsigned Offset = EltOffset;
while (--NumVecElts) {
unsigned Alignment = MinAlign(OrigAlignment, Offset);
- SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
- DAG.getConstant(Offset, DL, MVT::i64));
+ SDValue OffsetPtr =
+ DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+ DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
PtrInfo.getWithOffset(Offset), Alignment,
St.getMemOperand()->getFlags());
@@ -9072,7 +9415,7 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
StoreSDNode *S = cast<StoreSDNode>(N);
- if (S->isVolatile())
+ if (S->isVolatile() || S->isIndexed())
return SDValue();
SDValue StVal = S->getValue();
@@ -9236,17 +9579,17 @@ static SDValue performPostLD1Combine(SDNode *N,
return SDValue();
}
-/// Simplify \Addr given that the top byte of it is ignored by HW during
+/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
/// address translation.
static bool performTBISimplification(SDValue Addr,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
APInt DemandedMask = APInt::getLowBitsSet(64, 56);
- APInt KnownZero, KnownOne;
- TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
- DCI.isBeforeLegalizeOps());
+ KnownBits Known;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) {
+ if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
DCI.CommitTargetLoweringOpt(TLO);
return true;
}
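// Illustrative only: the "top byte ignored" (TBI) simplification above asks
// SimplifyDemandedBits to treat bits 56-63 of the address as don't-care,
// because AArch64 address translation ignores them. A scalar equivalent:
#include <cstdint>

uint64_t stripTopByte(uint64_t Addr) {
  return Addr & ((1ULL << 56) - 1); // keep only the 56 demanded low bits
}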
@@ -9267,266 +9610,6 @@ static SDValue performSTORECombine(SDNode *N,
return SDValue();
}
- /// This function handles the log2-shuffle pattern produced by the
-/// LoopVectorizer for the across vector reduction. It consists of
-/// log2(NumVectorElements) steps and, in each step, 2^(s) elements
-/// are reduced, where s is an induction variable from 0 to
-/// log2(NumVectorElements).
-static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
- unsigned Op,
- SelectionDAG &DAG) {
- EVT VTy = OpV->getOperand(0).getValueType();
- if (!VTy.isVector())
- return SDValue();
-
- int NumVecElts = VTy.getVectorNumElements();
- if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
- if (NumVecElts != 4)
- return SDValue();
- } else {
- if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
- return SDValue();
- }
-
- int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
- SDValue PreOp = OpV;
- // Iterate over each step of the across vector reduction.
- for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
- SDValue CurOp = PreOp.getOperand(0);
- SDValue Shuffle = PreOp.getOperand(1);
- if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
- // Try to swap the 1st and 2nd operand as add and min/max instructions
- // are commutative.
- CurOp = PreOp.getOperand(1);
- Shuffle = PreOp.getOperand(0);
- if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
- return SDValue();
- }
-
- // Check if the input vector is fed by the operator we want to handle,
- // except the last step; the very first input vector is not necessarily
- // the same operator we are handling.
- if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1)))
- return SDValue();
-
- // Check if it forms one step of the across vector reduction.
- // E.g.,
- // %cur = add %1, %0
- // %shuffle = vector_shuffle %cur, <2, 3, u, u>
- // %pre = add %cur, %shuffle
- if (Shuffle.getOperand(0) != CurOp)
- return SDValue();
-
- int NumMaskElts = 1 << CurStep;
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
- // Check mask values in each step.
- // We expect the shuffle mask in each step follows a specific pattern
- // denoted here by the <M, U> form, where M is a sequence of integers
- // starting from NumMaskElts, increasing by 1, and the number integers
- // in M should be NumMaskElts. U is a sequence of UNDEFs and the number
- // of undef in U should be NumVecElts - NumMaskElts.
- // E.g., for <8 x i16>, mask values in each step should be :
- // step 0 : <1,u,u,u,u,u,u,u>
- // step 1 : <2,3,u,u,u,u,u,u>
- // step 2 : <4,5,6,7,u,u,u,u>
- for (int i = 0; i < NumVecElts; ++i)
- if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) ||
- (i >= NumMaskElts && !(Mask[i] < 0)))
- return SDValue();
-
- PreOp = CurOp;
- }
- unsigned Opcode;
- bool IsIntrinsic = false;
-
- switch (Op) {
- default:
- llvm_unreachable("Unexpected operator for across vector reduction");
- case ISD::ADD:
- Opcode = AArch64ISD::UADDV;
- break;
- case ISD::SMAX:
- Opcode = AArch64ISD::SMAXV;
- break;
- case ISD::UMAX:
- Opcode = AArch64ISD::UMAXV;
- break;
- case ISD::SMIN:
- Opcode = AArch64ISD::SMINV;
- break;
- case ISD::UMIN:
- Opcode = AArch64ISD::UMINV;
- break;
- case ISD::FMAXNUM:
- Opcode = Intrinsic::aarch64_neon_fmaxnmv;
- IsIntrinsic = true;
- break;
- case ISD::FMINNUM:
- Opcode = Intrinsic::aarch64_neon_fminnmv;
- IsIntrinsic = true;
- break;
- }
- SDLoc DL(N);
-
- return IsIntrinsic
- ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
- DAG.getConstant(Opcode, DL, MVT::i32), PreOp)
- : DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
- DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
- DAG.getConstant(0, DL, MVT::i64));
-}
-
-/// Target-specific DAG combine for the across vector min/max reductions.
-/// This function specifically handles the final clean-up step of the vector
-/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle
-/// pattern, which narrows down and finds the final min/max value from all
-/// elements of the vector.
-/// For example, for a <16 x i8> vector :
-/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
-/// %smax0 = smax %arr, svn0
-/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
-/// %smax1 = smax %smax0, %svn1
-/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-/// %smax2 = smax %smax1, svn2
-/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-/// %sc = setcc %smax2, %svn3, gt
-/// %n0 = extract_vector_elt %sc, #0
-/// %n1 = extract_vector_elt %smax2, #0
-/// %n2 = extract_vector_elt $smax2, #1
-/// %result = select %n0, %n1, n2
-/// becomes :
-/// %1 = smaxv %0
-/// %result = extract_vector_elt %1, 0
-static SDValue
-performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
- if (!Subtarget->hasNEON())
- return SDValue();
-
- SDValue N0 = N->getOperand(0);
- SDValue IfTrue = N->getOperand(1);
- SDValue IfFalse = N->getOperand(2);
-
- // Check if the SELECT merges up the final result of the min/max
- // from a vector.
- if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
-
- // Expect N0 is fed by SETCC.
- SDValue SetCC = N0.getOperand(0);
- EVT SetCCVT = SetCC.getValueType();
- if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
- SetCCVT.getVectorElementType() != MVT::i1)
- return SDValue();
-
- SDValue VectorOp = SetCC.getOperand(0);
- unsigned Op = VectorOp->getOpcode();
- // Check if the input vector is fed by the operator we want to handle.
- if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN &&
- Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM)
- return SDValue();
-
- EVT VTy = VectorOp.getValueType();
- if (!VTy.isVector())
- return SDValue();
-
- if (VTy.getSizeInBits() < 64)
- return SDValue();
-
- EVT EltTy = VTy.getVectorElementType();
- if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
- if (EltTy != MVT::f32)
- return SDValue();
- } else {
- if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
- return SDValue();
- }
-
- // Check if extracting from the same vector.
- // For example,
- // %sc = setcc %vector, %svn1, gt
- // %n0 = extract_vector_elt %sc, #0
- // %n1 = extract_vector_elt %vector, #0
- // %n2 = extract_vector_elt $vector, #1
- if (!(VectorOp == IfTrue->getOperand(0) &&
- VectorOp == IfFalse->getOperand(0)))
- return SDValue();
-
- // Check if the condition code is matched with the operator type.
- ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
- if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
- (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
- (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
- (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) ||
- (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE &&
- CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT &&
- CC != ISD::SETGE) ||
- (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE &&
- CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT &&
- CC != ISD::SETLE))
- return SDValue();
-
- // Expect to check only lane 0 from the vector SETCC.
- if (!isNullConstant(N0.getOperand(1)))
- return SDValue();
-
- // Expect to extract the true value from lane 0.
- if (!isNullConstant(IfTrue.getOperand(1)))
- return SDValue();
-
- // Expect to extract the false value from lane 1.
- if (!isOneConstant(IfFalse.getOperand(1)))
- return SDValue();
-
- return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
-}
-
-/// Target-specific DAG combine for the across vector add reduction.
-/// This function specifically handles the final clean-up step of the vector
-/// add reduction produced by the LoopVectorizer. It is the log2-shuffle
-/// pattern, which adds all elements of a vector together.
-/// For example, for a <4 x i32> vector :
-/// %1 = vector_shuffle %0, <2,3,u,u>
-/// %2 = add %0, %1
-/// %3 = vector_shuffle %2, <1,u,u,u>
-/// %4 = add %2, %3
-/// %result = extract_vector_elt %4, 0
-/// becomes :
-/// %0 = uaddv %0
-/// %result = extract_vector_elt %0, 0
-static SDValue
-performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
- if (!Subtarget->hasNEON())
- return SDValue();
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
-
- // Check if the input vector is fed by the ADD.
- if (N0->getOpcode() != ISD::ADD)
- return SDValue();
-
- // The vector extract idx must constant zero because we only expect the final
- // result of the reduction is placed in lane 0.
- if (!isNullConstant(N1))
- return SDValue();
-
- EVT VTy = N0.getValueType();
- if (!VTy.isVector())
- return SDValue();
-
- EVT EltTy = VTy.getVectorElementType();
- if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
- return SDValue();
-
- if (VTy.getSizeInBits() < 64)
- return SDValue();
-
- return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
-}
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
@@ -10205,12 +10288,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performBitcastCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
- case ISD::SELECT: {
- SDValue RV = performSelectCombine(N, DCI);
- if (!RV.getNode())
- RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
- return RV;
- }
+ case ISD::SELECT:
+ return performSelectCombine(N, DCI);
case ISD::VSELECT:
return performVSelectCombine(N, DCI.DAG);
case ISD::LOAD:
@@ -10232,8 +10311,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performNVCASTCombine(N);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
- case ISD::EXTRACT_VECTOR_ELT:
- return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -10307,7 +10384,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
// call. This will cause the optimizers to attempt to move, or duplicate,
// return instructions to help enable tail call optimizations for this
// instruction.
-bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
return CI->isTailCall();
}
@@ -10453,6 +10530,14 @@ void AArch64TargetLowering::ReplaceNodeResults(
case ISD::BITCAST:
ReplaceBITCASTResults(N, Results, DAG);
return;
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
+ return;
+
case AArch64ISD::SADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
return;
@@ -10483,9 +10568,9 @@ void AArch64TargetLowering::ReplaceNodeResults(
}
bool AArch64TargetLowering::useLoadStackGuardNode() const {
- if (!Subtarget->isTargetAndroid())
- return true;
- return TargetLowering::useLoadStackGuardNode();
+ if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
+ return TargetLowering::useLoadStackGuardNode();
+ return true;
}
unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
@@ -10527,11 +10612,17 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
- return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
+ if (Size > 128) return AtomicExpansionKind::None;
+ // Nand not supported in LSE.
+ if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC;
+ // Leave 128 bits to LLSC.
+ return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC;
}
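
A quick way to see what the LSE-aware expansion above aims for is an ordinary C++ read-modify-write; the comments describe the expected lowering rather than a guarantee, and the function name is only an example:

#include <atomic>

int fetch_add_one(std::atomic<int> &v) {
  // With LSE (ARMv8.1-a) the RMW is left intact and is expected to select a
  // single "ldaddal" instruction. Without LSE, for 128-bit types, or for
  // Nand (which has no LSE encoding), the IR-level LL/SC expansion
  // (an ldaxr / add / stlxr / cbnz loop) is still used, as encoded above.
  return v.fetch_add(1, std::memory_order_seq_cst);
}
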
bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *AI) const {
+ // If subtarget has LSE, leave cmpxchg intact for codegen.
+ if (Subtarget->hasLSE()) return false;
// At -O0, fast-regalloc cannot cope with the live vregs necessary to
// implement cmpxchg without spilling. If the address being exchanged is also
// on the stack and close enough to the spill slot, this can lead to a
@@ -10623,36 +10714,56 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
return false;
}
-Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
- if (!Subtarget->isTargetAndroid())
- return TargetLowering::getIRStackGuard(IRB);
-
- // Android provides a fixed TLS slot for the stack cookie. See the definition
- // of TLS_SLOT_STACK_GUARD in
- // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
- const unsigned TlsOffset = 0x28;
+static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Function *ThreadPointerFunc =
Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
return IRB.CreatePointerCast(
- IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
+ IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), Offset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
}
-Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
- if (!Subtarget->isTargetAndroid())
- return TargetLowering::getSafeStackPointerLocation(IRB);
+Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+ // Android provides a fixed TLS slot for the stack cookie. See the definition
+ // of TLS_SLOT_STACK_GUARD in
+ // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+ if (Subtarget->isTargetAndroid())
+ return UseTlsOffset(IRB, 0x28);
+ // Fuchsia is similar.
+ // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
+ if (Subtarget->isTargetFuchsia())
+ return UseTlsOffset(IRB, -0x10);
+
+ return TargetLowering::getIRStackGuard(IRB);
+}
+
+Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
- const unsigned TlsOffset = 0x48;
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- Function *ThreadPointerFunc =
- Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
- return IRB.CreatePointerCast(
- IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
- Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
+ if (Subtarget->isTargetAndroid())
+ return UseTlsOffset(IRB, 0x48);
+
+ // Fuchsia is similar.
+ // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
+ if (Subtarget->isTargetFuchsia())
+ return UseTlsOffset(IRB, -0x8);
+
+ return TargetLowering::getSafeStackPointerLocation(IRB);
+}
+
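
Both hooks now funnel through UseTlsOffset, i.e. a fixed byte offset from the thread pointer. Purely as an illustration of the layout being relied on (AArch64-only inline asm, not part of the patch), reading those Android slots by hand looks roughly like this:

#include <cstddef>

static inline void *read_tls_slot(std::ptrdiff_t Offset) {
  char *TP;
  asm volatile("mrs %0, TPIDR_EL0" : "=r"(TP)); // AArch64 thread pointer
  return *reinterpret_cast<void **>(TP + Offset);
}

void *android_stack_guard()  { return read_tls_slot(0x28); } // TLS_SLOT_STACK_GUARD
void *android_unsafe_stack() { return read_tls_slot(0x48); } // TLS_SLOT_SAFESTACK
// Fuchsia uses negative offsets from the same register: -0x10 for the stack
// guard and -0x8 for the SafeStack pointer, per the comments above.
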
+bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
+ const Instruction &AndI) const {
+ // Only sink 'and' mask to cmp use block if it is masking a single bit, since
+ // this is likely to fold the and/cmp/br into a single tbz instruction. It
+ // may be beneficial to sink in other cases, but we would have to check that
+ // the cmp would not get folded into the br to form a cbz for these to be
+ // beneficial.
+ ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+ if (!Mask)
+ return false;
+ return Mask->getUniqueInteger().isPowerOf2();
}
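
A small example of the single-bit case this heuristic targets; the instruction sequences in the comments are the expected shapes, with placeholder register numbers:

#include <cstdint>

extern void hit();

void branch_on_bit10(uint32_t W) {
  // With the 'and' sunk next to its compare, selection can fold the whole
  // and/cmp/br into one test-bit branch:
  //   tbnz w0, #10, <call hit>
  // instead of:
  //   and  w8, w0, #0x400
  //   cbnz w8, <call hit>
  if (W & 0x400) // 0x400 is a power of two -> bit 10
    hit();
}
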
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
@@ -10702,7 +10813,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR(
}
}
-bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// Integer division on AArch64 is expensive. However, when aggressively
// optimizing for code size, we prefer to use a div instruction, as it is
// usually smaller than the alternative sequence.
@@ -10711,6 +10822,14 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
bool OptSize =
- Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+ Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
return OptSize && !VT.isVector();
}
+
+unsigned
+AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
+ if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
+ return getPointerTy(DL).getSizeInBits();
+
+ return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
+}
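
The three-pointer-plus-two-int figure corresponds to the AAPCS64 va_list record (Darwin and Win64 use a plain pointer-sized cursor instead); a sketch of that layout, with field names as described in the AAPCS64 document:

struct aapcs64_va_list {
  void *__stack;   // next stacked argument
  void *__gr_top;  // end of the general-register save area
  void *__vr_top;  // end of the FP/SIMD register save area
  int   __gr_offs; // negative offset into the GR save area
  int   __vr_offs; // negative offset into the VR save area
};                 // 3 * 64 + 2 * 32 = 256 bits on an LP64 target
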
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 054ccc3..3b0e0f1 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -250,10 +250,14 @@ public:
/// Determine which of the bits specified in Mask are known to be either zero
/// or one and return them in the KnownZero/KnownOne bitsets.
- void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
- APInt &KnownOne, const SelectionDAG &DAG,
+ void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+ TargetLoweringOpt &TLO) const override;
+
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
/// Returns true if the target allows unaligned memory accesses of the
@@ -402,7 +406,20 @@ public:
return AArch64::X1;
}
- bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+ bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
+
+ bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
+ const SelectionDAG &DAG) const override {
+ // Do not merge to float value size (128 bits) if no implicit
+ // float attribute is set.
+
+ bool NoFloat = DAG.getMachineFunction().getFunction()->hasFnAttribute(
+ Attribute::NoImplicitFloat);
+
+ if (NoFloat)
+ return (MemVT.getSizeInBits() <= 64);
+ return true;
+ }
bool isCheapToSpeculateCttz() const override {
return true;
@@ -412,6 +429,8 @@ public:
return true;
}
+ bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
bool hasAndNotCompare(SDValue) const override {
// 'bics'
return true;
@@ -435,6 +454,22 @@ public:
return true;
}
+ /// Returns the size of the platform's va_list object.
+ unsigned getVaListSizeInBits(const DataLayout &DL) const override;
+
+ /// Returns true if \p VecTy is a legal interleaved access type. This
+ /// function checks the vector element type and the overall width of the
+ /// vector.
+ bool isLegalInterleavedAccessType(VectorType *VecTy,
+ const DataLayout &DL) const;
+
+ /// Returns the number of interleaved accesses that will be generated when
+ /// lowering accesses of the given type.
+ unsigned getNumInterleavedAccesses(VectorType *VecTy,
+ const DataLayout &DL) const;
+
+ MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;
+
private:
bool isExtFreeImpl(const Instruction *Ext) const override;
@@ -491,6 +526,18 @@ private:
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
+ SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+ SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+ SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+ SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+ unsigned Flag) const;
+ template <class NodeTy> SDValue getGOT(NodeTy *N, SelectionDAG &DAG) const;
+ template <class NodeTy>
+ SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG) const;
+ template <class NodeTy> SDValue getAddr(NodeTy *N, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -509,6 +556,7 @@ private:
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
@@ -536,6 +584,7 @@ private:
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
std::vector<SDNode *> *Created) const override;
@@ -576,7 +625,7 @@ private:
}
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
- bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+ bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
ISD::MemIndexedMode &AM, bool &IsInc,
SelectionDAG &DAG) const;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index 867074c..eec41dd 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -14,6 +14,9 @@
//===----------------------------------
// Atomic fences
//===----------------------------------
+let AddedComplexity = 15, Size = 0 in
+def CompilerBarrier : Pseudo<(outs), (ins i32imm:$ordering),
+ [(atomic_fence imm:$ordering, 0)]>, Sched<[]>;
def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>;
def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
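
The new pseudo matches fences whose second operand (the synchronization scope) is 0, i.e. single-thread fences that only need to constrain the compiler. A short illustration of the three fence patterns in this hunk, with the expected selection noted in comments:

#include <atomic>

void fences() {
  std::atomic_signal_fence(std::memory_order_seq_cst); // CompilerBarrier:
                                                       // no instruction emitted
  std::atomic_thread_fence(std::memory_order_acquire); // ordering 4 -> "dmb ishld" (0x9)
  std::atomic_thread_fence(std::memory_order_seq_cst); // anything else -> "dmb ish" (0xb)
}
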
@@ -402,3 +405,59 @@ def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$scratch),
(ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi,
GPR64:$newLo, GPR64:$newHi), []>,
Sched<[WriteAtomic]>;
+
+// v8.1 Atomic instructions:
+def : Pat<(atomic_load_add_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_add_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_add_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_add_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_or_8 GPR64:$Rn, GPR32:$Rs), (LDSETALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_or_16 GPR64:$Rn, GPR32:$Rs), (LDSETALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_or_32 GPR64:$Rn, GPR32:$Rs), (LDSETALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_or_64 GPR64:$Rn, GPR64:$Rs), (LDSETALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_xor_8 GPR64:$Rn, GPR32:$Rs), (LDEORALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_xor_16 GPR64:$Rn, GPR32:$Rs), (LDEORALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_xor_32 GPR64:$Rn, GPR32:$Rs), (LDEORALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_xor_64 GPR64:$Rn, GPR64:$Rs), (LDEORALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_max_8 GPR64:$Rn, GPR32:$Rs), (LDSMAXALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_max_16 GPR64:$Rn, GPR32:$Rs), (LDSMAXALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_max_32 GPR64:$Rn, GPR32:$Rs), (LDSMAXALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_max_64 GPR64:$Rn, GPR64:$Rs), (LDSMAXALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_umax_8 GPR64:$Rn, GPR32:$Rs), (LDUMAXALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umax_16 GPR64:$Rn, GPR32:$Rs), (LDUMAXALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umax_32 GPR64:$Rn, GPR32:$Rs), (LDUMAXALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umax_64 GPR64:$Rn, GPR64:$Rs), (LDUMAXALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_min_8 GPR64:$Rn, GPR32:$Rs), (LDSMINALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_min_16 GPR64:$Rn, GPR32:$Rs), (LDSMINALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_min_32 GPR64:$Rn, GPR32:$Rs), (LDSMINALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_min_64 GPR64:$Rn, GPR64:$Rs), (LDSMINALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_umin_8 GPR64:$Rn, GPR32:$Rs), (LDUMINALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umin_16 GPR64:$Rn, GPR32:$Rs), (LDUMINALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umin_32 GPR64:$Rn, GPR32:$Rs), (LDUMINALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_load_umin_64 GPR64:$Rn, GPR64:$Rs), (LDUMINALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_cmp_swap_8 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALb GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>;
+def : Pat<(atomic_cmp_swap_16 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALh GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>;
+def : Pat<(atomic_cmp_swap_32 GPR64:$Rn, GPR32:$Rold, GPR32:$Rnew), (CASALs GPR32:$Rold, GPR32:$Rnew, GPR64sp:$Rn)>;
+def : Pat<(atomic_cmp_swap_64 GPR64:$Rn, GPR64:$Rold, GPR64:$Rnew), (CASALd GPR64:$Rold, GPR64:$Rnew, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_swap_8 GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>;
+def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_sub_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>;
+def : Pat<(atomic_load_sub_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>;
+def : Pat<(atomic_load_sub_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>;
+def : Pat<(atomic_load_sub_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd (SUBXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>;
+
+def : Pat<(atomic_load_and_8 GPR64:$Rn, GPR32:$Rs), (LDCLRALb (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>;
+def : Pat<(atomic_load_and_16 GPR64:$Rn, GPR32:$Rs), (LDCLRALh (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>;
+def : Pat<(atomic_load_and_32 GPR64:$Rn, GPR32:$Rs), (LDCLRALs (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>;
+def : Pat<(atomic_load_and_64 GPR64:$Rn, GPR64:$Rs), (LDCLRALd (ORNXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>;
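+
+// The last two groups lean on simple identities, since LSE provides no atomic
+// subtract or AND: x - r is x + (-r), so subtraction maps to LDADDAL of a
+// negated operand (SUBWrr/SUBXrr from the zero register), and x & r is
+// x & ~(~r), so AND maps to LDCLRAL (atomic bit clear) of a complemented
+// operand (ORNWrr/ORNXrr from the zero register). A trivial C++ check of the
+// identities:
+//
+//   #include <cassert>
+//   #include <cstdint>
+//
+//   int main() {
+//     uint32_t X = 0xDEADBEEF, R = 0x1234;
+//     assert(X - R == X + (0u - R));  // sub as add-of-negation (mod 2^32)
+//     assert((X & R) == (X & ~(~R))); // and as clear-of-complement
+//     return 0;
+//   }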
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index cefdf51..c44daf3 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -39,6 +39,9 @@ class AArch64Inst<Format f, string cstr> : Instruction {
let Constraints = cstr;
}
+class InstSubst<string Asm, dag Result, bit EmitPriority = 0>
+ : InstAlias<Asm, Result, EmitPriority>, Requires<[UseNegativeImmediates]>;
+
// Pseudo instructions (don't have encoding information)
class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
: AArch64Inst<PseudoFrm, cstr> {
@@ -257,6 +260,7 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
class AsmImmRange<int Low, int High> : AsmOperandClass {
let Name = "Imm" # Low # "_" # High;
let DiagnosticType = "InvalidImm" # Low # "_" # High;
+ let PredicateMethod = "isImmInRange<" # Low # "," # High # ">";
}
def Imm1_8Operand : AsmImmRange<1, 8>;
@@ -264,6 +268,20 @@ def Imm1_16Operand : AsmImmRange<1, 16>;
def Imm1_32Operand : AsmImmRange<1, 32>;
def Imm1_64Operand : AsmImmRange<1, 64>;
+class BranchTarget<int N> : AsmOperandClass {
+ let Name = "BranchTarget" # N;
+ let DiagnosticType = "InvalidLabel";
+ let PredicateMethod = "isBranchTarget<" # N # ">";
+}
+
+class PCRelLabel<int N> : BranchTarget<N> {
+ let Name = "PCRelLabel" # N;
+}
+
+def BranchTarget14Operand : BranchTarget<14>;
+def BranchTarget26Operand : BranchTarget<26>;
+def PCRelLabel19Operand : PCRelLabel<19>;
+
def MovZSymbolG3AsmOperand : AsmOperandClass {
let Name = "MovZSymbolG3";
let RenderMethod = "addImmOperands";
@@ -500,7 +518,8 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
}
// imm0_255 predicate - True if the immediate is in the range [0,255].
-def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; }
+def Imm0_255Operand : AsmImmRange<0,255>;
+
def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 256;
}]> {
@@ -673,6 +692,14 @@ def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
def addsub_shifted_imm32_neg : addsub_shifted_imm_neg<i32>;
def addsub_shifted_imm64_neg : addsub_shifted_imm_neg<i64>;
+def gi_addsub_shifted_imm32 :
+ GIComplexOperandMatcher<s32, "selectArithImmed">,
+ GIComplexPatternEquiv<addsub_shifted_imm32>;
+
+def gi_addsub_shifted_imm64 :
+ GIComplexOperandMatcher<s64, "selectArithImmed">,
+ GIComplexPatternEquiv<addsub_shifted_imm64>;
+
class neg_addsub_shifted_imm<ValueType Ty>
: Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
let PrintMethod = "printAddSubImm";
@@ -1094,10 +1121,6 @@ def inv_ccode : Operand<i32> {
// Conditional branch target. 19-bit immediate. The low two bits of the target
// offset are implied zero and so are not part of the immediate.
-def PCRelLabel19Operand : AsmOperandClass {
- let Name = "PCRelLabel19";
- let DiagnosticType = "InvalidLabel";
-}
def am_brcond : Operand<OtherVT> {
let EncoderMethod = "getCondBranchTargetOpValue";
let DecoderMethod = "DecodePCRelLabel19";
@@ -1154,9 +1177,6 @@ multiclass CmpBranch<bit op, string asm, SDNode node> {
//---
// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of
// the target offset are implied zero and so are not part of the immediate.
-def BranchTarget14Operand : AsmOperandClass {
- let Name = "BranchTarget14";
-}
def am_tbrcond : Operand<OtherVT> {
let EncoderMethod = "getTestBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
@@ -1166,11 +1186,12 @@ def am_tbrcond : Operand<OtherVT> {
// AsmOperand classes to emit (or not) special diagnostics
def TBZImm0_31Operand : AsmOperandClass {
let Name = "TBZImm0_31";
- let PredicateMethod = "isImm0_31";
+ let PredicateMethod = "isImmInRange<0,31>";
let RenderMethod = "addImm0_31Operands";
}
def TBZImm32_63Operand : AsmOperandClass {
let Name = "Imm32_63";
+ let PredicateMethod = "isImmInRange<32,63>";
let DiagnosticType = "InvalidImm0_63";
}
@@ -1232,10 +1253,6 @@ multiclass TestBranch<bit op, string asm, SDNode node> {
//---
// Unconditional branch (immediate) instructions.
//---
-def BranchTarget26Operand : AsmOperandClass {
- let Name = "BranchTarget26";
- let DiagnosticType = "InvalidLabel";
-}
def am_b_target : Operand<OtherVT> {
let EncoderMethod = "getBranchTargetOpValue";
let PrintMethod = "printAlignedLabel";
@@ -1784,10 +1801,10 @@ multiclass AddSub<bit isSub, string mnemonic, string alias,
}
// add Rd, Rb, -imm -> sub Rd, Rn, imm
- def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
@@ -1859,10 +1876,10 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
} // Defs = [NZCV]
// Support negative immediates, e.g. adds Rd, Rn, -imm -> subs Rd, Rn, imm
- def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn,
addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+ def : InstSubst<alias#"\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn,
addsub_shifted_imm64_neg:$imm), 0>;
@@ -1883,9 +1900,9 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
// Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm
- def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
+ def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>;
- def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
+ def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>;
// Compare shorthands
@@ -2100,10 +2117,10 @@ multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
let Inst{31} = 1;
}
- def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
- def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
}
@@ -2122,10 +2139,10 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
}
} // end Defs = [NZCV]
- def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
logical_imm32_not:$imm), 0>;
- def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+ def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
(!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
logical_imm64_not:$imm), 0>;
}
@@ -2454,7 +2471,7 @@ class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
// Load literal address: 19-bit immediate. The low two bits of the target
// offset are implied zero and so are not part of the immediate.
-def am_ldrlit : Operand<OtherVT> {
+def am_ldrlit : Operand<iPTR> {
let EncoderMethod = "getLoadLiteralOpValue";
let DecoderMethod = "DecodePCRelLabel19";
let PrintMethod = "printAlignedLabel";
@@ -9060,7 +9077,7 @@ multiclass SIMDLdSt4SingleAliases<string asm> {
// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract
//----------------------------------------------------------------------------
-let Predicates = [HasNEON, HasV8_1a] in {
+let Predicates = [HasNEON, HasRDM] in {
class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
RegisterOperand regtype, string asm,
@@ -9221,7 +9238,7 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
let Inst{21} = idx{0};
}
}
-} // let Predicates = [HasNeon, HasV8_1a]
+} // let Predicates = [HasNeon, HasRDM]
//----------------------------------------------------------------------------
// Crypto extensions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 4c78992..c0c6055 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -12,12 +12,13 @@
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -51,9 +52,6 @@ using namespace llvm;
#define GET_INSTRINFO_CTOR_DTOR
#include "AArch64GenInstrInfo.inc"
-static const MachineMemOperand::Flags MOSuppressPair =
- MachineMemOperand::MOTargetFlag1;
-
static cl::opt<unsigned>
TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
@@ -369,7 +367,7 @@ void AArch64InstrInfo::instantiateCondBranch(
// Folded compare-and-branch
// Note that we use addOperand instead of addReg to keep the flags.
const MachineInstrBuilder MIB =
- BuildMI(&MBB, DL, get(Cond[1].getImm())).addOperand(Cond[2]);
+ BuildMI(&MBB, DL, get(Cond[1].getImm())).add(Cond[2]);
if (Cond.size() > 3)
MIB.addImm(Cond[3].getImm());
MIB.addMBB(TBB);
@@ -762,6 +760,128 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
llvm_unreachable("Unknown opcode to check as cheap as a move!");
}
+bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+
+ case AArch64::ADDWrs:
+ case AArch64::ADDXrs:
+ case AArch64::ADDSWrs:
+ case AArch64::ADDSXrs: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
+ if (ShiftVal == 0)
+ return true;
+ return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5;
+ }
+
+ case AArch64::ADDWrx:
+ case AArch64::ADDXrx:
+ case AArch64::ADDXrx64:
+ case AArch64::ADDSWrx:
+ case AArch64::ADDSXrx:
+ case AArch64::ADDSXrx64: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ switch (AArch64_AM::getArithExtendType(Imm)) {
+ default:
+ return false;
+ case AArch64_AM::UXTB:
+ case AArch64_AM::UXTH:
+ case AArch64_AM::UXTW:
+ case AArch64_AM::UXTX:
+ return AArch64_AM::getArithShiftValue(Imm) <= 4;
+ }
+ }
+
+ case AArch64::SUBWrs:
+ case AArch64::SUBSWrs: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
+ return ShiftVal == 0 ||
+ (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31);
+ }
+
+ case AArch64::SUBXrs:
+ case AArch64::SUBSXrs: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ unsigned ShiftVal = AArch64_AM::getShiftValue(Imm);
+ return ShiftVal == 0 ||
+ (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63);
+ }
+
+ case AArch64::SUBWrx:
+ case AArch64::SUBXrx:
+ case AArch64::SUBXrx64:
+ case AArch64::SUBSWrx:
+ case AArch64::SUBSXrx:
+ case AArch64::SUBSXrx64: {
+ unsigned Imm = MI.getOperand(3).getImm();
+ switch (AArch64_AM::getArithExtendType(Imm)) {
+ default:
+ return false;
+ case AArch64_AM::UXTB:
+ case AArch64_AM::UXTH:
+ case AArch64_AM::UXTW:
+ case AArch64_AM::UXTX:
+ return AArch64_AM::getArithShiftValue(Imm) == 0;
+ }
+ }
+
+ case AArch64::LDRBBroW:
+ case AArch64::LDRBBroX:
+ case AArch64::LDRBroW:
+ case AArch64::LDRBroX:
+ case AArch64::LDRDroW:
+ case AArch64::LDRDroX:
+ case AArch64::LDRHHroW:
+ case AArch64::LDRHHroX:
+ case AArch64::LDRHroW:
+ case AArch64::LDRHroX:
+ case AArch64::LDRQroW:
+ case AArch64::LDRQroX:
+ case AArch64::LDRSBWroW:
+ case AArch64::LDRSBWroX:
+ case AArch64::LDRSBXroW:
+ case AArch64::LDRSBXroX:
+ case AArch64::LDRSHWroW:
+ case AArch64::LDRSHWroX:
+ case AArch64::LDRSHXroW:
+ case AArch64::LDRSHXroX:
+ case AArch64::LDRSWroW:
+ case AArch64::LDRSWroX:
+ case AArch64::LDRSroW:
+ case AArch64::LDRSroX:
+ case AArch64::LDRWroW:
+ case AArch64::LDRWroX:
+ case AArch64::LDRXroW:
+ case AArch64::LDRXroX:
+ case AArch64::PRFMroW:
+ case AArch64::PRFMroX:
+ case AArch64::STRBBroW:
+ case AArch64::STRBBroX:
+ case AArch64::STRBroW:
+ case AArch64::STRBroX:
+ case AArch64::STRDroW:
+ case AArch64::STRDroX:
+ case AArch64::STRHHroW:
+ case AArch64::STRHHroX:
+ case AArch64::STRHroW:
+ case AArch64::STRHroX:
+ case AArch64::STRQroW:
+ case AArch64::STRQroX:
+ case AArch64::STRSroW:
+ case AArch64::STRSroX:
+ case AArch64::STRWroW:
+ case AArch64::STRWroX:
+ case AArch64::STRXroW:
+ case AArch64::STRXroX: {
+ unsigned IsSigned = MI.getOperand(3).getImm();
+ return !IsSigned;
+ }
+ }
+}
+
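
A condensed restatement of the shifted ADD/SUB arm of this hook, on the assumption (taken from the code above) that Falkor only keeps the base latency for an LSL of at most 5 or for no shift at all; the example instructions in the comments use placeholder registers:

bool isFastShiftedAdd(bool IsLSL, unsigned ShiftVal) {
  return ShiftVal == 0 || (IsLSL && ShiftVal <= 5);
}
// e.g.  add x0, x1, x2, lsl #4  -> fast
//       add x0, x1, x2, lsl #6  -> extra latency
//       add x0, x1, x2, asr #3  -> extra latency
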
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const {
@@ -913,7 +1033,7 @@ static bool UpdateOperandRegClass(MachineInstr &Instr) {
/// \brief Return the opcode that does not set flags when possible - otherwise
/// return the original opcode. The caller is responsible for doing the actual
/// substitution and legality checking.
-static unsigned convertFlagSettingOpcode(const MachineInstr &MI) {
+static unsigned convertToNonFlagSettingOpc(const MachineInstr &MI) {
// Don't convert all compare instructions, because for some the zero register
// encoding becomes the sp register.
bool MIDefinesZeroReg = false;
@@ -1022,7 +1142,7 @@ bool AArch64InstrInfo::optimizeCompareInstr(
return true;
}
unsigned Opc = CmpInstr.getOpcode();
- unsigned NewOpc = convertFlagSettingOpcode(CmpInstr);
+ unsigned NewOpc = convertToNonFlagSettingOpc(CmpInstr);
if (NewOpc == Opc)
return false;
const MCInstrDesc &MCID = get(NewOpc);
@@ -1159,6 +1279,7 @@ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
case AArch64CC::HI: // Z clear and C set
case AArch64CC::LS: // Z set or C clear
UsedFlags.Z = true;
+ LLVM_FALLTHROUGH;
case AArch64CC::HS: // C set
case AArch64CC::LO: // C clear
UsedFlags.C = true;
@@ -1177,6 +1298,7 @@ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) {
case AArch64CC::GT: // Z clear, N and V the same
case AArch64CC::LE: // Z set, N and V differ
UsedFlags.Z = true;
+ LLVM_FALLTHROUGH;
case AArch64CC::GE: // N and V the same
case AArch64CC::LT: // N and V differ
UsedFlags.N = true;
@@ -1299,16 +1421,16 @@ bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
.addMemOperand(*MI.memoperands_begin());
} else if (TM.getCodeModel() == CodeModel::Large) {
BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg)
- .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48);
+ .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
- .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32);
+ .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
- .addGlobalAddress(GV, 0, AArch64II::MO_G1 | MO_NC).addImm(16);
+ .addGlobalAddress(GV, 0, AArch64II::MO_G2 | MO_NC).addImm(32);
BuildMI(MBB, MI, DL, get(AArch64::MOVKXi), Reg)
.addReg(Reg, RegState::Kill)
- .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0);
+ .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48);
BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg)
.addReg(Reg, RegState::Kill)
.addImm(0)
@@ -1345,14 +1467,6 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const {
case AArch64::BICSXrs:
case AArch64::BICWrs:
case AArch64::BICXrs:
- case AArch64::CRC32Brr:
- case AArch64::CRC32CBrr:
- case AArch64::CRC32CHrr:
- case AArch64::CRC32CWrr:
- case AArch64::CRC32CXrr:
- case AArch64::CRC32Hrr:
- case AArch64::CRC32Wrr:
- case AArch64::CRC32Xrr:
case AArch64::EONWrs:
case AArch64::EONXrs:
case AArch64::EORWrs:
@@ -1598,6 +1712,13 @@ void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const {
(*MI.memoperands_begin())->setFlags(MOSuppressPair);
}
+/// Check all MachineMemOperands for a hint that the load/store is strided.
+bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) const {
+ return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
+ return MMO->getFlags() & MOStridedAccess;
+ });
+}
+
bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const {
switch (Opc) {
default:
@@ -1691,16 +1812,59 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
} else
return false;
- // Offset is calculated as the immediate operand multiplied by the scaling factor.
- // Unscaled instructions have scaling factor set to 1.
+ // Get the scaling factor for the instruction and set the width for the
+ // instruction.
unsigned Scale = 0;
- switch (LdSt.getOpcode()) {
+ int64_t Dummy1, Dummy2;
+
+ // If this returns false, then it's an instruction we don't want to handle.
+ if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
+ return false;
+
+ // Compute the offset. Offset is calculated as the immediate operand
+ // multiplied by the scaling factor. Unscaled instructions have scaling factor
+ // set to 1.
+ if (LdSt.getNumExplicitOperands() == 3) {
+ BaseReg = LdSt.getOperand(1).getReg();
+ Offset = LdSt.getOperand(2).getImm() * Scale;
+ } else {
+ assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
+ BaseReg = LdSt.getOperand(2).getReg();
+ Offset = LdSt.getOperand(3).getImm() * Scale;
+ }
+ return true;
+}
+
+MachineOperand&
+AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
+ assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
+ MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands()-1);
+ assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
+ return OfsOp;
+}
+
+bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
+ unsigned &Width, int64_t &MinOffset,
+ int64_t &MaxOffset) const {
+ switch (Opcode) {
+ // Not a memory operation, or not one we want to handle.
default:
+ Scale = Width = 0;
+ MinOffset = MaxOffset = 0;
return false;
+ case AArch64::STRWpost:
+ case AArch64::LDRWpost:
+ Width = 32;
+ Scale = 4;
+ MinOffset = -256;
+ MaxOffset = 255;
+ break;
case AArch64::LDURQi:
case AArch64::STURQi:
Width = 16;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDURXi:
case AArch64::LDURDi:
@@ -1708,6 +1872,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STURDi:
Width = 8;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDURWi:
case AArch64::LDURSi:
@@ -1716,6 +1882,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STURSi:
Width = 4;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDURHi:
case AArch64::LDURHHi:
@@ -1725,6 +1893,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STURHHi:
Width = 2;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDURBi:
case AArch64::LDURBBi:
@@ -1734,6 +1904,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STURBBi:
Width = 1;
Scale = 1;
+ MinOffset = -256;
+ MaxOffset = 255;
break;
case AArch64::LDPQi:
case AArch64::LDNPQi:
@@ -1741,10 +1913,14 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STNPQi:
Scale = 16;
Width = 32;
+ MinOffset = -64;
+ MaxOffset = 63;
break;
case AArch64::LDRQui:
case AArch64::STRQui:
Scale = Width = 16;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
case AArch64::LDPXi:
case AArch64::LDPDi:
@@ -1756,12 +1932,16 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STNPDi:
Scale = 8;
Width = 16;
+ MinOffset = -64;
+ MaxOffset = 63;
break;
case AArch64::LDRXui:
case AArch64::LDRDui:
case AArch64::STRXui:
case AArch64::STRDui:
Scale = Width = 8;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
case AArch64::LDPWi:
case AArch64::LDPSi:
@@ -1773,6 +1953,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STNPSi:
Scale = 4;
Width = 8;
+ MinOffset = -64;
+ MaxOffset = 63;
break;
case AArch64::LDRWui:
case AArch64::LDRSui:
@@ -1780,29 +1962,27 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
case AArch64::STRWui:
case AArch64::STRSui:
Scale = Width = 4;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
case AArch64::LDRHui:
case AArch64::LDRHHui:
case AArch64::STRHui:
case AArch64::STRHHui:
Scale = Width = 2;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
case AArch64::LDRBui:
case AArch64::LDRBBui:
case AArch64::STRBui:
case AArch64::STRBBui:
Scale = Width = 1;
+ MinOffset = 0;
+ MaxOffset = 4095;
break;
}
- if (LdSt.getNumExplicitOperands() == 3) {
- BaseReg = LdSt.getOperand(1).getReg();
- Offset = LdSt.getOperand(2).getImm() * Scale;
- } else {
- assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
- BaseReg = LdSt.getOperand(2).getReg();
- Offset = LdSt.getOperand(3).getImm() * Scale;
- }
return true;
}
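
A worked example of how the encoded immediate, Scale, and the new MinOffset/MaxOffset fields relate, using the LDRXui row above (Scale = 8, immediate range 0..4095):

#include <cassert>

int main() {
  const int Scale = 8, MinImm = 0, MaxImm = 4095;
  int Imm = 12;                 // encoded immediate operand
  int ByteOffset = Imm * Scale; // what getMemOpBaseRegImmOfsWidth reports
  assert(ByteOffset == 96);     // i.e. "ldr x0, [base, #96]"
  assert(MinImm * Scale == 0 && MaxImm * Scale == 32760); // reachable bytes
  return 0;
}
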
@@ -1903,88 +2083,6 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
return Offset1 + 1 == Offset2;
}
-bool AArch64InstrInfo::shouldScheduleAdjacent(
- const MachineInstr &First, const MachineInstr &Second) const {
- if (Subtarget.hasArithmeticBccFusion()) {
- // Fuse CMN, CMP, TST followed by Bcc.
- unsigned SecondOpcode = Second.getOpcode();
- if (SecondOpcode == AArch64::Bcc) {
- switch (First.getOpcode()) {
- default:
- return false;
- case AArch64::ADDSWri:
- case AArch64::ADDSWrr:
- case AArch64::ADDSXri:
- case AArch64::ADDSXrr:
- case AArch64::ANDSWri:
- case AArch64::ANDSWrr:
- case AArch64::ANDSXri:
- case AArch64::ANDSXrr:
- case AArch64::SUBSWri:
- case AArch64::SUBSWrr:
- case AArch64::SUBSXri:
- case AArch64::SUBSXrr:
- case AArch64::BICSWrr:
- case AArch64::BICSXrr:
- return true;
- case AArch64::ADDSWrs:
- case AArch64::ADDSXrs:
- case AArch64::ANDSWrs:
- case AArch64::ANDSXrs:
- case AArch64::SUBSWrs:
- case AArch64::SUBSXrs:
- case AArch64::BICSWrs:
- case AArch64::BICSXrs:
- // Shift value can be 0 making these behave like the "rr" variant...
- return !hasShiftedReg(Second);
- }
- }
- }
- if (Subtarget.hasArithmeticCbzFusion()) {
- // Fuse ALU operations followed by CBZ/CBNZ.
- unsigned SecondOpcode = Second.getOpcode();
- if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
- SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
- switch (First.getOpcode()) {
- default:
- return false;
- case AArch64::ADDWri:
- case AArch64::ADDWrr:
- case AArch64::ADDXri:
- case AArch64::ADDXrr:
- case AArch64::ANDWri:
- case AArch64::ANDWrr:
- case AArch64::ANDXri:
- case AArch64::ANDXrr:
- case AArch64::EORWri:
- case AArch64::EORWrr:
- case AArch64::EORXri:
- case AArch64::EORXrr:
- case AArch64::ORRWri:
- case AArch64::ORRWrr:
- case AArch64::ORRXri:
- case AArch64::ORRXrr:
- case AArch64::SUBWri:
- case AArch64::SUBWrr:
- case AArch64::SUBXri:
- case AArch64::SUBXrr:
- return true;
- case AArch64::ADDWrs:
- case AArch64::ADDXrs:
- case AArch64::ANDWrs:
- case AArch64::ANDXrs:
- case AArch64::SUBWrs:
- case AArch64::SUBXrs:
- case AArch64::BICWrs:
- case AArch64::BICXrs:
- // Shift value can be 0 making these behave like the "rr" variant...
- return !hasShiftedReg(Second);
- }
- }
- }
- return false;
-}
-
MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
const MDNode *Expr, const DebugLoc &DL) const {
@@ -2339,7 +2437,7 @@ void AArch64InstrInfo::storeRegToStackSlot(
PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
unsigned Opc = 0;
bool Offset = true;
- switch (RC->getSize()) {
+ switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
Opc = AArch64::STRBui;
@@ -2443,7 +2541,7 @@ void AArch64InstrInfo::loadRegFromStackSlot(
unsigned Opc = 0;
bool Offset = true;
- switch (RC->getSize()) {
+ switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRBui;
@@ -2668,7 +2766,8 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
};
if (DstMO.getSubReg() == 0 && SrcMO.getSubReg() == 0) {
- assert(getRegClass(DstReg)->getSize() == getRegClass(SrcReg)->getSize() &&
+ assert(TRI.getRegSizeInBits(*getRegClass(DstReg)) ==
+ TRI.getRegSizeInBits(*getRegClass(SrcReg)) &&
"Mismatched register size in non subreg COPY");
if (IsSpill)
storeRegToStackSlot(MBB, InsertPt, SrcReg, SrcMO.isKill(), FrameIndex,
@@ -2754,7 +2853,8 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
}
if (FillRC) {
- assert(getRegClass(SrcReg)->getSize() == FillRC->getSize() &&
+ assert(TRI.getRegSizeInBits(*getRegClass(SrcReg)) ==
+ TRI.getRegSizeInBits(*FillRC) &&
"Mismatched regclass size on folded subreg COPY");
loadRegFromStackSlot(MBB, InsertPt, DstReg, FrameIndex, FillRC, &TRI);
MachineInstr &LoadMI = *--InsertPt;
@@ -3044,7 +3144,7 @@ bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
return false;
}
-void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+void AArch64InstrInfo::getNoop(MCInst &NopInst) const {
NopInst.setOpcode(AArch64::HINT);
NopInst.addOperand(MCOperand::createImm(0));
}
@@ -3224,7 +3324,7 @@ static bool getMaddPatterns(MachineInstr &Root,
// When NZCV is live bail out.
if (Cmp_NZCV == -1)
return false;
- unsigned NewOpc = convertFlagSettingOpcode(Root);
+ unsigned NewOpc = convertToNonFlagSettingOpc(Root);
// When opcode can't change bail out.
// CHECKME: do we miss any cases for opcode conversion?
if (NewOpc == Opc)
@@ -3444,6 +3544,10 @@ static bool getFMAPatterns(MachineInstr &Root,
Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2);
Found = true;
}
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) {
+ Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1);
+ Found = true;
+ }
break;
case AArch64::FSUBDrr:
if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) {
@@ -3458,6 +3562,10 @@ static bool getFMAPatterns(MachineInstr &Root,
Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2);
Found = true;
}
+ if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) {
+ Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1);
+ Found = true;
+ }
break;
case AArch64::FSUBv2f32:
if (canCombineWithFMUL(MBB, Root.getOperand(2),
@@ -3512,6 +3620,8 @@ AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
case MachineCombinerPattern::FMULADDD_OP2:
case MachineCombinerPattern::FMULSUBD_OP1:
case MachineCombinerPattern::FMULSUBD_OP2:
+ case MachineCombinerPattern::FNMULSUBS_OP1:
+ case MachineCombinerPattern::FNMULSUBD_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
@@ -3565,12 +3675,17 @@ enum class FMAInstKind { Default, Indexed, Accumulator };
/// F|MUL I=A,B,0
/// F|ADD R,I,C
/// ==> F|MADD R,A,B,C
+/// \param MF Containing MachineFunction
+/// \param MRI Register information
+/// \param TII Target information
/// \param Root is the F|ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
/// \param IdxMulOpd is index of operand in Root that is the result of
/// the F|MUL. In the example above IdxMulOpd is 1.
/// \param MaddOpc the opcode of the f|madd instruction
+/// \param RC Register class of operands
+/// \param kind The kind of fma instruction (addressing mode) to be generated
static MachineInstr *
genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
const TargetInstrInfo *TII, MachineInstr &Root,
@@ -3629,6 +3744,9 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
/// ADD R,I,Imm
/// ==> ORR V, ZR, Imm
/// ==> MADD R,A,B,V
+/// \param MF Containing MachineFunction
+/// \param MRI Register information
+/// \param TII Target information
/// \param Root is the ADD instruction
/// \param [out] InsInstrs is a vector of machine instructions and will
/// contain the generated madd instruction
@@ -3637,6 +3755,7 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
/// \param MaddOpc the opcode of the madd instruction
/// \param VR is a virtual register that holds the value of an ADD operand
/// (V in the example above).
+/// \param RC Register class of operands
static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI,
const TargetInstrInfo *TII, MachineInstr &Root,
SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -3793,7 +3912,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MachineInstrBuilder MIB1 =
BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc), NewVR)
.addReg(ZeroReg)
- .addOperand(Root.getOperand(2));
+ .add(Root.getOperand(2));
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
@@ -4013,6 +4132,24 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
break;
}
+
+ case MachineCombinerPattern::FNMULSUBS_OP1:
+ case MachineCombinerPattern::FNMULSUBD_OP1: {
+ // FNMUL I=A,B,0
+ // FSUB R,I,C
+ // ==> FNMADD R,A,B,C // = -A*B - C
+ // --- Create(FNMADD);
+ if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) {
+ Opc = AArch64::FNMADDSrrr;
+ RC = &AArch64::FPR32RegClass;
+ } else {
+ Opc = AArch64::FNMADDDrrr;
+ RC = &AArch64::FPR64RegClass;
+ }
+ MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ }
+
case MachineCombinerPattern::FMULSUBS_OP2:
case MachineCombinerPattern::FMULSUBD_OP2: {
// FMUL I=A,B,0
@@ -4028,6 +4165,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
+ }
case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
Opc = AArch64::FMLSv1i32_indexed;
@@ -4084,7 +4222,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
FMAInstKind::Accumulator);
}
break;
- }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
DelInstrs.push_back(MUL);
@@ -4094,26 +4231,36 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
/// \brief Replace csincr-branch sequence by simple conditional branch
///
/// Examples:
-/// 1.
+/// 1. \code
/// csinc w9, wzr, wzr, <condition code>
/// tbnz w9, #0, 0x44
+/// \endcode
/// to
+/// \code
/// b.<inverted condition code>
+/// \endcode
///
-/// 2.
+/// 2. \code
/// csinc w9, wzr, wzr, <condition code>
/// tbz w9, #0, 0x44
+/// \endcode
/// to
+/// \code
/// b.<condition code>
+/// \endcode
///
/// Replace compare and branch sequence by TBZ/TBNZ instruction when the
/// compare's constant operand is power of 2.
///
/// Examples:
+/// \code
/// and w8, w8, #0x400
/// cbnz w8, L1
+/// \endcode
/// to
+/// \code
/// tbnz w8, #10, L1
+/// \endcode
///
/// \param MI Conditional Branch
/// \return True when the simple conditional branch is generated
@@ -4286,3 +4433,207 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
{MO_TLS, "aarch64-tls"}};
return makeArrayRef(TargetFlags);
}
+
+ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
+AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
+ static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
+ {{MOSuppressPair, "aarch64-suppress-pair"},
+ {MOStridedAccess, "aarch64-strided-access"}};
+ return makeArrayRef(TargetFlags);
+}
+
+unsigned AArch64InstrInfo::getOutliningBenefit(size_t SequenceSize,
+ size_t Occurrences,
+ bool CanBeTailCall) const {
+ unsigned NotOutlinedSize = SequenceSize * Occurrences;
+ unsigned OutlinedSize;
+
+ // Is this candidate something we can outline as a tail call?
+ if (CanBeTailCall) {
+ // If yes, then we just outline the sequence and replace each of its
+ // occurrences with a branch instruction.
+ OutlinedSize = SequenceSize + Occurrences;
+ } else {
+ // If no, then we outline the sequence (SequenceSize), add a return (+1),
+ // and replace each occurrence with a save/restore to LR and a call
+ // (3 * Occurrences)
+ OutlinedSize = (SequenceSize + 1) + (3 * Occurrences);
+ }
+
+ // Return the number of instructions saved by outlining this sequence.
+ return NotOutlinedSize > OutlinedSize ? NotOutlinedSize - OutlinedSize : 0;
+}
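
A worked example of the cost model above: a 5-instruction candidate that occurs 3 times.

#include <cassert>

int main() {
  const unsigned SequenceSize = 5, Occurrences = 3;
  unsigned NotOutlined = SequenceSize * Occurrences;           // 15
  unsigned AsTailCall  = SequenceSize + Occurrences;           // 5 + 3 = 8
  unsigned WithCalls   = (SequenceSize + 1) + 3 * Occurrences; // 6 + 9 = 15
  assert(NotOutlined - AsTailCall == 7); // benefit when a tail call is possible
  assert(!(NotOutlined > WithCalls));    // otherwise the hook returns 0
  return 0;
}
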
+
+bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF) const {
+ return MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+}
+
+AArch64GenInstrInfo::MachineOutlinerInstrType
+AArch64InstrInfo::getOutliningType(MachineInstr &MI) const {
+
+ MachineFunction *MF = MI.getParent()->getParent();
+ AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+
+ // Don't outline LOHs.
+ if (FuncInfo->getLOHRelated().count(&MI))
+ return MachineOutlinerInstrType::Illegal;
+
+ // Don't allow debug values to impact outlining type.
+ if (MI.isDebugValue() || MI.isIndirectDebugValue())
+ return MachineOutlinerInstrType::Invisible;
+
+ // Is this a terminator for a basic block?
+ if (MI.isTerminator()) {
+
+ // Is this the end of a function?
+ if (MI.getParent()->succ_empty())
+ return MachineOutlinerInstrType::Legal;
+
+ // It's not, so don't outline it.
+ return MachineOutlinerInstrType::Illegal;
+ }
+
+ // Don't outline positions.
+ if (MI.isPosition())
+ return MachineOutlinerInstrType::Illegal;
+
+ // Make sure none of the operands are un-outlinable.
+ for (const MachineOperand &MOP : MI.operands())
+ if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
+ MOP.isTargetIndex())
+ return MachineOutlinerInstrType::Illegal;
+
+ // Don't outline anything that uses the link register.
+ if (MI.modifiesRegister(AArch64::LR, &RI) ||
+ MI.readsRegister(AArch64::LR, &RI))
+ return MachineOutlinerInstrType::Illegal;
+
+ // Does this use the stack?
+ if (MI.modifiesRegister(AArch64::SP, &RI) ||
+ MI.readsRegister(AArch64::SP, &RI)) {
+
+ // Is it a memory operation?
+ if (MI.mayLoadOrStore()) {
+ unsigned Base; // Filled with the base register of MI.
+ int64_t Offset; // Filled with the offset of MI.
+ unsigned DummyWidth;
+
+ // Does it allow us to offset the base register and is the base SP?
+ if (!getMemOpBaseRegImmOfsWidth(MI, Base, Offset, DummyWidth, &RI) ||
+ Base != AArch64::SP)
+ return MachineOutlinerInstrType::Illegal;
+
+ // Find the minimum/maximum offset for this instruction and check if
+ // fixing it up would be in range.
+ int64_t MinOffset, MaxOffset;
+ unsigned DummyScale;
+ getMemOpInfo(MI.getOpcode(), DummyScale, DummyWidth, MinOffset,
+ MaxOffset);
+
+ // TODO: We should really test what happens if an instruction overflows.
+ // This is tricky to test with IR tests, but when the outliner is moved
+ // to a MIR test, it really ought to be checked.
+ if (Offset + 16 < MinOffset || Offset + 16 > MaxOffset)
+ return MachineOutlinerInstrType::Illegal;
+
+ // It's in range, so we can outline it.
+ return MachineOutlinerInstrType::Legal;
+ }
+
+ // We can't fix it up, so don't outline it.
+ return MachineOutlinerInstrType::Illegal;
+ }
+
+ return MachineOutlinerInstrType::Legal;
+}
+
+void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
+ for (MachineInstr &MI : MBB) {
+ unsigned Base, Width;
+ int64_t Offset;
+
+ // Is this a load or store with an immediate offset with SP as the base?
+ if (!MI.mayLoadOrStore() ||
+ !getMemOpBaseRegImmOfsWidth(MI, Base, Offset, Width, &RI) ||
+ Base != AArch64::SP)
+ continue;
+
+ // It is, so we have to fix it up.
+ unsigned Scale;
+ int64_t Dummy1, Dummy2;
+
+ MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
+ assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
+ getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
+ assert(Scale != 0 && "Unexpected opcode!");
+
+ // We've pushed the return address to the stack, so add 16 to the offset.
+ // This is safe, since we already checked if it would overflow when we
+ // checked if this instruction was legal to outline.
+ int64_t NewImm = (Offset + 16)/Scale;
+ StackOffsetOperand.setImm(NewImm);
+ }
+}
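
A worked example of the rewrite: the call sequence built further down spills LR with "str x30, [sp, #-16]!" and restores it with "ldr x30, [sp], #16", so every SP-relative access inside the outlined body is 16 bytes further from SP than it was at the call site.

#include <cassert>

int main() {
  // Original access in the outlined body: "str x0, [sp, #16]", i.e. STRXui
  // with Scale = 8 and an encoded immediate of 2.
  const int Scale = 8;
  int ByteOffset = 2 * Scale;             // 16
  int NewImm = (ByteOffset + 16) / Scale; // same arithmetic as above
  assert(NewImm == 4);                    // rewritten to "str x0, [sp, #32]"
  return 0;
}
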
+
+void AArch64InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const {
+
+ // If this is a tail call outlined function, then there's already a return.
+ if (IsTailCall)
+ return;
+
+ // It's not a tail call, so we have to insert the return ourselves.
+ MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
+ .addReg(AArch64::LR, RegState::Undef);
+ MBB.insert(MBB.end(), ret);
+
+ // Walk over the basic block and fix up all the stack accesses.
+ fixupPostOutline(MBB);
+}
+
+void AArch64InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const {}
+
+MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
+ Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
+ MachineFunction &MF, bool IsTailCall) const {
+
+ // Are we tail calling?
+ if (IsTailCall) {
+ // If yes, then we can just branch to the label.
+ It = MBB.insert(It,
+ BuildMI(MF, DebugLoc(), get(AArch64::B))
+ .addGlobalAddress(M.getNamedValue(MF.getName())));
+ return It;
+ }
+
+ // We're not tail calling, so we have to save LR before the call and restore
+ // it after.
+ MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
+ .addReg(AArch64::SP, RegState::Define)
+ .addReg(AArch64::LR)
+ .addReg(AArch64::SP)
+ .addImm(-16);
+ It = MBB.insert(It, STRXpre);
+ It++;
+
+ // Insert the call.
+ It = MBB.insert(It,
+ BuildMI(MF, DebugLoc(), get(AArch64::BL))
+ .addGlobalAddress(M.getNamedValue(MF.getName())));
+
+ It++;
+
+ // Restore the link register.
+ MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
+ .addReg(AArch64::SP, RegState::Define)
+ .addReg(AArch64::LR)
+ .addReg(AArch64::SP)
+ .addImm(16);
+ It = MBB.insert(It, LDRXpost);
+
+ return It;
+}
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 5037866..1765a02 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -27,6 +27,13 @@ namespace llvm {
class AArch64Subtarget;
class AArch64TargetMachine;
+static const MachineMemOperand::Flags MOSuppressPair =
+ MachineMemOperand::MOTargetFlag1;
+static const MachineMemOperand::Flags MOStridedAccess =
+ MachineMemOperand::MOTargetFlag2;
+
+#define FALKOR_STRIDED_ACCESS_MD "falkor.strided.access"
+
class AArch64InstrInfo final : public AArch64GenInstrInfo {
const AArch64RegisterInfo RI;
const AArch64Subtarget &Subtarget;
@@ -81,6 +88,9 @@ public:
/// unprofitable.
bool isLdStPairSuppressed(const MachineInstr &MI) const;
+ /// Return true if the given load or store is a strided memory access.
+ bool isStridedAccess(const MachineInstr &MI) const;
+
/// Return true if this is an unscaled load/store.
bool isUnscaledLdSt(unsigned Opc) const;
@@ -119,6 +129,44 @@ public:
}
}
+ /// \brief Return the opcode that sets flags when possible. The caller is
+ /// responsible for ensuring the opc has a flag setting equivalent.
+ static unsigned convertToFlagSettingOpc(unsigned Opc, bool &Is64Bit) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Opcode has no flag setting equivalent!");
+ // 32-bit cases:
+ case AArch64::ADDWri: Is64Bit = false; return AArch64::ADDSWri;
+ case AArch64::ADDWrr: Is64Bit = false; return AArch64::ADDSWrr;
+ case AArch64::ADDWrs: Is64Bit = false; return AArch64::ADDSWrs;
+ case AArch64::ADDWrx: Is64Bit = false; return AArch64::ADDSWrx;
+ case AArch64::ANDWri: Is64Bit = false; return AArch64::ANDSWri;
+ case AArch64::ANDWrr: Is64Bit = false; return AArch64::ANDSWrr;
+ case AArch64::ANDWrs: Is64Bit = false; return AArch64::ANDSWrs;
+ case AArch64::BICWrr: Is64Bit = false; return AArch64::BICSWrr;
+ case AArch64::BICWrs: Is64Bit = false; return AArch64::BICSWrs;
+ case AArch64::SUBWri: Is64Bit = false; return AArch64::SUBSWri;
+ case AArch64::SUBWrr: Is64Bit = false; return AArch64::SUBSWrr;
+ case AArch64::SUBWrs: Is64Bit = false; return AArch64::SUBSWrs;
+ case AArch64::SUBWrx: Is64Bit = false; return AArch64::SUBSWrx;
+ // 64-bit cases:
+ case AArch64::ADDXri: Is64Bit = true; return AArch64::ADDSXri;
+ case AArch64::ADDXrr: Is64Bit = true; return AArch64::ADDSXrr;
+ case AArch64::ADDXrs: Is64Bit = true; return AArch64::ADDSXrs;
+ case AArch64::ADDXrx: Is64Bit = true; return AArch64::ADDSXrx;
+ case AArch64::ANDXri: Is64Bit = true; return AArch64::ANDSXri;
+ case AArch64::ANDXrr: Is64Bit = true; return AArch64::ANDSXrr;
+ case AArch64::ANDXrs: Is64Bit = true; return AArch64::ANDSXrs;
+ case AArch64::BICXrr: Is64Bit = true; return AArch64::BICSXrr;
+ case AArch64::BICXrs: Is64Bit = true; return AArch64::BICSXrs;
+ case AArch64::SUBXri: Is64Bit = true; return AArch64::SUBSXri;
+ case AArch64::SUBXrr: Is64Bit = true; return AArch64::SUBSXrr;
+ case AArch64::SUBXrs: Is64Bit = true; return AArch64::SUBSXrs;
+ case AArch64::SUBXrx: Is64Bit = true; return AArch64::SUBSXrx;
+ }
+ }
+
+
/// Return true if this is a load/store that can be potentially paired/merged.
bool isCandidateToMergeOrPair(MachineInstr &MI) const;
@@ -133,12 +181,19 @@ public:
int64_t &Offset, unsigned &Width,
const TargetRegisterInfo *TRI) const;
+ /// Return the immediate offset of the base register in a load/store \p LdSt.
+ MachineOperand &getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const;
+
+ /// \brief Returns true if opcode \p Opcode is a memory operation. If it is, set
+ /// \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly.
+ ///
+ /// For unscaled instructions, \p Scale is set to 1.
+ bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width,
+ int64_t &MinOffset, int64_t &MaxOffset) const;
+
bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
unsigned NumLoads) const override;
- bool shouldScheduleAdjacent(const MachineInstr &First,
- const MachineInstr &Second) const override;
-
MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
uint64_t Offset, const MDNode *Var,
const MDNode *Expr,
@@ -198,7 +253,7 @@ public:
const DebugLoc &DL, unsigned DstReg,
ArrayRef<MachineOperand> Cond, unsigned TrueReg,
unsigned FalseReg) const override;
- void getNoopForMachoTarget(MCInst &NopInst) const override;
+ void getNoop(MCInst &NopInst) const override;
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
@@ -218,8 +273,8 @@ public:
/// \param Pattern - combiner pattern
bool isThroughputPattern(MachineCombinerPattern Pattern) const override;
/// Return true when there is potentially a faster code sequence
- /// for an instruction chain ending in <Root>. All potential patterns are
- /// listed in the <Patterns> array.
+ /// for an instruction chain ending in ``Root``. All potential patterns are
+ /// listed in the ``Patterns`` array.
bool getMachineCombinerPatterns(MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns)
const override;
@@ -244,8 +299,36 @@ public:
getSerializableDirectMachineOperandTargetFlags() const override;
ArrayRef<std::pair<unsigned, const char *>>
getSerializableBitmaskMachineOperandTargetFlags() const override;
-
+ ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
+ getSerializableMachineMemOperandTargetFlags() const override;
+
+ bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override;
+ unsigned getOutliningBenefit(size_t SequenceSize, size_t Occurrences,
+ bool CanBeTailCall) const override;
+ AArch64GenInstrInfo::MachineOutlinerInstrType
+ getOutliningType(MachineInstr &MI) const override;
+ void insertOutlinerEpilogue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool IsTailCall) const override;
+ void insertOutlinerPrologue(MachineBasicBlock &MBB,
+ MachineFunction &MF,
+ bool isTailCall) const override;
+ MachineBasicBlock::iterator
+ insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &It,
+ MachineFunction &MF,
+ bool IsTailCall) const override;
+ /// Returns true if the instruction has a shift by immediate that can be
+ /// executed in one cycle less.
+ bool isFalkorShiftExtFast(const MachineInstr &MI) const;
private:
+
+ /// \brief Sets the offsets on outlined instructions in \p MBB which use SP
+ /// so that they will be valid post-outlining.
+ ///
+ /// \param MBB A \p MachineBasicBlock in an outlined function.
+ void fixupPostOutline(MachineBasicBlock &MBB) const;
+
void instantiateCondBranch(MachineBasicBlock &MBB, const DebugLoc &DL,
MachineBasicBlock *TBB,
ArrayRef<MachineOperand> Cond) const;
@@ -283,7 +366,7 @@ enum AArch64FrameOffsetStatus {
/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to
/// use an offset.
/// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be
-/// rewriten in @p MI.
+/// rewritten in @p MI.
/// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the
/// amount that is off the limit of the legal offset.
/// If set, @p OutUseUnscaledOp will contain whether @p MI should be
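
As an aside on the offset bookkeeping added to this header: getMemOpInfo() reports a Scale, Width and scaled MinOffset/MaxOffset per opcode, and callers use them roughly as in the standalone sketch below. This is illustrative only, not part of the patch; the concrete numbers in the trailing comment are the usual ones for a 64-bit LDR with an unsigned scaled immediate, stated here as an assumption.

#include <cstdint>

// Checks whether a byte offset fits a scaled unsigned-immediate load/store,
// given the Scale/MinOffset/MaxOffset values that getMemOpInfo() would report.
bool isEncodableScaledOffset(int64_t ByteOffset, unsigned Scale,
                             int64_t MinOffset, int64_t MaxOffset) {
  if (ByteOffset % Scale != 0)   // offset must be a multiple of the access size
    return false;
  int64_t Scaled = ByteOffset / Scale;
  return Scaled >= MinOffset && Scaled <= MaxOffset;
}

// Example: a 64-bit LDR with unsigned immediate uses Scale = 8 and a scaled
// range of [0, 4095], so byte offsets 0, 8, ..., 32760 are encodable.
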
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 2244baa..5049a39 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -30,17 +30,29 @@ def HasLSE : Predicate<"Subtarget->hasLSE()">,
AssemblerPredicate<"FeatureLSE", "lse">;
def HasRAS : Predicate<"Subtarget->hasRAS()">,
AssemblerPredicate<"FeatureRAS", "ras">;
+def HasRDM : Predicate<"Subtarget->hasRDM()">,
+ AssemblerPredicate<"FeatureRDM", "rdm">;
def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
def HasSPE : Predicate<"Subtarget->hasSPE()">,
AssemblerPredicate<"FeatureSPE", "spe">;
+def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
+ AssemblerPredicate<"FeatureFuseAES",
+ "fuse-aes">;
+def HasSVE : Predicate<"Subtarget->hasSVE()">,
+ AssemblerPredicate<"FeatureSVE", "sve">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def UseAlternateSExtLoadCVTF32
: Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
+def UseNegativeImmediates
+ : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates",
+ "NegativeImmediates">;
+
+
//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
//
@@ -149,7 +161,8 @@ def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
- SDCallSeqStart<[ SDTCisVT<0, i32> ]>,
+ SDCallSeqStart<[ SDTCisVT<0, i32>,
+ SDTCisVT<1, i32> ]>,
[SDNPHasChain, SDNPOutGlue]>;
def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END",
SDCallSeqEnd<[ SDTCisVT<0, i32>,
@@ -305,10 +318,13 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
//===----------------------------------------------------------------------===//
// AArch64 Instruction Predicate Definitions.
-def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">;
-def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
-def ForCodeSize : Predicate<"ForCodeSize">;
-def NotForCodeSize : Predicate<"!ForCodeSize">;
+// We could compute these on a per-module basis but doing so requires accessing
+// the Function object through the <Target>Subtarget and objections were raised
+// to that (see post-commit review comments for r301750).
+let RecomputePerFunction = 1 in {
+ def ForCodeSize : Predicate<"MF->getFunction()->optForSize()">;
+ def NotForCodeSize : Predicate<"!MF->getFunction()->optForSize()">;
+}
include "AArch64InstrFormats.td"
@@ -321,8 +337,9 @@ include "AArch64InstrFormats.td"
let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
// We set Sched to empty list because we expect these instructions to simply get
// removed in most cases.
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
- [(AArch64callseq_start timm:$amt)]>, Sched<[]>;
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+ [(AArch64callseq_start timm:$amt1, timm:$amt2)]>,
+ Sched<[]>;
def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
[(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
Sched<[]>;
@@ -424,8 +441,10 @@ def MSRpstateImm1 : MSRpstateImm0_1;
def MSRpstateImm4 : MSRpstateImm0_15;
// The thread pointer (on Linux, at least, where this has been implemented) is
-// TPIDR_EL0.
-def : Pat<(AArch64threadpointer), (MRS 0xde82)>;
+// TPIDR_EL0. Add pseudo op so we can mark it as not having any side effects.
+let hasSideEffects = 0 in
+def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
+ [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>;
// The cycle counter PMC register is PMCCNTR_EL0.
let Predicates = [HasPerfMon] in
@@ -574,31 +593,31 @@ def : Pat<(f64 fpimm:$in),
// sequences.
def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
tglobaladdr:$g1, tglobaladdr:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48),
- tglobaladdr:$g2, 32),
- tglobaladdr:$g1, 16),
- tglobaladdr:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g0, 0),
+ tglobaladdr:$g1, 16),
+ tglobaladdr:$g2, 32),
+ tglobaladdr:$g3, 48)>;
def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2,
tblockaddress:$g1, tblockaddress:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48),
- tblockaddress:$g2, 32),
- tblockaddress:$g1, 16),
- tblockaddress:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g0, 0),
+ tblockaddress:$g1, 16),
+ tblockaddress:$g2, 32),
+ tblockaddress:$g3, 48)>;
def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2,
tconstpool:$g1, tconstpool:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48),
- tconstpool:$g2, 32),
- tconstpool:$g1, 16),
- tconstpool:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g0, 0),
+ tconstpool:$g1, 16),
+ tconstpool:$g2, 32),
+ tconstpool:$g3, 48)>;
def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2,
tjumptable:$g1, tjumptable:$g0),
- (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48),
- tjumptable:$g2, 32),
- tjumptable:$g1, 16),
- tjumptable:$g0, 0)>;
+ (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g0, 0),
+ tjumptable:$g1, 16),
+ tjumptable:$g2, 32),
+ tjumptable:$g3, 48)>;
//===----------------------------------------------------------------------===//
@@ -697,10 +716,10 @@ def : InstAlias<"negs $dst, $src$shift",
defm UDIV : Div<0, "udiv", udiv>;
defm SDIV : Div<1, "sdiv", sdiv>;
-def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr $Rn, $Rm)>;
-def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr $Rn, $Rm)>;
-def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr $Rn, $Rm)>;
-def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr $Rn, $Rm)>;
+def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr GPR32:$Rn, GPR32:$Rm)>;
+def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr GPR64:$Rn, GPR64:$Rm)>;
+def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr GPR32:$Rn, GPR32:$Rm)>;
+def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr GPR64:$Rn, GPR64:$Rm)>;
// Variable shift
defm ASRV : Shift<0b10, "asr", sra>;
@@ -718,7 +737,7 @@ def : ShiftAlias<"rorv", RORVWr, GPR32>;
def : ShiftAlias<"rorv", RORVXr, GPR64>;
// Multiply-add
-let AddedComplexity = 7 in {
+let AddedComplexity = 5 in {
defm MADD : MulAccum<0, "madd", add>;
defm MSUB : MulAccum<1, "msub", sub>;
@@ -735,7 +754,7 @@ def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)),
(MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)),
(MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
-} // AddedComplexity = 7
+} // AddedComplexity = 5
let AddedComplexity = 5 in {
def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
@@ -2577,6 +2596,11 @@ def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>,
def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>,
Sched<[WriteF]>;
}
+// Similarly add aliases
+def : InstAlias<"fmov $Rd, #0.0", (FMOVWHr FPR16:$Rd, WZR), 0>,
+ Requires<[HasFullFP16]>;
+def : InstAlias<"fmov $Rd, #0.0", (FMOVWSr FPR32:$Rd, WZR), 0>;
+def : InstAlias<"fmov $Rd, #0.0", (FMOVXDr FPR64:$Rd, XZR), 0>;
//===----------------------------------------------------------------------===//
// Floating point conversion instruction.
@@ -2720,60 +2744,36 @@ defm FMOV : FPMoveImmediate<"fmov">;
defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
int_aarch64_neon_uabd>;
// Match UABDL in log2-shuffle patterns.
+def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)),
+ (zext (v8i8 V64:$opB))))),
+ (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
(v8i16 (add (sub (zext (v8i8 V64:$opA)),
(zext (v8i8 V64:$opB))),
(AArch64vashr v8i16:$src, (i32 15))))),
(UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
+def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 V128:$opA)),
+ (zext (extract_high_v16i8 V128:$opB))))),
+ (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
(v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)),
(zext (extract_high_v16i8 V128:$opB))),
(AArch64vashr v8i16:$src, (i32 15))))),
(UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
-def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
- (v4i32 (add (sub (zext (v4i16 V64:$opA)),
- (zext (v4i16 V64:$opB))),
- (AArch64vashr v4i32:$src, (i32 31))))),
+def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)),
+ (zext (v4i16 V64:$opB))))),
(UABDLv4i16_v4i32 V64:$opA, V64:$opB)>;
-def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))),
- (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)),
- (zext (extract_high_v8i16 V128:$opB))),
- (AArch64vashr v4i32:$src, (i32 31))))),
+def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 V128:$opA)),
+ (zext (extract_high_v8i16 V128:$opB))))),
(UABDLv8i16_v4i32 V128:$opA, V128:$opB)>;
-def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
- (v2i64 (add (sub (zext (v2i32 V64:$opA)),
- (zext (v2i32 V64:$opB))),
- (AArch64vashr v2i64:$src, (i32 63))))),
+def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)),
+ (zext (v2i32 V64:$opB))))),
(UABDLv2i32_v2i64 V64:$opA, V64:$opB)>;
-def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))),
- (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)),
- (zext (extract_high_v4i32 V128:$opB))),
- (AArch64vashr v2i64:$src, (i32 63))))),
+def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 V128:$opA)),
+ (zext (extract_high_v4i32 V128:$opB))))),
(UABDLv4i32_v2i64 V128:$opA, V128:$opB)>;
-defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
-def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))),
- (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))),
- (ABSv8i8 V64:$src)>;
-def : Pat<(xor (v4i16 (AArch64vashr V64:$src, (i32 15))),
- (v4i16 (add V64:$src, (AArch64vashr V64:$src, (i32 15))))),
- (ABSv4i16 V64:$src)>;
-def : Pat<(xor (v2i32 (AArch64vashr V64:$src, (i32 31))),
- (v2i32 (add V64:$src, (AArch64vashr V64:$src, (i32 31))))),
- (ABSv2i32 V64:$src)>;
-def : Pat<(xor (v16i8 (AArch64vashr V128:$src, (i32 7))),
- (v16i8 (add V128:$src, (AArch64vashr V128:$src, (i32 7))))),
- (ABSv16i8 V128:$src)>;
-def : Pat<(xor (v8i16 (AArch64vashr V128:$src, (i32 15))),
- (v8i16 (add V128:$src, (AArch64vashr V128:$src, (i32 15))))),
- (ABSv8i16 V128:$src)>;
-def : Pat<(xor (v4i32 (AArch64vashr V128:$src, (i32 31))),
- (v4i32 (add V128:$src, (AArch64vashr V128:$src, (i32 31))))),
- (ABSv4i32 V128:$src)>;
-def : Pat<(xor (v2i64 (AArch64vashr V128:$src, (i32 63))),
- (v2i64 (add V128:$src, (AArch64vashr V128:$src, (i32 63))))),
- (ABSv2i64 V128:$src)>;
-
+defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>;
defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
@@ -3284,7 +3284,7 @@ defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>
defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
-let Predicates = [HasV8_1a] in {
+let Predicates = [HasRDM] in {
defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
def : Pat<(i32 (int_aarch64_neon_sqadd
@@ -3345,7 +3345,7 @@ def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
// Advanced SIMD two scalar instructions.
//===----------------------------------------------------------------------===//
-defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_aarch64_neon_abs>;
+defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", abs>;
defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>;
defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
@@ -5029,7 +5029,7 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
0),
dsub)))>,
Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
-
+
def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
@@ -5307,6 +5307,31 @@ def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>;
def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>;
def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>;
+// Pseudo instructions for AESMCrr/AESIMCrr with a register constraint required
+// for AES fusion on some CPUs.
+let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
+def AESMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
+ Sched<[WriteV]>;
+def AESIMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
+ Sched<[WriteV]>;
+}
+
+// Only use constrained versions of AES(I)MC instructions if they are paired with
+// AESE/AESD.
+def : Pat<(v16i8 (int_aarch64_crypto_aesmc
+ (v16i8 (int_aarch64_crypto_aese (v16i8 V128:$src1),
+ (v16i8 V128:$src2))))),
+ (v16i8 (AESMCrrTied (v16i8 (AESErr (v16i8 V128:$src1),
+ (v16i8 V128:$src2)))))>,
+ Requires<[HasFuseAES]>;
+
+def : Pat<(v16i8 (int_aarch64_crypto_aesimc
+ (v16i8 (int_aarch64_crypto_aesd (v16i8 V128:$src1),
+ (v16i8 V128:$src2))))),
+ (v16i8 (AESIMCrrTied (v16i8 (AESDrr (v16i8 V128:$src1),
+ (v16i8 V128:$src2)))))>,
+ Requires<[HasFuseAES]>;
+
def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>;
def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>;
def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>;
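
A quick illustration of the WrapperLarge change earlier in this file: the patterns now materialize a 64-bit address with a MOVZ of the low half-word first and MOVKs filling the higher half-words. The standalone sketch below models that arithmetic in plain C++ (illustrative only, not part of the patch).

#include <cstdint>

// Models MOVZ/MOVK materialization in the order the new patterns use:
// MOVZ Xd, #g0 followed by MOVKs for g1, g2 and g3.
uint64_t materializeLargeAddress(uint16_t g0, uint16_t g1, uint16_t g2,
                                 uint16_t g3) {
  uint64_t V = g0;                                    // MOVZ Xd, #g0
  V = (V & ~(0xffffULL << 16)) | (uint64_t)g1 << 16;  // MOVK Xd, #g1, lsl #16
  V = (V & ~(0xffffULL << 32)) | (uint64_t)g2 << 32;  // MOVK Xd, #g2, lsl #32
  V = (V & ~(0xffffULL << 48)) | (uint64_t)g3 << 48;  // MOVK Xd, #g3, lsl #48
  return V;
}
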
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
index b514735..7e275e4 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -12,17 +12,20 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
-#include "AArch64InstructionSelector.h"
#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
@@ -30,19 +33,79 @@
#define DEBUG_TYPE "aarch64-isel"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+
using namespace llvm;
#ifndef LLVM_BUILD_GLOBAL_ISEL
#error "You shouldn't build this"
#endif
+namespace {
+
+#define GET_GLOBALISEL_PREDICATE_BITSET
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATE_BITSET
+
+class AArch64InstructionSelector : public InstructionSelector {
+public:
+ AArch64InstructionSelector(const AArch64TargetMachine &TM,
+ const AArch64Subtarget &STI,
+ const AArch64RegisterBankInfo &RBI);
+
+ bool select(MachineInstr &I) const override;
+
+private:
+ /// tblgen-erated 'select' implementation, used as the initial selector for
+ /// the patterns that don't require complex C++.
+ bool selectImpl(MachineInstr &I) const;
+
+ bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
+ MachineRegisterInfo &MRI) const;
+ bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
+ MachineRegisterInfo &MRI) const;
+
+ bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
+ MachineRegisterInfo &MRI) const;
+
+ ComplexRendererFn selectArithImmed(MachineOperand &Root) const;
+
+ const AArch64TargetMachine &TM;
+ const AArch64Subtarget &STI;
+ const AArch64InstrInfo &TII;
+ const AArch64RegisterInfo &TRI;
+ const AArch64RegisterBankInfo &RBI;
+
+#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_DECL
+
+// We declare the temporaries used by selectImpl() in the class to minimize the
+// cost of constructing placeholder values.
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+
+} // end anonymous namespace
+
+#define GET_GLOBALISEL_IMPL
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
AArch64InstructionSelector::AArch64InstructionSelector(
const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
const AArch64RegisterBankInfo &RBI)
- : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI) {}
+ : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
+ TRI(*STI.getRegisterInfo()), RBI(RBI),
+#define GET_GLOBALISEL_PREDICATES_INIT
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_PREDICATES_INIT
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
@@ -119,71 +182,39 @@ static bool unsupportedBinOp(const MachineInstr &I,
}
/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
-/// (such as G_OR or G_ADD), appropriate for the register bank \p RegBankID
+/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
/// and of size \p OpSize.
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
unsigned OpSize) {
switch (RegBankID) {
case AArch64::GPRRegBankID:
- if (OpSize <= 32) {
- assert((OpSize == 32 || (GenericOpc != TargetOpcode::G_SDIV &&
- GenericOpc != TargetOpcode::G_UDIV &&
- GenericOpc != TargetOpcode::G_LSHR &&
- GenericOpc != TargetOpcode::G_ASHR)) &&
- "operation should have been legalized before now");
-
+ if (OpSize == 32) {
switch (GenericOpc) {
- case TargetOpcode::G_OR:
- return AArch64::ORRWrr;
- case TargetOpcode::G_XOR:
- return AArch64::EORWrr;
- case TargetOpcode::G_AND:
- return AArch64::ANDWrr;
- case TargetOpcode::G_ADD:
- assert(OpSize != 32 && "s32 G_ADD should have been selected");
- return AArch64::ADDWrr;
- case TargetOpcode::G_SUB:
- return AArch64::SUBWrr;
case TargetOpcode::G_SHL:
return AArch64::LSLVWr;
case TargetOpcode::G_LSHR:
return AArch64::LSRVWr;
case TargetOpcode::G_ASHR:
return AArch64::ASRVWr;
- case TargetOpcode::G_SDIV:
- return AArch64::SDIVWr;
- case TargetOpcode::G_UDIV:
- return AArch64::UDIVWr;
default:
return GenericOpc;
}
} else if (OpSize == 64) {
switch (GenericOpc) {
- case TargetOpcode::G_OR:
- return AArch64::ORRXrr;
- case TargetOpcode::G_XOR:
- return AArch64::EORXrr;
- case TargetOpcode::G_AND:
- return AArch64::ANDXrr;
case TargetOpcode::G_GEP:
return AArch64::ADDXrr;
- case TargetOpcode::G_SUB:
- return AArch64::SUBXrr;
case TargetOpcode::G_SHL:
return AArch64::LSLVXr;
case TargetOpcode::G_LSHR:
return AArch64::LSRVXr;
case TargetOpcode::G_ASHR:
return AArch64::ASRVXr;
- case TargetOpcode::G_SDIV:
- return AArch64::SDIVXr;
- case TargetOpcode::G_UDIV:
- return AArch64::UDIVXr;
default:
return GenericOpc;
}
}
+ break;
case AArch64::FPRRegBankID:
switch (OpSize) {
case 32:
@@ -215,7 +246,8 @@ static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
return GenericOpc;
}
}
- };
+ break;
+ }
return GenericOpc;
}
@@ -239,6 +271,7 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
case 64:
return isStore ? AArch64::STRXui : AArch64::LDRXui;
}
+ break;
case AArch64::FPRRegBankID:
switch (OpSize) {
case 8:
@@ -250,7 +283,8 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
case 64:
return isStore ? AArch64::STRDui : AArch64::LDRDui;
}
- };
+ break;
+ }
return GenericOpc;
}
@@ -473,6 +507,82 @@ static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
}
}
+bool AArch64InstructionSelector::selectCompareBranch(
+ MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+
+ const unsigned CondReg = I.getOperand(0).getReg();
+ MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
+ MachineInstr *CCMI = MRI.getVRegDef(CondReg);
+ if (CCMI->getOpcode() != TargetOpcode::G_ICMP)
+ return false;
+
+ unsigned LHS = CCMI->getOperand(2).getReg();
+ unsigned RHS = CCMI->getOperand(3).getReg();
+ if (!getConstantVRegVal(RHS, MRI))
+ std::swap(RHS, LHS);
+
+ const auto RHSImm = getConstantVRegVal(RHS, MRI);
+ if (!RHSImm || *RHSImm != 0)
+ return false;
+
+ const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI);
+ if (RB.getID() != AArch64::GPRRegBankID)
+ return false;
+
+ const auto Pred = (CmpInst::Predicate)CCMI->getOperand(1).getPredicate();
+ if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ)
+ return false;
+
+ const unsigned CmpWidth = MRI.getType(LHS).getSizeInBits();
+ unsigned CBOpc = 0;
+ if (CmpWidth <= 32)
+ CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZW : AArch64::CBNZW);
+ else if (CmpWidth == 64)
+ CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZX : AArch64::CBNZX);
+ else
+ return false;
+
+ auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc))
+ .addUse(LHS)
+ .addMBB(DestMBB);
+
+ constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
+
+bool AArch64InstructionSelector::selectVaStartAAPCS(
+ MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+ return false;
+}
+
+bool AArch64InstructionSelector::selectVaStartDarwin(
+ MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+ AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ unsigned ListReg = I.getOperand(0).getReg();
+
+ unsigned ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+
+ auto MIB =
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
+ .addDef(ArgsAddrReg)
+ .addFrameIndex(FuncInfo->getVarArgsStackIndex())
+ .addImm(0)
+ .addImm(0);
+
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+
+ MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
+ .addUse(ArgsAddrReg)
+ .addUse(ListReg)
+ .addImm(0)
+ .addMemOperand(*I.memoperands_begin());
+
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
+
bool AArch64InstructionSelector::select(MachineInstr &I) const {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -549,6 +659,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
const unsigned CondReg = I.getOperand(0).getReg();
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
+ if (selectCompareBranch(I, MF, MRI))
+ return true;
+
auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW))
.addUse(CondReg)
.addImm(/*bit offset=*/0)
@@ -558,6 +671,11 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
}
+ case TargetOpcode::G_BRINDIRECT: {
+ I.setDesc(TII.get(AArch64::BR));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
case TargetOpcode::G_FCONSTANT:
case TargetOpcode::G_CONSTANT: {
const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
@@ -629,9 +747,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
// FIXME: Is going through int64_t always correct?
ImmOp.ChangeToImmediate(
ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
- } else {
+ } else if (I.getOperand(1).isCImm()) {
uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
I.getOperand(1).ChangeToImmediate(Val);
+ } else if (I.getOperand(1).isImm()) {
+ uint64_t Val = I.getOperand(1).getImm();
+ I.getOperand(1).ChangeToImmediate(Val);
}
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
@@ -686,10 +807,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
return false;
}
-#ifndef NDEBUG
- // Sanity-check the pointer register.
+ auto &MemOp = **I.memoperands_begin();
+ if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+ DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+ return false;
+ }
+
const unsigned PtrReg = I.getOperand(1).getReg();
+#ifndef NDEBUG
const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
+ // Sanity-check the pointer register.
assert(PtrRB.getID() == AArch64::GPRRegBankID &&
"Load/Store pointer operand isn't a GPR");
assert(MRI.getType(PtrReg).isPointer() &&
@@ -706,11 +833,46 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
I.setDesc(TII.get(NewOpc));
- I.addOperand(MachineOperand::CreateImm(0));
+ uint64_t Offset = 0;
+ auto *PtrMI = MRI.getVRegDef(PtrReg);
+
+ // Try to fold a GEP into our unsigned immediate addressing mode.
+ if (PtrMI->getOpcode() == TargetOpcode::G_GEP) {
+ if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) {
+ int64_t Imm = *COff;
+ const unsigned Size = MemTy.getSizeInBits() / 8;
+ const unsigned Scale = Log2_32(Size);
+ if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) {
+ unsigned Ptr2Reg = PtrMI->getOperand(1).getReg();
+ I.getOperand(1).setReg(Ptr2Reg);
+ PtrMI = MRI.getVRegDef(Ptr2Reg);
+ Offset = Imm / Size;
+ }
+ }
+ }
+
+ // If we haven't folded anything into our addressing mode yet, try to fold
+ // a frame index into the base+offset.
+ if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX)
+ I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex());
+
+ I.addOperand(MachineOperand::CreateImm(Offset));
+
+ // If we're storing a 0, use WZR/XZR.
+ if (auto CVal = getConstantVRegVal(ValReg, MRI)) {
+ if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) {
+ if (I.getOpcode() == AArch64::STRWui)
+ I.getOperand(0).setReg(AArch64::WZR);
+ else if (I.getOpcode() == AArch64::STRXui)
+ I.getOperand(0).setReg(AArch64::XZR);
+ }
+ }
+
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
- case TargetOpcode::G_MUL: {
+ case TargetOpcode::G_SMULH:
+ case TargetOpcode::G_UMULH: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
return false;
@@ -719,48 +881,33 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID) {
- DEBUG(dbgs() << "G_MUL on bank: " << RB << ", expected: GPR\n");
+ DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
return false;
}
- unsigned ZeroReg;
- unsigned NewOpc;
- if (Ty.isScalar() && Ty.getSizeInBits() <= 32) {
- NewOpc = AArch64::MADDWrrr;
- ZeroReg = AArch64::WZR;
- } else if (Ty == LLT::scalar(64)) {
- NewOpc = AArch64::MADDXrrr;
- ZeroReg = AArch64::XZR;
- } else {
- DEBUG(dbgs() << "G_MUL has type: " << Ty << ", expected: "
- << LLT::scalar(32) << " or " << LLT::scalar(64) << '\n');
+ if (Ty != LLT::scalar(64)) {
+ DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
+ << ", expected: " << LLT::scalar(64) << '\n');
return false;
}
+ unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
+ : AArch64::UMULHrr;
I.setDesc(TII.get(NewOpc));
- I.addOperand(MachineOperand::CreateReg(ZeroReg, /*isDef=*/false));
-
// Now that we selected an opcode, we need to constrain the register
// operands to use appropriate classes.
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
-
case TargetOpcode::G_FADD:
case TargetOpcode::G_FSUB:
case TargetOpcode::G_FMUL:
case TargetOpcode::G_FDIV:
case TargetOpcode::G_OR:
- case TargetOpcode::G_XOR:
- case TargetOpcode::G_AND:
case TargetOpcode::G_SHL:
case TargetOpcode::G_LSHR:
case TargetOpcode::G_ASHR:
- case TargetOpcode::G_SDIV:
- case TargetOpcode::G_UDIV:
- case TargetOpcode::G_ADD:
- case TargetOpcode::G_SUB:
case TargetOpcode::G_GEP: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
@@ -783,6 +930,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
+ case TargetOpcode::G_PTR_MASK: {
+ uint64_t Align = I.getOperand(2).getImm();
+ if (Align >= 64 || Align == 0)
+ return false;
+
+ uint64_t Mask = ~((1ULL << Align) - 1);
+ I.setDesc(TII.get(AArch64::ANDXri));
+ I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64));
+
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
case TargetOpcode::G_PTRTOINT:
case TargetOpcode::G_TRUNC: {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
@@ -795,7 +953,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
if (DstRB.getID() != SrcRB.getID()) {
- DEBUG(dbgs() << "G_TRUNC input/output on different banks\n");
+ DEBUG(dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
return false;
}
@@ -812,16 +970,21 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
return false;
}
if (DstRC == SrcRC) {
// Nothing to be done
+ } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
+ SrcTy == LLT::scalar(64)) {
+ llvm_unreachable("TableGen can import this case");
+ return false;
} else if (DstRC == &AArch64::GPR32RegClass &&
SrcRC == &AArch64::GPR64RegClass) {
I.getOperand(1).setSubReg(AArch64::sub_32);
} else {
+ DEBUG(dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
return false;
}
@@ -1026,7 +1189,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
if (Ty == LLT::scalar(32)) {
CSelOpc = AArch64::CSELWr;
- } else if (Ty == LLT::scalar(64)) {
+ } else if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64)) {
CSelOpc = AArch64::CSELXr;
} else {
return false;
@@ -1134,7 +1297,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
.addDef(Def1Reg)
.addUse(AArch64::WZR)
.addUse(AArch64::WZR)
- .addImm(CC1);
+ .addImm(getInvertedCondCode(CC1));
if (CC2 != AArch64CC::AL) {
unsigned Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
@@ -1143,7 +1306,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
.addDef(Def2Reg)
.addUse(AArch64::WZR)
.addUse(AArch64::WZR)
- .addImm(CC2);
+ .addImm(getInvertedCondCode(CC2));
MachineInstr &OrMI =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr))
.addDef(DefReg)
@@ -1159,7 +1322,67 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
I.eraseFromParent();
return true;
}
+ case TargetOpcode::G_VASTART:
+ return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
+ : selectVaStartAAPCS(I, MF, MRI);
+ case TargetOpcode::G_IMPLICIT_DEF:
+ I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
+ return true;
}
return false;
}
+
+/// SelectArithImmed - Select an immediate value that can be represented as
+/// a 12-bit value shifted left by either 0 or 12. If so, return a renderer
+/// that adds the 12-bit value and the shifter operand to the instruction;
+/// otherwise return nullptr.
+InstructionSelector::ComplexRendererFn
+AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
+ MachineInstr &MI = *Root.getParent();
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // This function is called from the addsub_shifted_imm ComplexPattern,
+ // which lists [imm] as the list of opcode it's interested in, however
+ // we still need to check whether the operand is actually an immediate
+ // here because the ComplexPattern opcode list is only used in
+ // root-level opcode matching.
+ uint64_t Immed;
+ if (Root.isImm())
+ Immed = Root.getImm();
+ else if (Root.isCImm())
+ Immed = Root.getCImm()->getZExtValue();
+ else if (Root.isReg()) {
+ MachineInstr *Def = MRI.getVRegDef(Root.getReg());
+ if (Def->getOpcode() != TargetOpcode::G_CONSTANT)
+ return nullptr;
+ MachineOperand &Op1 = Def->getOperand(1);
+ if (!Op1.isCImm() || Op1.getCImm()->getBitWidth() > 64)
+ return nullptr;
+ Immed = Op1.getCImm()->getZExtValue();
+ } else
+ return nullptr;
+
+ unsigned ShiftAmt;
+
+ if (Immed >> 12 == 0) {
+ ShiftAmt = 0;
+ } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
+ ShiftAmt = 12;
+ Immed = Immed >> 12;
+ } else
+ return nullptr;
+
+ unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
+ return [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed).addImm(ShVal); };
+}
+
+namespace llvm {
+InstructionSelector *
+createAArch64InstructionSelector(const AArch64TargetMachine &TM,
+ AArch64Subtarget &Subtarget,
+ AArch64RegisterBankInfo &RBI) {
+ return new AArch64InstructionSelector(TM, Subtarget, RBI);
+}
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h b/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h
deleted file mode 100644
index 2c6e5a9..0000000
--- a/contrib/llvm/lib/Target/AArch64/AArch64InstructionSelector.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//===- AArch64InstructionSelector --------------------------------*- C++ -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file declares the targeting of the InstructionSelector class for
-/// AArch64.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
-#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
-
-#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
-
-namespace llvm {
-
-class AArch64InstrInfo;
-class AArch64RegisterBankInfo;
-class AArch64RegisterInfo;
-class AArch64Subtarget;
-class AArch64TargetMachine;
-
-class AArch64InstructionSelector : public InstructionSelector {
-public:
- AArch64InstructionSelector(const AArch64TargetMachine &TM,
- const AArch64Subtarget &STI,
- const AArch64RegisterBankInfo &RBI);
-
- bool select(MachineInstr &I) const override;
-
-private:
- /// tblgen-erated 'select' implementation, used as the initial selector for
- /// the patterns that don't require complex C++.
- bool selectImpl(MachineInstr &I) const;
-
- const AArch64TargetMachine &TM;
- const AArch64Subtarget &STI;
- const AArch64InstrInfo &TII;
- const AArch64RegisterInfo &TRI;
- const AArch64RegisterBankInfo &RBI;
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 83f276a..ffb2783 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -13,9 +13,12 @@
//===----------------------------------------------------------------------===//
#include "AArch64LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/Type.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
#include "llvm/Target/TargetOpcodes.h"
using namespace llvm;
@@ -36,11 +39,17 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
const LLT v4s32 = LLT::vector(4, 32);
const LLT v2s64 = LLT::vector(2, 64);
- for (auto BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) {
+ for (auto Ty : {p0, s1, s8, s16, s32, s64})
+ setAction({G_IMPLICIT_DEF, Ty}, Legal);
+
+ for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL}) {
// These operations naturally get the right answer when used on
// GPR32, even if the actual type is narrower.
- for (auto Ty : {s1, s8, s16, s32, s64, v2s32, v4s32, v2s64})
+ for (auto Ty : {s32, s64, v2s32, v4s32, v2s64})
setAction({BinOp, Ty}, Legal);
+
+ for (auto Ty : {s1, s8, s16})
+ setAction({BinOp, Ty}, WidenScalar);
}
setAction({G_GEP, p0}, Legal);
@@ -49,7 +58,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
for (auto Ty : {s1, s8, s16, s32})
setAction({G_GEP, 1, Ty}, WidenScalar);
- for (auto BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) {
+ setAction({G_PTR_MASK, p0}, Legal);
+
+ for (unsigned BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) {
for (auto Ty : {s32, s64})
setAction({BinOp, Ty}, Legal);
@@ -57,25 +68,47 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({BinOp, Ty}, WidenScalar);
}
- for (auto BinOp : { G_SREM, G_UREM })
+ for (unsigned BinOp : {G_SREM, G_UREM})
for (auto Ty : { s1, s8, s16, s32, s64 })
setAction({BinOp, Ty}, Lower);
- for (auto Op : { G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULO, G_UMULO }) {
+ for (unsigned Op : {G_SMULO, G_UMULO})
+ setAction({Op, s64}, Lower);
+
+ for (unsigned Op : {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULH, G_UMULH}) {
for (auto Ty : { s32, s64 })
setAction({Op, Ty}, Legal);
setAction({Op, 1, s1}, Legal);
}
- for (auto BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
+ for (unsigned BinOp : {G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV})
for (auto Ty : {s32, s64})
setAction({BinOp, Ty}, Legal);
- setAction({G_FREM, s32}, Libcall);
- setAction({G_FREM, s64}, Libcall);
+ for (unsigned BinOp : {G_FREM, G_FPOW}) {
+ setAction({BinOp, s32}, Libcall);
+ setAction({BinOp, s64}, Libcall);
+ }
+
+ for (auto Ty : {s32, s64, p0}) {
+ setAction({G_INSERT, Ty}, Legal);
+ setAction({G_INSERT, 1, Ty}, Legal);
+ }
+ for (auto Ty : {s1, s8, s16}) {
+ setAction({G_INSERT, Ty}, WidenScalar);
+ setAction({G_INSERT, 1, Ty}, Legal);
+ // FIXME: Can't widen the sources because that violates the constraints on
+ // G_INSERT (It seems entirely reasonable that inputs shouldn't overlap).
+ }
+
+ for (auto Ty : {s1, s8, s16, s32, s64, p0})
+ setAction({G_EXTRACT, Ty}, Legal);
+
+ for (auto Ty : {s32, s64})
+ setAction({G_EXTRACT, 1, Ty}, Legal);
- for (auto MemOp : {G_LOAD, G_STORE}) {
+ for (unsigned MemOp : {G_LOAD, G_STORE}) {
for (auto Ty : {s8, s16, s32, s64, p0, v2s32})
setAction({MemOp, Ty}, Legal);
@@ -141,12 +174,18 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({G_TRUNC, 1, Ty}, Legal);
// Conversions
- for (auto Ty : { s1, s8, s16, s32, s64 }) {
+ for (auto Ty : { s32, s64 }) {
setAction({G_FPTOSI, 0, Ty}, Legal);
setAction({G_FPTOUI, 0, Ty}, Legal);
setAction({G_SITOFP, 1, Ty}, Legal);
setAction({G_UITOFP, 1, Ty}, Legal);
}
+ for (auto Ty : { s1, s8, s16 }) {
+ setAction({G_FPTOSI, 0, Ty}, WidenScalar);
+ setAction({G_FPTOUI, 0, Ty}, WidenScalar);
+ setAction({G_SITOFP, 1, Ty}, WidenScalar);
+ setAction({G_UITOFP, 1, Ty}, WidenScalar);
+ }
for (auto Ty : { s32, s64 }) {
setAction({G_FPTOSI, 1, Ty}, Legal);
@@ -158,9 +197,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
// Control-flow
for (auto Ty : {s1, s8, s16, s32})
setAction({G_BRCOND, Ty}, Legal);
+ setAction({G_BRINDIRECT, p0}, Legal);
// Select
- for (auto Ty : {s1, s8, s16, s32, s64})
+ for (auto Ty : {s1, s8, s16})
+ setAction({G_SELECT, Ty}, WidenScalar);
+
+ for (auto Ty : {s32, s64, p0})
setAction({G_SELECT, Ty}, Legal);
setAction({G_SELECT, 1, s1}, Legal);
@@ -200,5 +243,81 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
setAction({G_BITCAST, 1, LLT::vector(32/EltSize, EltSize)}, Legal);
}
+ setAction({G_VASTART, p0}, Legal);
+
+ // va_list must be a pointer, but most sized types are pretty easy to handle
+ // as the destination.
+ setAction({G_VAARG, 1, p0}, Legal);
+
+ for (auto Ty : {s8, s16, s32, s64, p0})
+ setAction({G_VAARG, Ty}, Custom);
+
computeTables();
}
+
+bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ switch (MI.getOpcode()) {
+ default:
+ // No idea what to do.
+ return false;
+ case TargetOpcode::G_VAARG:
+ return legalizeVaArg(MI, MRI, MIRBuilder);
+ }
+
+ llvm_unreachable("expected switch to return");
+}
+
+bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const {
+ MIRBuilder.setInstr(MI);
+ MachineFunction &MF = MIRBuilder.getMF();
+ unsigned Align = MI.getOperand(2).getImm();
+ unsigned Dst = MI.getOperand(0).getReg();
+ unsigned ListPtr = MI.getOperand(1).getReg();
+
+ LLT PtrTy = MRI.getType(ListPtr);
+ LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
+
+ const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
+ unsigned List = MRI.createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildLoad(
+ List, ListPtr,
+ *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
+ PtrSize, /* Align = */ PtrSize));
+
+ unsigned DstPtr;
+ if (Align > PtrSize) {
+ // Realign the list to the actual required alignment.
+ auto AlignMinus1 = MIRBuilder.buildConstant(IntPtrTy, Align - 1);
+
+ unsigned ListTmp = MRI.createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildGEP(ListTmp, List, AlignMinus1->getOperand(0).getReg());
+
+ DstPtr = MRI.createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align));
+ } else
+ DstPtr = List;
+
+ uint64_t ValSize = MRI.getType(Dst).getSizeInBits() / 8;
+ MIRBuilder.buildLoad(
+ Dst, DstPtr,
+ *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
+ ValSize, std::max(Align, PtrSize)));
+
+ unsigned SizeReg = MRI.createGenericVirtualRegister(IntPtrTy);
+ MIRBuilder.buildConstant(SizeReg, alignTo(ValSize, PtrSize));
+
+ unsigned NewList = MRI.createGenericVirtualRegister(PtrTy);
+ MIRBuilder.buildGEP(NewList, DstPtr, SizeReg);
+
+ MIRBuilder.buildStore(
+ NewList, ListPtr,
+ *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore,
+ PtrSize, /* Align = */ PtrSize));
+
+ MI.eraseFromParent();
+ return true;
+}
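
The legalizeVaArg() code above bumps and masks the va_list pointer when the requested alignment is larger than a pointer slot, then advances the list past the value just read. Below is a standalone restatement of that pointer arithmetic (illustrative only, not part of the patch).

#include <cstdint>

// Mirrors buildGEP(List, Align - 1) followed by buildPtrMask(..., Log2(Align)).
uint64_t realignVaListPointer(uint64_t List, uint64_t Align) {
  return (List + Align - 1) & ~(Align - 1);
}

// The value stored back into the va_list is the (re)aligned pointer plus the
// argument size rounded up to the pointer size, matching the buildConstant/
// buildGEP/buildStore sequence above.
uint64_t advanceVaList(uint64_t DstPtr, uint64_t ValSize, uint64_t PtrSize) {
  uint64_t Step = (ValSize + PtrSize - 1) / PtrSize * PtrSize; // alignTo
  return DstPtr + Step;
}
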
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
index feacbef..42d4ac1 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
@@ -25,6 +25,13 @@ class LLVMContext;
class AArch64LegalizerInfo : public LegalizerInfo {
public:
AArch64LegalizerInfo();
+
+ bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const override;
+
+private:
+ bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder) const;
};
} // End llvm namespace.
#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 8e312dc..9a7f45b 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -18,17 +18,27 @@
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+
using namespace llvm;
#define DEBUG_TYPE "aarch64-ldst-opt"
@@ -58,15 +68,15 @@ typedef struct LdStPairFlags {
// If a matching instruction is found, MergeForward is set to true if the
// merge is to remove the first instruction and replace the second with
// a pair-wise insn, and false if the reverse is true.
- bool MergeForward;
+ bool MergeForward = false;
// SExtIdx gives the index of the result of the load pair that must be
// extended. The value of SExtIdx assumes that the paired load produces the
// value in this order: (I, returned iterator), i.e., -1 means no value has
// to be extended, 0 means I, and 1 means the returned iterator.
- int SExtIdx;
+ int SExtIdx = -1;
- LdStPairFlags() : MergeForward(false), SExtIdx(-1) {}
+ LdStPairFlags() = default;
void setMergeForward(bool V = true) { MergeForward = V; }
bool getMergeForward() const { return MergeForward; }
@@ -78,10 +88,12 @@ typedef struct LdStPairFlags {
struct AArch64LoadStoreOpt : public MachineFunctionPass {
static char ID;
+
AArch64LoadStoreOpt() : MachineFunctionPass(ID) {
initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
}
+ AliasAnalysis *AA;
const AArch64InstrInfo *TII;
const TargetRegisterInfo *TRI;
const AArch64Subtarget *Subtarget;
@@ -89,6 +101,11 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Track which registers have been modified and used.
BitVector ModifiedRegs, UsedRegs;
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
// Scan the instructions looking for a load/store that can be combined
// with the current instruction into a load/store pair.
// Return the matching instruction if one is found, else MBB->end().
@@ -162,8 +179,10 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
};
+
char AArch64LoadStoreOpt::ID = 0;
-} // namespace
+
+} // end anonymous namespace
INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
AARCH64_LOAD_STORE_OPT_NAME, false, false)
@@ -246,7 +265,7 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
default:
if (IsValidLdStrOpc)
*IsValidLdStrOpc = false;
- return UINT_MAX;
+ return std::numeric_limits<unsigned>::max();
case AArch64::STRDui:
case AArch64::STURDi:
case AArch64::STRQui:
@@ -369,6 +388,10 @@ static unsigned isMatchingStore(MachineInstr &LoadInst,
}
static unsigned getPreIndexedOpcode(unsigned Opc) {
+ // FIXME: We don't currently support creating pre-indexed loads/stores when
+ // the load or store is the unscaled version. If we decide to perform such an
+ // optimization in the future the cases for the unscaled loads/stores will
+ // need to be added here.
switch (Opc) {
default:
llvm_unreachable("Opcode has no pre-indexed equivalent!");
@@ -432,32 +455,42 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
default:
llvm_unreachable("Opcode has no post-indexed wise equivalent!");
case AArch64::STRSui:
+ case AArch64::STURSi:
return AArch64::STRSpost;
case AArch64::STRDui:
+ case AArch64::STURDi:
return AArch64::STRDpost;
case AArch64::STRQui:
+ case AArch64::STURQi:
return AArch64::STRQpost;
case AArch64::STRBBui:
return AArch64::STRBBpost;
case AArch64::STRHHui:
return AArch64::STRHHpost;
case AArch64::STRWui:
+ case AArch64::STURWi:
return AArch64::STRWpost;
case AArch64::STRXui:
+ case AArch64::STURXi:
return AArch64::STRXpost;
case AArch64::LDRSui:
+ case AArch64::LDURSi:
return AArch64::LDRSpost;
case AArch64::LDRDui:
+ case AArch64::LDURDi:
return AArch64::LDRDpost;
case AArch64::LDRQui:
+ case AArch64::LDURQi:
return AArch64::LDRQpost;
case AArch64::LDRBBui:
return AArch64::LDRBBpost;
case AArch64::LDRHHui:
return AArch64::LDRHHpost;
case AArch64::LDRWui:
+ case AArch64::LDURWi:
return AArch64::LDRWpost;
case AArch64::LDRXui:
+ case AArch64::LDURXi:
return AArch64::LDRXpost;
case AArch64::LDRSWui:
return AArch64::LDRSWpost;
@@ -595,7 +628,7 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
MachineInstrBuilder MIB;
MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
.addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
- .addOperand(BaseRegOp)
+ .add(BaseRegOp)
.addImm(OffsetImm)
.setMemRefs(I->mergeMemRefsWith(*MergeMI));
(void)MIB;
@@ -709,9 +742,9 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
}
}
MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc)))
- .addOperand(RegOp0)
- .addOperand(RegOp1)
- .addOperand(BaseRegOp)
+ .add(RegOp0)
+ .add(RegOp1)
+ .add(BaseRegOp)
.addImm(OffsetImm)
.setMemRefs(I->mergeMemRefsWith(*Paired));
@@ -776,6 +809,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
int LoadSize = getMemScale(*LoadI);
int StoreSize = getMemScale(*StoreI);
unsigned LdRt = getLdStRegOp(*LoadI).getReg();
+ const MachineOperand &StMO = getLdStRegOp(*StoreI);
unsigned StRt = getLdStRegOp(*StoreI).getReg();
bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
@@ -788,7 +822,13 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
// Remove the load, if the destination register of the loads is the same
// register for stored value.
if (StRt == LdRt && LoadSize == 8) {
- StoreI->clearRegisterKills(StRt, TRI);
+ for (MachineInstr &MI : make_range(StoreI->getIterator(),
+ LoadI->getIterator())) {
+ if (MI.killsRegister(StRt, TRI)) {
+ MI.clearRegisterKills(StRt, TRI);
+ break;
+ }
+ }
DEBUG(dbgs() << "Remove load instruction:\n ");
DEBUG(LoadI->print(dbgs()));
DEBUG(dbgs() << "\n");
@@ -800,7 +840,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
.addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
- .addReg(StRt)
+ .add(StMO)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else {
// FIXME: Currently we disable this transformation in big-endian targets as
@@ -841,14 +881,14 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
DestReg)
- .addReg(StRt)
+ .add(StMO)
.addImm(AndMaskEncoded);
} else {
BitExtMI =
BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
DestReg)
- .addReg(StRt)
+ .add(StMO)
.addImm(Immr)
.addImm(Imms);
}
@@ -857,7 +897,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
// Clear kill flags between store and load.
for (MachineInstr &MI : make_range(StoreI->getIterator(),
BitExtMI->getIterator()))
- MI.clearRegisterKills(StRt, TRI);
+ if (MI.killsRegister(StRt, TRI)) {
+ MI.clearRegisterKills(StRt, TRI);
+ break;
+ }
DEBUG(dbgs() << "Promoting load by replacing :\n ");
DEBUG(StoreI->print(dbgs()));
@@ -923,7 +966,7 @@ static int alignTo(int Num, int PowOf2) {
}
static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
- const AArch64InstrInfo *TII) {
+ AliasAnalysis *AA) {
// One of the instructions must modify memory.
if (!MIa.mayStore() && !MIb.mayStore())
return false;
@@ -932,14 +975,14 @@ static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
return false;
- return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
+ return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
}
static bool mayAlias(MachineInstr &MIa,
SmallVectorImpl<MachineInstr *> &MemInsns,
- const AArch64InstrInfo *TII) {
+ AliasAnalysis *AA) {
for (MachineInstr *MIb : MemInsns)
- if (mayAlias(MIa, *MIb, TII))
+ if (mayAlias(MIa, *MIb, AA))
return true;
return false;
@@ -997,7 +1040,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
return false;
// If we encounter a store aliased with the load, return early.
- if (MI.mayStore() && mayAlias(LoadMI, MI, TII))
+ if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
return false;
} while (MBBI != B && Count < Limit);
return false;
@@ -1167,7 +1210,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// first.
if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
!(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
- !mayAlias(MI, MemInsns, TII)) {
+ !mayAlias(MI, MemInsns, AA)) {
Flags.setMergeForward(false);
return MBBI;
}
@@ -1178,7 +1221,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// into the second.
if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
!(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
- !mayAlias(FirstMI, MemInsns, TII)) {
+ !mayAlias(FirstMI, MemInsns, AA)) {
Flags.setMergeForward(true);
return MBBI;
}
@@ -1233,19 +1276,19 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
if (!isPairedLdSt(*I)) {
// Non-paired instruction.
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(getLdStRegOp(*Update))
- .addOperand(getLdStRegOp(*I))
- .addOperand(getLdStBaseOp(*I))
+ .add(getLdStRegOp(*Update))
+ .add(getLdStRegOp(*I))
+ .add(getLdStBaseOp(*I))
.addImm(Value)
.setMemRefs(I->memoperands_begin(), I->memoperands_end());
} else {
// Paired instruction.
int Scale = getMemScale(*I);
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
- .addOperand(getLdStRegOp(*Update))
- .addOperand(getLdStRegOp(*I, 0))
- .addOperand(getLdStRegOp(*I, 1))
- .addOperand(getLdStBaseOp(*I))
+ .add(getLdStRegOp(*Update))
+ .add(getLdStRegOp(*I, 0))
+ .add(getLdStRegOp(*I, 1))
+ .add(getLdStBaseOp(*I))
.addImm(Value / Scale)
.setMemRefs(I->memoperands_begin(), I->memoperands_end());
}
@@ -1545,7 +1588,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
case AArch64::LDURBBi:
case AArch64::LDURHHi:
case AArch64::LDURWi:
- case AArch64::LDURXi: {
+ case AArch64::LDURXi:
if (tryToPromoteLoadFromStore(MBBI)) {
Modified = true;
break;
@@ -1553,7 +1596,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++MBBI;
break;
}
- }
}
// 2) Merge adjacent zero stores into a wider store.
// e.g.,
@@ -1666,8 +1708,9 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
++NumPostFolded;
break;
}
- // Don't know how to handle pre/post-index versions, so move to the next
- // instruction.
+
+ // Don't know how to handle unscaled pre/post-index versions below, so
+ // move to the next instruction.
if (TII->isUnscaledLdSt(Opc)) {
++MBBI;
break;
@@ -1722,6 +1765,7 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
TRI = Subtarget->getRegisterInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
// Resize the modified and used register bitfield trackers. We do this once
// per function and then clear the bitfield each time we optimize a load or
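
With the hunk above, the load/store optimizer now queries AliasAnalysis through getAnalysis<AAResultsWrapperPass>(). For that call to be legal, the pass also has to declare the dependency in its getAnalysisUsage override; that declaration is not part of the hunks shown here, so the following is only a minimal sketch of what it typically looks like.

// Minimal sketch (not from this diff): declaring the alias-analysis
// dependency so getAnalysis<AAResultsWrapperPass>() can be called above.
void AArch64LoadStoreOpt::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<AAResultsWrapperPass>();
  AU.setPreservesCFG();
  MachineFunctionPass::getAnalysisUsage(AU);
}
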
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
index 45083df..f82b9db 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -151,13 +151,24 @@ MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
return MCOperand::createExpr(Expr);
}
+MCOperand AArch64MCInstLower::lowerSymbolOperandCOFF(const MachineOperand &MO,
+ MCSymbol *Sym) const {
+ MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+ const MCExpr *Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
+ if (!MO.isJTI() && MO.getOffset())
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx);
+ return MCOperand::createExpr(Expr);
+}
+
MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
MCSymbol *Sym) const {
if (Printer.TM.getTargetTriple().isOSDarwin())
return lowerSymbolOperandDarwin(MO, Sym);
+ if (Printer.TM.getTargetTriple().isOSBinFormatCOFF())
+ return lowerSymbolOperandCOFF(MO, Sym);
- assert(Printer.TM.getTargetTriple().isOSBinFormatELF() &&
- "Expect Darwin or ELF target");
+ assert(Printer.TM.getTargetTriple().isOSBinFormatELF() && "Invalid target");
return lowerSymbolOperandELF(MO, Sym);
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h
index 1e29b80..aa30fe1 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.h
@@ -42,6 +42,8 @@ public:
MCSymbol *Sym) const;
MCOperand lowerSymbolOperandELF(const MachineOperand &MO,
MCSymbol *Sym) const;
+ MCOperand lowerSymbolOperandCOFF(const MachineOperand &MO,
+ MCSymbol *Sym) const;
MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
new file mode 100644
index 0000000..963cfad
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -0,0 +1,166 @@
+//===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AArch64 implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MacroFusion.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
+/// together. When FirstMI is unspecified, check whether SecondMI may be
+/// part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII);
+ const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI);
+
+ // Assume wildcards for unspecified instrs.
+ unsigned FirstOpcode =
+ FirstMI ? FirstMI->getOpcode()
+ : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
+ unsigned SecondOpcode = SecondMI.getOpcode();
+
+ if (ST.hasArithmeticBccFusion())
+ // Fuse CMN, CMP, TST followed by Bcc.
+ if (SecondOpcode == AArch64::Bcc)
+ switch (FirstOpcode) {
+ default:
+ return false;
+ case AArch64::ADDSWri:
+ case AArch64::ADDSWrr:
+ case AArch64::ADDSXri:
+ case AArch64::ADDSXrr:
+ case AArch64::ANDSWri:
+ case AArch64::ANDSWrr:
+ case AArch64::ANDSXri:
+ case AArch64::ANDSXrr:
+ case AArch64::SUBSWri:
+ case AArch64::SUBSWrr:
+ case AArch64::SUBSXri:
+ case AArch64::SUBSXrr:
+ case AArch64::BICSWrr:
+ case AArch64::BICSXrr:
+ return true;
+ case AArch64::ADDSWrs:
+ case AArch64::ADDSXrs:
+ case AArch64::ANDSWrs:
+ case AArch64::ANDSXrs:
+ case AArch64::SUBSWrs:
+ case AArch64::SUBSXrs:
+ case AArch64::BICSWrs:
+ case AArch64::BICSXrs:
+ // Shift value can be 0 making these behave like the "rr" variant...
+ return !II.hasShiftedReg(*FirstMI);
+ case AArch64::INSTRUCTION_LIST_END:
+ return true;
+ }
+
+ if (ST.hasArithmeticCbzFusion())
+ // Fuse ALU operations followed by CBZ/CBNZ.
+ if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
+ SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX)
+ switch (FirstOpcode) {
+ default:
+ return false;
+ case AArch64::ADDWri:
+ case AArch64::ADDWrr:
+ case AArch64::ADDXri:
+ case AArch64::ADDXrr:
+ case AArch64::ANDWri:
+ case AArch64::ANDWrr:
+ case AArch64::ANDXri:
+ case AArch64::ANDXrr:
+ case AArch64::EORWri:
+ case AArch64::EORWrr:
+ case AArch64::EORXri:
+ case AArch64::EORXrr:
+ case AArch64::ORRWri:
+ case AArch64::ORRWrr:
+ case AArch64::ORRXri:
+ case AArch64::ORRXrr:
+ case AArch64::SUBWri:
+ case AArch64::SUBWrr:
+ case AArch64::SUBXri:
+ case AArch64::SUBXrr:
+ return true;
+ case AArch64::ADDWrs:
+ case AArch64::ADDXrs:
+ case AArch64::ANDWrs:
+ case AArch64::ANDXrs:
+ case AArch64::SUBWrs:
+ case AArch64::SUBXrs:
+ case AArch64::BICWrs:
+ case AArch64::BICXrs:
+ // Shift value can be 0 making these behave like the "rr" variant...
+ return !II.hasShiftedReg(*FirstMI);
+ case AArch64::INSTRUCTION_LIST_END:
+ return true;
+ }
+
+ if (ST.hasFuseAES())
+ // Fuse AES crypto operations.
+ switch(SecondOpcode) {
+ // AES encode.
+ case AArch64::AESMCrr:
+ case AArch64::AESMCrrTied:
+ return FirstOpcode == AArch64::AESErr ||
+ FirstOpcode == AArch64::INSTRUCTION_LIST_END;
+ // AES decode.
+ case AArch64::AESIMCrr:
+ case AArch64::AESIMCrrTied:
+ return FirstOpcode == AArch64::AESDrr ||
+ FirstOpcode == AArch64::INSTRUCTION_LIST_END;
+ }
+
+ if (ST.hasFuseLiterals())
+ // Fuse literal generation operations.
+ switch (SecondOpcode) {
+ // PC relative address.
+ case AArch64::ADDXri:
+ return FirstOpcode == AArch64::ADRP ||
+ FirstOpcode == AArch64::INSTRUCTION_LIST_END;
+ // 32 bit immediate.
+ case AArch64::MOVKWi:
+ return (FirstOpcode == AArch64::MOVZWi &&
+ SecondMI.getOperand(3).getImm() == 16) ||
+ FirstOpcode == AArch64::INSTRUCTION_LIST_END;
+ // Lower and upper half of 64 bit immediate.
+ case AArch64::MOVKXi:
+ return FirstOpcode == AArch64::INSTRUCTION_LIST_END ||
+ (FirstOpcode == AArch64::MOVZXi &&
+ SecondMI.getOperand(3).getImm() == 16) ||
+ (FirstOpcode == AArch64::MOVKXi &&
+ FirstMI->getOperand(3).getImm() == 32 &&
+ SecondMI.getOperand(3).getImm() == 48);
+ }
+
+ return false;
+}
+
+} // end namespace
+
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation() {
+ return createMacroFusionDAGMutation(shouldScheduleAdjacent);
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h
new file mode 100644
index 0000000..32d90d4
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.h
@@ -0,0 +1,24 @@
+//===- AArch64MacroFusion.h - AArch64 Macro Fusion ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AArch64 definition of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+/// Note that you have to add:
+/// DAG.addMutation(createAArch64MacroFusionDAGMutation());
+/// to AArch64PassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation();
+
+} // llvm
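
As the header comment above says, the mutation only has an effect once AArch64PassConfig::createMachineScheduler() registers it on the scheduling DAG. That target-machine change is outside the hunks shown here; the sketch below only illustrates the shape of such a registration, and the real factory also installs other mutations and options.

// Illustrative sketch only; not the actual AArch64TargetMachine code.
ScheduleDAGInstrs *
AArch64PassConfig::createMachineScheduler(MachineSchedContext *C) const {
  ScheduleDAGMILive *DAG =
      new ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C));
  // Keep fusible pairs (e.g. CMP + Bcc, AESE + AESMC) adjacent.
  DAG->addMutation(createAArch64MacroFusionDAGMutation());
  return DAG;
}
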
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
index 038162c..fe4ef4b 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -17,8 +17,8 @@
#define DEBUG_TYPE "aarch64-pbqp"
-#include "AArch64.h"
#include "AArch64PBQPRegAlloc.h"
+#include "AArch64.h"
#include "AArch64RegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h
index 4f656f9..b99c1d1 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.h
@@ -1,4 +1,4 @@
-//===-- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints -------===//
+//==- AArch64PBQPRegAlloc.h - AArch64 specific PBQP constraints --*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
@@ -15,6 +15,8 @@
namespace llvm {
+class TargetRegisterInfo;
+
/// Add the accumulator chaining constraint to a PBQP graph
class A57ChainingConstraint : public PBQPRAConstraint {
public:
@@ -33,6 +35,7 @@ private:
// Add constraints between existing chains
void addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, unsigned Ra);
};
-}
+
+} // end namespace llvm
#endif // LLVM_LIB_TARGET_AARCH64_AARCH64PBQPREGALOC_H
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
index 8f45e6a..4e65c0a 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -12,13 +12,14 @@
// CBZW %W0, <BB#2>
// BB#2:
// %W0 = COPY %WZR
-// This pass should be run after register allocation.
+// Similarly, this pass also handles non-zero copies.
+// BB#0:
+// cmp x0, #1
+// b.eq .LBB0_1
+// .LBB0_1:
+// orr x0, xzr, #0x1
//
-// FIXME: This should be extended to handle any constant other than zero. E.g.,
-// cmp w0, #1
-// b.eq .BB1
-// BB1:
-// mov w0, #1
+// This pass should be run after register allocation.
//
// FIXME: This could also be extended to check the whole dominance subtree below
// the comparison if the compile time regression is acceptable.
@@ -26,6 +27,7 @@
//===----------------------------------------------------------------------===//
#include "AArch64.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
@@ -43,6 +45,7 @@ namespace {
class AArch64RedundantCopyElimination : public MachineFunctionPass {
const MachineRegisterInfo *MRI;
const TargetRegisterInfo *TRI;
+ BitVector ClobberedRegs;
public:
static char ID;
@@ -50,6 +53,16 @@ public:
initializeAArch64RedundantCopyEliminationPass(
*PassRegistry::getPassRegistry());
}
+
+ struct RegImm {
+ MCPhysReg Reg;
+ int32_t Imm;
+ RegImm(MCPhysReg Reg, int32_t Imm) : Reg(Reg), Imm(Imm) {}
+ };
+
+ Optional<RegImm> knownRegValInBlock(MachineInstr &CondBr,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &FirstUse);
bool optimizeCopy(MachineBasicBlock *MBB);
bool runOnMachineFunction(MachineFunction &MF) override;
MachineFunctionProperties getRequiredProperties() const override {
@@ -66,18 +79,121 @@ char AArch64RedundantCopyElimination::ID = 0;
INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim",
"AArch64 redundant copy elimination pass", false, false)
-static bool guaranteesZeroRegInBlock(MachineInstr &MI, MachineBasicBlock *MBB) {
- unsigned Opc = MI.getOpcode();
+/// Remember what registers the specified instruction modifies.
+static void trackRegDefs(const MachineInstr &MI, BitVector &ClobberedRegs,
+ const TargetRegisterInfo *TRI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isRegMask()) {
+ ClobberedRegs.setBitsNotInMask(MO.getRegMask());
+ continue;
+ }
+
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (!MO.isDef())
+ continue;
+
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ ClobberedRegs.set(*AI);
+ }
+}
+
+/// It's possible to determine the value of a register based on a dominating
+/// condition. To do so, this function checks whether the basic block \p MBB
+/// is the target of a conditional branch \p CondBr whose equality comparison
+/// is against a constant. If so, return a known physical register and
+/// constant value pair. Otherwise, return None.
+Optional<AArch64RedundantCopyElimination::RegImm>
+AArch64RedundantCopyElimination::knownRegValInBlock(
+ MachineInstr &CondBr, MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &FirstUse) {
+ unsigned Opc = CondBr.getOpcode();
+
// Check if the current basic block is the target block to which the
// CBZ/CBNZ instruction jumps when its Wt/Xt is zero.
- if ((Opc == AArch64::CBZW || Opc == AArch64::CBZX) &&
- MBB == MI.getOperand(1).getMBB())
- return true;
- else if ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) &&
- MBB != MI.getOperand(1).getMBB())
- return true;
-
- return false;
+ if (((Opc == AArch64::CBZW || Opc == AArch64::CBZX) &&
+ MBB == CondBr.getOperand(1).getMBB()) ||
+ ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) &&
+ MBB != CondBr.getOperand(1).getMBB())) {
+ FirstUse = CondBr;
+ return RegImm(CondBr.getOperand(0).getReg(), 0);
+ }
+
+ // Otherwise, must be a conditional branch.
+ if (Opc != AArch64::Bcc)
+ return None;
+
+ // Must be an equality check (i.e., == or !=).
+ AArch64CC::CondCode CC = (AArch64CC::CondCode)CondBr.getOperand(0).getImm();
+ if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
+ return None;
+
+ MachineBasicBlock *BrTarget = CondBr.getOperand(1).getMBB();
+ if ((CC == AArch64CC::EQ && BrTarget != MBB) ||
+ (CC == AArch64CC::NE && BrTarget == MBB))
+ return None;
+
+ // Stop if we get to the beginning of PredMBB.
+ MachineBasicBlock *PredMBB = *MBB->pred_begin();
+ assert(PredMBB == CondBr.getParent() &&
+ "Conditional branch not in predecessor block!");
+ if (CondBr == PredMBB->begin())
+ return None;
+
+ // Registers clobbered in PredMBB between CondBr instruction and current
+ // instruction being checked in loop.
+ ClobberedRegs.reset();
+
+ // Find compare instruction that sets NZCV used by CondBr.
+ MachineBasicBlock::reverse_iterator RIt = CondBr.getReverseIterator();
+ for (MachineInstr &PredI : make_range(std::next(RIt), PredMBB->rend())) {
+
+ // Track clobbered registers.
+ trackRegDefs(PredI, ClobberedRegs, TRI);
+
+ bool IsCMN = false;
+ switch (PredI.getOpcode()) {
+ default:
+ break;
+
+ // CMN is an alias for ADDS with a dead destination register.
+ case AArch64::ADDSWri:
+ case AArch64::ADDSXri:
+ IsCMN = true;
+ LLVM_FALLTHROUGH;
+ // CMP is an alias for SUBS with a dead destination register.
+ case AArch64::SUBSWri:
+ case AArch64::SUBSXri: {
+ MCPhysReg SrcReg = PredI.getOperand(1).getReg();
+
+ // Must not be a symbolic immediate.
+ if (!PredI.getOperand(2).isImm())
+ return None;
+
+ // The src register must not be modified between the cmp and conditional
+ // branch. This includes a self-clobbering compare.
+ if (ClobberedRegs[SrcReg])
+ return None;
+
+ // We've found the Cmp that sets NZCV.
+ int32_t KnownImm = PredI.getOperand(2).getImm();
+ int32_t Shift = PredI.getOperand(3).getImm();
+ KnownImm <<= Shift;
+ if (IsCMN)
+ KnownImm = -KnownImm;
+ FirstUse = PredI;
+ return RegImm(SrcReg, KnownImm);
+ }
+ }
+
+ // Bail if we see an instruction that defines NZCV that we don't handle.
+ if (PredI.definesRegister(AArch64::NZCV))
+ return None;
+ }
+ return None;
}
bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) {
@@ -85,79 +201,187 @@ bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) {
if (MBB->pred_size() != 1)
return false;
+ // Check if the predecessor has two successors, implying the block ends in a
+ // conditional branch.
MachineBasicBlock *PredMBB = *MBB->pred_begin();
- MachineBasicBlock::iterator CompBr = PredMBB->getLastNonDebugInstr();
- if (CompBr == PredMBB->end() || PredMBB->succ_size() != 2)
+ if (PredMBB->succ_size() != 2)
+ return false;
+
+ MachineBasicBlock::iterator CondBr = PredMBB->getLastNonDebugInstr();
+ if (CondBr == PredMBB->end())
return false;
- ++CompBr;
+ // Keep track of the earliest point in the PredMBB block where kill markers
+ // need to be removed if a COPY is removed.
+ MachineBasicBlock::iterator FirstUse;
+ // After calling knownRegValInBlock, FirstUse will either point to a CBZ/CBNZ
+ // or a compare (i.e., SUBS). In the latter case, we must take care when
+ // updating FirstUse when scanning for COPY instructions. In particular, if
+ // there's a COPY in between the compare and branch the COPY should not
+ // update FirstUse.
+ bool SeenFirstUse = false;
+ // Registers that contain a known value at the start of MBB.
+ SmallVector<RegImm, 4> KnownRegs;
+
+ MachineBasicBlock::iterator Itr = std::next(CondBr);
do {
- --CompBr;
- if (guaranteesZeroRegInBlock(*CompBr, MBB))
- break;
- } while (CompBr != PredMBB->begin() && CompBr->isTerminator());
+ --Itr;
- // We've not found a CBZ/CBNZ, time to bail out.
- if (!guaranteesZeroRegInBlock(*CompBr, MBB))
- return false;
+ Optional<RegImm> KnownRegImm = knownRegValInBlock(*Itr, MBB, FirstUse);
+ if (KnownRegImm == None)
+ continue;
- unsigned TargetReg = CompBr->getOperand(0).getReg();
- if (!TargetReg)
- return false;
- assert(TargetRegisterInfo::isPhysicalRegister(TargetReg) &&
- "Expect physical register");
+ KnownRegs.push_back(*KnownRegImm);
+
+ // Reset the clobber list, which is used by knownRegValInBlock.
+ ClobberedRegs.reset();
+
+ // Look backward in PredMBB for COPYs from the known reg to find other
+ // registers that are known to be a constant value.
+ for (auto PredI = Itr;; --PredI) {
+ if (FirstUse == PredI)
+ SeenFirstUse = true;
+
+ if (PredI->isCopy()) {
+ MCPhysReg CopyDstReg = PredI->getOperand(0).getReg();
+ MCPhysReg CopySrcReg = PredI->getOperand(1).getReg();
+ for (auto &KnownReg : KnownRegs) {
+ if (ClobberedRegs[KnownReg.Reg])
+ continue;
+          // If we have X = COPY Y, and Y is known to hold a constant, then X
+          // now holds the same constant.
+ if (CopySrcReg == KnownReg.Reg && !ClobberedRegs[CopyDstReg]) {
+ KnownRegs.push_back(RegImm(CopyDstReg, KnownReg.Imm));
+ if (SeenFirstUse)
+ FirstUse = PredI;
+ break;
+ }
+          // If we have X = COPY Y, and X is known to hold a constant, then Y
+          // is known to hold the same constant.
+ if (CopyDstReg == KnownReg.Reg && !ClobberedRegs[CopySrcReg]) {
+ KnownRegs.push_back(RegImm(CopySrcReg, KnownReg.Imm));
+ if (SeenFirstUse)
+ FirstUse = PredI;
+ break;
+ }
+ }
+ }
+
+ // Stop if we get to the beginning of PredMBB.
+ if (PredI == PredMBB->begin())
+ break;
+
+ trackRegDefs(*PredI, ClobberedRegs, TRI);
+      // Stop if all of the known regs have been clobbered.
+ if (all_of(KnownRegs, [&](RegImm KnownReg) {
+ return ClobberedRegs[KnownReg.Reg];
+ }))
+ break;
+ }
+ break;
+
+ } while (Itr != PredMBB->begin() && Itr->isTerminator());
- // Remember all registers aliasing with TargetReg.
- SmallSetVector<unsigned, 8> TargetRegs;
- for (MCRegAliasIterator AI(TargetReg, TRI, true); AI.isValid(); ++AI)
- TargetRegs.insert(*AI);
+  // We haven't found any registers with a known value, so bail out.
+ if (KnownRegs.empty())
+ return false;
bool Changed = false;
+ // UsedKnownRegs is the set of KnownRegs that have had uses added to MBB.
+ SmallSetVector<unsigned, 4> UsedKnownRegs;
MachineBasicBlock::iterator LastChange = MBB->begin();
- unsigned SmallestDef = TargetReg;
- // Remove redundant Copy instructions unless TargetReg is modified.
+ // Remove redundant Copy instructions unless KnownReg is modified.
for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
MachineInstr *MI = &*I;
++I;
- if (MI->isCopy() && MI->getOperand(0).isReg() &&
- MI->getOperand(1).isReg()) {
-
- unsigned DefReg = MI->getOperand(0).getReg();
- unsigned SrcReg = MI->getOperand(1).getReg();
-
- if ((SrcReg == AArch64::XZR || SrcReg == AArch64::WZR) &&
- !MRI->isReserved(DefReg) &&
- (TargetReg == DefReg || TRI->isSuperRegister(DefReg, TargetReg))) {
- DEBUG(dbgs() << "Remove redundant Copy : ");
- DEBUG((MI)->print(dbgs()));
-
- MI->eraseFromParent();
- Changed = true;
- LastChange = I;
- NumCopiesRemoved++;
- SmallestDef =
- TRI->isSubRegister(SmallestDef, DefReg) ? DefReg : SmallestDef;
- continue;
+ bool RemovedMI = false;
+ bool IsCopy = MI->isCopy();
+ bool IsMoveImm = MI->isMoveImmediate();
+ if (IsCopy || IsMoveImm) {
+ MCPhysReg DefReg = MI->getOperand(0).getReg();
+ MCPhysReg SrcReg = IsCopy ? MI->getOperand(1).getReg() : 0;
+ int64_t SrcImm = IsMoveImm ? MI->getOperand(1).getImm() : 0;
+ if (!MRI->isReserved(DefReg) &&
+ ((IsCopy && (SrcReg == AArch64::XZR || SrcReg == AArch64::WZR)) ||
+ IsMoveImm)) {
+ for (RegImm &KnownReg : KnownRegs) {
+ if (KnownReg.Reg != DefReg &&
+ !TRI->isSuperRegister(DefReg, KnownReg.Reg))
+ continue;
+
+ // For a copy, the known value must be a zero.
+ if (IsCopy && KnownReg.Imm != 0)
+ continue;
+
+ if (IsMoveImm) {
+ // For a move immediate, the known immediate must match the source
+ // immediate.
+ if (KnownReg.Imm != SrcImm)
+ continue;
+
+ // Don't remove a move immediate that implicitly defines the upper
+ // bits when only the lower 32 bits are known.
+ MCPhysReg CmpReg = KnownReg.Reg;
+ if (any_of(MI->implicit_operands(), [CmpReg](MachineOperand &O) {
+ return !O.isDead() && O.isReg() && O.isDef() &&
+ O.getReg() != CmpReg;
+ }))
+ continue;
+ }
+
+ if (IsCopy)
+ DEBUG(dbgs() << "Remove redundant Copy : " << *MI);
+ else
+ DEBUG(dbgs() << "Remove redundant Move : " << *MI);
+
+ MI->eraseFromParent();
+ Changed = true;
+ LastChange = I;
+ NumCopiesRemoved++;
+ UsedKnownRegs.insert(KnownReg.Reg);
+ RemovedMI = true;
+ break;
+ }
}
}
- if (MI->modifiesRegister(TargetReg, TRI))
+ // Skip to the next instruction if we removed the COPY/MovImm.
+ if (RemovedMI)
+ continue;
+
+    // Remove any regs the MI clobbers from the KnownRegs set.
+ for (unsigned RI = 0; RI < KnownRegs.size();)
+ if (MI->modifiesRegister(KnownRegs[RI].Reg, TRI)) {
+ std::swap(KnownRegs[RI], KnownRegs[KnownRegs.size() - 1]);
+ KnownRegs.pop_back();
+ // Don't increment RI since we need to now check the swapped-in
+ // KnownRegs[RI].
+ } else {
+ ++RI;
+ }
+
+ // Continue until the KnownRegs set is empty.
+ if (KnownRegs.empty())
break;
}
if (!Changed)
return false;
- // Otherwise, we have to fixup the use-def chain, starting with the
- // CBZ/CBNZ. Conservatively mark as much as we can live.
- CompBr->clearRegisterKills(SmallestDef, TRI);
+ // Add newly used regs to the block's live-in list if they aren't there
+ // already.
+ for (MCPhysReg KnownReg : UsedKnownRegs)
+ if (!MBB->isLiveIn(KnownReg))
+ MBB->addLiveIn(KnownReg);
- if (none_of(TargetRegs, [&](unsigned Reg) { return MBB->isLiveIn(Reg); }))
- MBB->addLiveIn(TargetReg);
-
- // Clear any kills of TargetReg between CompBr and the last removed COPY.
+ // Clear kills in the range where changes were made. This is conservative,
+ // but should be okay since kill markers are being phased out.
+ DEBUG(dbgs() << "Clearing kill flags.\n\tFirstUse: " << *FirstUse
+ << "\tLastChange: " << *LastChange);
+ for (MachineInstr &MMI : make_range(FirstUse, PredMBB->end()))
+ MMI.clearKillInfo();
for (MachineInstr &MMI : make_range(MBB->begin(), LastChange))
- MMI.clearRegisterKills(SmallestDef, TRI);
+ MMI.clearKillInfo();
return true;
}
@@ -168,6 +392,11 @@ bool AArch64RedundantCopyElimination::runOnMachineFunction(
return false;
TRI = MF.getSubtarget().getRegisterInfo();
MRI = &MF.getRegInfo();
+
+ // Resize the clobber register bitfield tracker. We do this once per
+ // function and then clear the bitfield each time we optimize a copy.
+ ClobberedRegs.resize(TRI->getNumRegs());
+
bool Changed = false;
for (MachineBasicBlock &MBB : MF)
Changed |= optimizeCopy(&MBB);
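
A detail worth calling out in knownRegValInBlock above: the known immediate is the 12-bit compare immediate shifted by its encoded shift amount, and it is negated for CMN, because CMN Xn, #imm is ADDS XZR, Xn, #imm and therefore tests Xn against -imm. A small standalone helper mirroring that computation, for illustration only:

// Mirrors the KnownImm computation in knownRegValInBlock (illustration only).
// Example: "cmn x0, #3, lsl #12" taken on b.eq pins x0 to -(3 << 12) = -12288.
static int32_t knownImmediate(bool IsCMN, int32_t Imm, int32_t Shift) {
  int32_t KnownImm = Imm << Shift;
  return IsCMN ? -KnownImm : KnownImm;
}
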
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index b292c9c..69124db 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -1,4 +1,4 @@
-//===- AArch64RegisterBankInfo.cpp -------------------------------*- C++ -*-==//
+//===- AArch64RegisterBankInfo.cpp ----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -13,13 +13,24 @@
//===----------------------------------------------------------------------===//
#include "AArch64RegisterBankInfo.h"
-#include "AArch64InstrInfo.h" // For XXXRegClassID.
-#include "llvm/CodeGen/LowLevelType.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetOpcodes.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+
+#define GET_TARGET_REGBANK_IMPL
+#include "AArch64GenRegisterBank.inc"
// This file will be TableGen'ed at some point.
#include "AArch64GenRegisterBankInfo.def"
@@ -31,7 +42,7 @@ using namespace llvm;
#endif
AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
- : RegisterBankInfo(AArch64::RegBanks, AArch64::NumRegisterBanks) {
+ : AArch64GenRegisterBankInfo() {
static bool AlreadyInit = false;
// We have only one set of register banks, whatever the subtarget
// is. Therefore, the initialization of the RegBanks table should be
@@ -78,44 +89,21 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
// Check that the TableGen'ed like file is in sync we our expectations.
// First, the Idx.
- assert(AArch64::PartialMappingIdx::PMI_GPR32 ==
- AArch64::PartialMappingIdx::PMI_FirstGPR &&
- "GPR32 index not first in the GPR list");
- assert(AArch64::PartialMappingIdx::PMI_GPR64 ==
- AArch64::PartialMappingIdx::PMI_LastGPR &&
- "GPR64 index not last in the GPR list");
- assert(AArch64::PartialMappingIdx::PMI_FirstGPR <=
- AArch64::PartialMappingIdx::PMI_LastGPR &&
- "GPR list is backward");
- assert(AArch64::PartialMappingIdx::PMI_FPR32 ==
- AArch64::PartialMappingIdx::PMI_FirstFPR &&
- "FPR32 index not first in the FPR list");
- assert(AArch64::PartialMappingIdx::PMI_FPR512 ==
- AArch64::PartialMappingIdx::PMI_LastFPR &&
- "FPR512 index not last in the FPR list");
- assert(AArch64::PartialMappingIdx::PMI_FirstFPR <=
- AArch64::PartialMappingIdx::PMI_LastFPR &&
- "FPR list is backward");
- assert(AArch64::PartialMappingIdx::PMI_FPR32 + 1 ==
- AArch64::PartialMappingIdx::PMI_FPR64 &&
- AArch64::PartialMappingIdx::PMI_FPR64 + 1 ==
- AArch64::PartialMappingIdx::PMI_FPR128 &&
- AArch64::PartialMappingIdx::PMI_FPR128 + 1 ==
- AArch64::PartialMappingIdx::PMI_FPR256 &&
- AArch64::PartialMappingIdx::PMI_FPR256 + 1 ==
- AArch64::PartialMappingIdx::PMI_FPR512 &&
- "FPR indices not properly ordered");
+ assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR,
+ {PMI_GPR32, PMI_GPR64}) &&
+ "PartialMappingIdx's are incorrectly ordered");
+ assert(checkPartialMappingIdx(
+ PMI_FirstFPR, PMI_LastFPR,
+ {PMI_FPR32, PMI_FPR64, PMI_FPR128, PMI_FPR256, PMI_FPR512}) &&
+ "PartialMappingIdx's are incorrectly ordered");
// Now, the content.
// Check partial mapping.
#define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB) \
do { \
- const PartialMapping &Map = \
- AArch64::PartMappings[AArch64::PartialMappingIdx::Idx - \
- AArch64::PartialMappingIdx::PMI_Min]; \
- (void)Map; \
- assert(Map.StartIdx == ValStartIdx && Map.Length == ValLength && \
- Map.RegBank == &RB && #Idx " is incorrectly initialized"); \
- } while (0)
+ assert( \
+ checkPartialMap(PartialMappingIdx::Idx, ValStartIdx, ValLength, RB) && \
+ #Idx " is incorrectly initialized"); \
+ } while (false)
CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
@@ -128,17 +116,11 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
// Check value mapping.
#define CHECK_VALUEMAP_IMPL(RBName, Size, Offset) \
do { \
- unsigned PartialMapBaseIdx = \
- AArch64::PartialMappingIdx::PMI_##RBName##Size - \
- AArch64::PartialMappingIdx::PMI_Min; \
- (void)PartialMapBaseIdx; \
- const ValueMapping &Map = AArch64::getValueMapping( \
- AArch64::PartialMappingIdx::PMI_First##RBName, Size)[Offset]; \
- (void)Map; \
- assert(Map.BreakDown == &AArch64::PartMappings[PartialMapBaseIdx] && \
- Map.NumBreakDowns == 1 && #RBName #Size \
- " " #Offset " is incorrectly initialized"); \
- } while (0)
+ assert(checkValueMapImpl(PartialMappingIdx::PMI_##RBName##Size, \
+ PartialMappingIdx::PMI_First##RBName, Size, \
+ Offset) && \
+ #RBName #Size " " #Offset " is incorrectly initialized"); \
+ } while (false)
#define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0)
@@ -157,7 +139,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
CHECK_VALUEMAP_IMPL(RBName, Size, 0); \
CHECK_VALUEMAP_IMPL(RBName, Size, 1); \
CHECK_VALUEMAP_IMPL(RBName, Size, 2); \
- } while (0)
+ } while (false)
CHECK_VALUEMAP_3OPS(GPR, 32);
CHECK_VALUEMAP_3OPS(GPR, 64);
@@ -169,24 +151,23 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
#define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size) \
do { \
- unsigned PartialMapDstIdx = \
- AArch64::PMI_##RBNameDst##Size - AArch64::PMI_Min; \
- unsigned PartialMapSrcIdx = \
- AArch64::PMI_##RBNameSrc##Size - AArch64::PMI_Min; \
- (void) PartialMapDstIdx; \
- (void) PartialMapSrcIdx; \
- const ValueMapping *Map = AArch64::getCopyMapping( \
- AArch64::PMI_First##RBNameDst == AArch64::PMI_FirstGPR, \
- AArch64::PMI_First##RBNameSrc == AArch64::PMI_FirstGPR, Size); \
- (void) Map; \
- assert(Map[0].BreakDown == &AArch64::PartMappings[PartialMapDstIdx] && \
+ unsigned PartialMapDstIdx = PMI_##RBNameDst##Size - PMI_Min; \
+ unsigned PartialMapSrcIdx = PMI_##RBNameSrc##Size - PMI_Min; \
+ (void)PartialMapDstIdx; \
+ (void)PartialMapSrcIdx; \
+ const ValueMapping *Map = getCopyMapping( \
+ AArch64::RBNameDst##RegBankID, AArch64::RBNameSrc##RegBankID, Size); \
+ (void)Map; \
+ assert(Map[0].BreakDown == \
+ &AArch64GenRegisterBankInfo::PartMappings[PartialMapDstIdx] && \
Map[0].NumBreakDowns == 1 && #RBNameDst #Size \
" Dst is incorrectly initialized"); \
- assert(Map[1].BreakDown == &AArch64::PartMappings[PartialMapSrcIdx] && \
+ assert(Map[1].BreakDown == \
+ &AArch64GenRegisterBankInfo::PartMappings[PartialMapSrcIdx] && \
Map[1].NumBreakDowns == 1 && #RBNameSrc #Size \
" Src is incorrectly initialized"); \
\
- } while (0)
+ } while (false)
CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32);
CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32);
@@ -279,17 +260,15 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
if (MI.getNumOperands() != 3)
break;
InstructionMappings AltMappings;
- InstructionMapping GPRMapping(
- /*ID*/ 1, /*Cost*/ 1,
- AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size),
+ const InstructionMapping &GPRMapping = getInstructionMapping(
+ /*ID*/ 1, /*Cost*/ 1, getValueMapping(PMI_FirstGPR, Size),
/*NumOperands*/ 3);
- InstructionMapping FPRMapping(
- /*ID*/ 2, /*Cost*/ 1,
- AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size),
+ const InstructionMapping &FPRMapping = getInstructionMapping(
+ /*ID*/ 2, /*Cost*/ 1, getValueMapping(PMI_FirstFPR, Size),
/*NumOperands*/ 3);
- AltMappings.emplace_back(std::move(GPRMapping));
- AltMappings.emplace_back(std::move(FPRMapping));
+ AltMappings.push_back(&GPRMapping);
+ AltMappings.push_back(&FPRMapping);
return AltMappings;
}
case TargetOpcode::G_BITCAST: {
@@ -303,29 +282,29 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
break;
InstructionMappings AltMappings;
- InstructionMapping GPRMapping(
+ const InstructionMapping &GPRMapping = getInstructionMapping(
/*ID*/ 1, /*Cost*/ 1,
- AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ true, Size),
+ getCopyMapping(AArch64::GPRRegBankID, AArch64::GPRRegBankID, Size),
/*NumOperands*/ 2);
- InstructionMapping FPRMapping(
+ const InstructionMapping &FPRMapping = getInstructionMapping(
/*ID*/ 2, /*Cost*/ 1,
- AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ false, Size),
+ getCopyMapping(AArch64::FPRRegBankID, AArch64::FPRRegBankID, Size),
/*NumOperands*/ 2);
- InstructionMapping GPRToFPRMapping(
+ const InstructionMapping &GPRToFPRMapping = getInstructionMapping(
/*ID*/ 3,
/*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size),
- AArch64::getCopyMapping(/*DstIsGPR*/ false, /*SrcIsGPR*/ true, Size),
+ getCopyMapping(AArch64::FPRRegBankID, AArch64::GPRRegBankID, Size),
/*NumOperands*/ 2);
- InstructionMapping FPRToGPRMapping(
+ const InstructionMapping &FPRToGPRMapping = getInstructionMapping(
/*ID*/ 3,
/*Cost*/ copyCost(AArch64::GPRRegBank, AArch64::FPRRegBank, Size),
- AArch64::getCopyMapping(/*DstIsGPR*/ true, /*SrcIsGPR*/ false, Size),
+ getCopyMapping(AArch64::GPRRegBankID, AArch64::FPRRegBankID, Size),
/*NumOperands*/ 2);
- AltMappings.emplace_back(std::move(GPRMapping));
- AltMappings.emplace_back(std::move(FPRMapping));
- AltMappings.emplace_back(std::move(GPRToFPRMapping));
- AltMappings.emplace_back(std::move(FPRToGPRMapping));
+ AltMappings.push_back(&GPRMapping);
+ AltMappings.push_back(&FPRMapping);
+ AltMappings.push_back(&GPRToFPRMapping);
+ AltMappings.push_back(&FPRToGPRMapping);
return AltMappings;
}
case TargetOpcode::G_LOAD: {
@@ -339,23 +318,21 @@ AArch64RegisterBankInfo::getInstrAlternativeMappings(
break;
InstructionMappings AltMappings;
- InstructionMapping GPRMapping(
+ const InstructionMapping &GPRMapping = getInstructionMapping(
/*ID*/ 1, /*Cost*/ 1,
- getOperandsMapping(
- {AArch64::getValueMapping(AArch64::PMI_FirstGPR, Size),
- // Addresses are GPR 64-bit.
- AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}),
+ getOperandsMapping({getValueMapping(PMI_FirstGPR, Size),
+ // Addresses are GPR 64-bit.
+ getValueMapping(PMI_FirstGPR, 64)}),
/*NumOperands*/ 2);
- InstructionMapping FPRMapping(
+ const InstructionMapping &FPRMapping = getInstructionMapping(
/*ID*/ 2, /*Cost*/ 1,
- getOperandsMapping(
- {AArch64::getValueMapping(AArch64::PMI_FirstFPR, Size),
- // Addresses are GPR 64-bit.
- AArch64::getValueMapping(AArch64::PMI_FirstGPR, 64)}),
+ getOperandsMapping({getValueMapping(PMI_FirstFPR, Size),
+ // Addresses are GPR 64-bit.
+ getValueMapping(PMI_FirstGPR, 64)}),
/*NumOperands*/ 2);
- AltMappings.emplace_back(std::move(GPRMapping));
- AltMappings.emplace_back(std::move(FPRMapping));
+ AltMappings.push_back(&GPRMapping);
+ AltMappings.push_back(&FPRMapping);
return AltMappings;
}
default:
@@ -369,13 +346,12 @@ void AArch64RegisterBankInfo::applyMappingImpl(
switch (OpdMapper.getMI().getOpcode()) {
case TargetOpcode::G_OR:
case TargetOpcode::G_BITCAST:
- case TargetOpcode::G_LOAD: {
+ case TargetOpcode::G_LOAD:
// Those ID must match getInstrAlternativeMappings.
assert((OpdMapper.getInstrMapping().getID() >= 1 &&
OpdMapper.getInstrMapping().getID() <= 4) &&
"Don't know how to handle that ID");
return applyDefaultMapping(OpdMapper);
- }
default:
llvm_unreachable("Don't know how to handle that operation");
}
@@ -397,8 +373,9 @@ static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) {
return false;
}
-RegisterBankInfo::InstructionMapping
-AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) {
+const RegisterBankInfo::InstructionMapping &
+AArch64RegisterBankInfo::getSameKindOfOperandsMapping(
+ const MachineInstr &MI) const {
const unsigned Opc = MI.getOpcode();
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -411,6 +388,8 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) {
unsigned Size = Ty.getSizeInBits();
bool IsFPR = Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc);
+ PartialMappingIdx RBIdx = IsFPR ? PMI_FirstFPR : PMI_FirstGPR;
+
#ifndef NDEBUG
// Make sure all the operands are using similar size and type.
// Should probably be checked by the machine verifier.
@@ -422,23 +401,22 @@ AArch64RegisterBankInfo::getSameKindOfOperandsMapping(const MachineInstr &MI) {
// for each types.
for (unsigned Idx = 1; Idx != NumOperands; ++Idx) {
LLT OpTy = MRI.getType(MI.getOperand(Idx).getReg());
- assert(AArch64::getRegBankBaseIdxOffset(OpTy.getSizeInBits()) ==
- AArch64::getRegBankBaseIdxOffset(Size) &&
- "Operand has incompatible size");
+ assert(
+ AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(
+ RBIdx, OpTy.getSizeInBits()) ==
+ AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(RBIdx, Size) &&
+ "Operand has incompatible size");
bool OpIsFPR = OpTy.isVector() || isPreISelGenericFloatingPointOpcode(Opc);
(void)OpIsFPR;
assert(IsFPR == OpIsFPR && "Operand has incompatible type");
}
#endif // End NDEBUG.
- AArch64::PartialMappingIdx RBIdx =
- IsFPR ? AArch64::PMI_FirstFPR : AArch64::PMI_FirstGPR;
-
- return InstructionMapping{DefaultMappingID, 1,
- AArch64::getValueMapping(RBIdx, Size), NumOperands};
+ return getInstructionMapping(DefaultMappingID, 1,
+ getValueMapping(RBIdx, Size), NumOperands);
}
-RegisterBankInfo::InstructionMapping
+const RegisterBankInfo::InstructionMapping &
AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const unsigned Opc = MI.getOpcode();
const MachineFunction &MF = *MI.getParent()->getParent();
@@ -447,7 +425,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Try the default logic for non-generic instructions that are either copies
// or already have some operands assigned to banks.
if (!isPreISelGenericOpcode(Opc)) {
- RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI);
+ const RegisterBankInfo::InstructionMapping &Mapping =
+ getInstrMappingImpl(MI);
if (Mapping.isValid())
return Mapping;
}
@@ -485,14 +464,11 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
DstIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank;
const RegisterBank &SrcRB =
SrcIsGPR ? AArch64::GPRRegBank : AArch64::FPRRegBank;
- return InstructionMapping{DefaultMappingID, copyCost(DstRB, SrcRB, Size),
- AArch64::getCopyMapping(DstIsGPR, SrcIsGPR, Size),
- /*NumOperands*/ 2};
+ return getInstructionMapping(
+ DefaultMappingID, copyCost(DstRB, SrcRB, Size),
+ getCopyMapping(DstRB.getID(), SrcRB.getID(), Size),
+ /*NumOperands*/ 2);
}
- case TargetOpcode::G_SEQUENCE:
- // FIXME: support this, but the generic code is really not going to do
- // anything sane.
- return InstructionMapping();
default:
break;
}
@@ -501,10 +477,10 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Track the size and bank of each register. We don't do partial mappings.
SmallVector<unsigned, 4> OpSize(NumOperands);
- SmallVector<AArch64::PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
+ SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
auto &MO = MI.getOperand(Idx);
- if (!MO.isReg())
+ if (!MO.isReg() || !MO.getReg())
continue;
LLT Ty = MRI.getType(MO.getReg());
@@ -513,9 +489,9 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// As a top-level guess, vectors go in FPRs, scalars and pointers in GPRs.
// For floating-point instructions, scalars go in FPRs.
if (Ty.isVector() || isPreISelGenericFloatingPointOpcode(Opc))
- OpRegBankIdx[Idx] = AArch64::PMI_FirstFPR;
+ OpRegBankIdx[Idx] = PMI_FirstFPR;
else
- OpRegBankIdx[Idx] = AArch64::PMI_FirstGPR;
+ OpRegBankIdx[Idx] = PMI_FirstGPR;
}
unsigned Cost = 1;
@@ -523,50 +499,74 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// fine-tune the computed mapping.
switch (Opc) {
case TargetOpcode::G_SITOFP:
- case TargetOpcode::G_UITOFP: {
- OpRegBankIdx = {AArch64::PMI_FirstFPR, AArch64::PMI_FirstGPR};
+ case TargetOpcode::G_UITOFP:
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
break;
- }
case TargetOpcode::G_FPTOSI:
- case TargetOpcode::G_FPTOUI: {
- OpRegBankIdx = {AArch64::PMI_FirstGPR, AArch64::PMI_FirstFPR};
+ case TargetOpcode::G_FPTOUI:
+ OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};
break;
- }
- case TargetOpcode::G_FCMP: {
- OpRegBankIdx = {AArch64::PMI_FirstGPR,
- /* Predicate */ AArch64::PMI_None, AArch64::PMI_FirstFPR,
- AArch64::PMI_FirstFPR};
+ case TargetOpcode::G_FCMP:
+ OpRegBankIdx = {PMI_FirstGPR,
+ /* Predicate */ PMI_None, PMI_FirstFPR, PMI_FirstFPR};
break;
- }
- case TargetOpcode::G_BITCAST: {
+ case TargetOpcode::G_BITCAST:
// This is going to be a cross register bank copy and this is expensive.
if (OpRegBankIdx[0] != OpRegBankIdx[1])
- Cost =
- copyCost(*AArch64::PartMappings[OpRegBankIdx[0]].RegBank,
- *AArch64::PartMappings[OpRegBankIdx[1]].RegBank, OpSize[0]);
+ Cost = copyCost(
+ *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[0]].RegBank,
+ *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[1]].RegBank,
+ OpSize[0]);
break;
- }
- case TargetOpcode::G_LOAD: {
+ case TargetOpcode::G_LOAD:
// Loading in vector unit is slightly more expensive.
// This is actually only true for the LD1R and co instructions,
// but anyway for the fast mode this number does not matter and
// for the greedy mode the cost of the cross bank copy will
// offset this number.
// FIXME: Should be derived from the scheduling model.
- if (OpRegBankIdx[0] >= AArch64::PMI_FirstFPR)
+ if (OpRegBankIdx[0] != PMI_FirstGPR)
Cost = 2;
- }
+ else
+ // Check if that load feeds fp instructions.
+ // In that case, we want the default mapping to be on FPR
+ // instead of blind map every scalar to GPR.
+ for (const MachineInstr &UseMI :
+ MRI.use_instructions(MI.getOperand(0).getReg()))
+ // If we have at least one direct use in a FP instruction,
+ // assume this was a floating point load in the IR.
+ // If it was not, we would have had a bitcast before
+ // reaching that instruction.
+ if (isPreISelGenericFloatingPointOpcode(UseMI.getOpcode())) {
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ break;
+ }
+ break;
+ case TargetOpcode::G_STORE:
+ // Check if that store is fed by fp instructions.
+ if (OpRegBankIdx[0] == PMI_FirstGPR) {
+ unsigned VReg = MI.getOperand(0).getReg();
+ if (!VReg)
+ break;
+ MachineInstr *DefMI = MRI.getVRegDef(VReg);
+ if (isPreISelGenericFloatingPointOpcode(DefMI->getOpcode()))
+ OpRegBankIdx[0] = PMI_FirstFPR;
+ break;
+ }
}
// Finally construct the computed mapping.
- RegisterBankInfo::InstructionMapping Mapping =
- InstructionMapping{DefaultMappingID, Cost, nullptr, NumOperands};
SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
- for (unsigned Idx = 0; Idx < NumOperands; ++Idx)
- if (MI.getOperand(Idx).isReg())
- OpdsMapping[Idx] =
- AArch64::getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]);
+ for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+ if (MI.getOperand(Idx).isReg() && MI.getOperand(Idx).getReg()) {
+ auto Mapping = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]);
+ if (!Mapping->isValid())
+ return getInvalidInstructionMapping();
+
+ OpdsMapping[Idx] = Mapping;
+ }
+ }
- Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
- return Mapping;
+ return getInstructionMapping(DefaultMappingID, Cost,
+ getOperandsMapping(OpdsMapping), NumOperands);
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
index f763235..6d74a47 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
@@ -16,25 +16,78 @@
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#define GET_REGBANK_DECLARATIONS
+#include "AArch64GenRegisterBank.inc"
+
namespace llvm {
class TargetRegisterInfo;
-namespace AArch64 {
-enum {
- GPRRegBankID = 0, /// General Purpose Registers: W, X.
- FPRRegBankID = 1, /// Floating Point/Vector Registers: B, H, S, D, Q.
- CCRRegBankID = 2, /// Conditional register: NZCV.
- NumRegisterBanks
-};
+class AArch64GenRegisterBankInfo : public RegisterBankInfo {
+protected:
+
+ enum PartialMappingIdx {
+ PMI_None = -1,
+ PMI_FPR32 = 1,
+ PMI_FPR64,
+ PMI_FPR128,
+ PMI_FPR256,
+ PMI_FPR512,
+ PMI_GPR32,
+ PMI_GPR64,
+ PMI_FirstGPR = PMI_GPR32,
+ PMI_LastGPR = PMI_GPR64,
+ PMI_FirstFPR = PMI_FPR32,
+ PMI_LastFPR = PMI_FPR512,
+ PMI_Min = PMI_FirstFPR,
+ };
+
+ static RegisterBankInfo::PartialMapping PartMappings[];
+ static RegisterBankInfo::ValueMapping ValMappings[];
+ static PartialMappingIdx BankIDToCopyMapIdx[];
+
+ enum ValueMappingIdx {
+ InvalidIdx = 0,
+ First3OpsIdx = 1,
+ Last3OpsIdx = 19,
+ DistanceBetweenRegBanks = 3,
+ FirstCrossRegCpyIdx = 22,
+ LastCrossRegCpyIdx = 34,
+ DistanceBetweenCrossRegCpy = 2
+ };
+
+ static bool checkPartialMap(unsigned Idx, unsigned ValStartIdx,
+ unsigned ValLength, const RegisterBank &RB);
+ static bool checkValueMapImpl(unsigned Idx, unsigned FirstInBank,
+ unsigned Size, unsigned Offset);
+ static bool checkPartialMappingIdx(PartialMappingIdx FirstAlias,
+ PartialMappingIdx LastAlias,
+ ArrayRef<PartialMappingIdx> Order);
-extern RegisterBank GPRRegBank;
-extern RegisterBank FPRRegBank;
-extern RegisterBank CCRRegBank;
-} // End AArch64 namespace.
+ static unsigned getRegBankBaseIdxOffset(unsigned RBIdx, unsigned Size);
+
+ /// Get the pointer to the ValueMapping representing the RegisterBank
+ /// at \p RBIdx with a size of \p Size.
+ ///
+ /// The returned mapping works for instructions with the same kind of
+ /// operands for up to 3 operands.
+ ///
+ /// \pre \p RBIdx != PartialMappingIdx::None
+ static const RegisterBankInfo::ValueMapping *
+ getValueMapping(PartialMappingIdx RBIdx, unsigned Size);
+
+ /// Get the pointer to the ValueMapping of the operands of a copy
+ /// instruction from the \p SrcBankID register bank to the \p DstBankID
+ /// register bank with a size of \p Size.
+ static const RegisterBankInfo::ValueMapping *
+ getCopyMapping(unsigned DstBankID, unsigned SrcBankID, unsigned Size);
+
+#define GET_TARGET_REGBANK_CLASS
+#include "AArch64GenRegisterBank.inc"
+};
/// This class provides the information for the target register banks.
-class AArch64RegisterBankInfo final : public RegisterBankInfo {
+class AArch64RegisterBankInfo final : public AArch64GenRegisterBankInfo {
/// See RegisterBankInfo::applyMapping.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
@@ -45,8 +98,8 @@ class AArch64RegisterBankInfo final : public RegisterBankInfo {
///
/// \return An InstructionMappings with a statically allocated
/// OperandsMapping.
- static InstructionMapping
- getSameKindOfOperandsMapping(const MachineInstr &MI);
+ const InstructionMapping &
+ getSameKindOfOperandsMapping(const MachineInstr &MI) const;
public:
AArch64RegisterBankInfo(const TargetRegisterInfo &TRI);
@@ -60,7 +113,8 @@ public:
InstructionMappings
getInstrAlternativeMappings(const MachineInstr &MI) const override;
- InstructionMapping getInstrMapping(const MachineInstr &MI) const override;
+ const InstructionMapping &
+ getInstrMapping(const MachineInstr &MI) const override;
};
} // End llvm namespace.
#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td
new file mode 100644
index 0000000..c2b6c0b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterBanks.td
@@ -0,0 +1,20 @@
+//=- AArch64RegisterBank.td - Describe the AArch64 Banks -----*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+/// General Purpose Registers: W, X.
+def GPRRegBank : RegisterBank<"GPR", [GPR64all]>;
+
+/// Floating Point/Vector Registers: B, H, S, D, Q.
+def FPRRegBank : RegisterBank<"FPR", [QQQQ]>;
+
+/// Conditional register: NZCV.
+def CCRRegBank : RegisterBank<"CCR", [CCR]>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 98fad71..9f7dcb3 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -74,7 +74,7 @@ const uint32_t *
AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
if (CC == CallingConv::GHC)
- // This is academic becase all GHC calls are (supposed to be) tail calls
+ // This is academic because all GHC calls are (supposed to be) tail calls
return CSR_AArch64_NoRegs_RegMask;
if (CC == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_RegMask;
@@ -94,7 +94,7 @@ const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
if (TT.isOSDarwin())
return CSR_AArch64_TLS_Darwin_RegMask;
- assert(TT.isOSBinFormatELF() && "only expect Darwin or ELF TLS");
+ assert(TT.isOSBinFormatELF() && "Invalid target");
return CSR_AArch64_TLS_ELF_RegMask;
}
@@ -118,25 +118,17 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// FIXME: avoid re-calculating this every time.
BitVector Reserved(getNumRegs());
- markSuperRegs(Reserved, AArch64::SP);
- markSuperRegs(Reserved, AArch64::XZR);
markSuperRegs(Reserved, AArch64::WSP);
markSuperRegs(Reserved, AArch64::WZR);
- if (TFI->hasFP(MF) || TT.isOSDarwin()) {
- markSuperRegs(Reserved, AArch64::FP);
+ if (TFI->hasFP(MF) || TT.isOSDarwin())
markSuperRegs(Reserved, AArch64::W29);
- }
- if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) {
- markSuperRegs(Reserved, AArch64::X18); // Platform register
- markSuperRegs(Reserved, AArch64::W18);
- }
+ if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved())
+ markSuperRegs(Reserved, AArch64::W18); // Platform register
- if (hasBasePointer(MF)) {
- markSuperRegs(Reserved, AArch64::X19);
+ if (hasBasePointer(MF))
markSuperRegs(Reserved, AArch64::W19);
- }
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
@@ -175,7 +167,7 @@ bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const {
const TargetRegisterClass *
AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
unsigned Kind) const {
- return &AArch64::GPR64RegClass;
+ return &AArch64::GPR64spRegClass;
}
const TargetRegisterClass *
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
index 93ca079..18d000a 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -13,7 +13,7 @@
// ===---------------------------------------------------------------------===//
// The following definitions describe the simpler per-operand machine model.
-// This works with MachineScheduler. See MCSchedModel.h for details.
+// This works with MachineScheduler. See MCSchedule.h for details.
// Cortex-A53 machine model for scheduling and other instruction cost heuristics.
def CortexA53Model : SchedMachineModel {
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td
index 99c48d0..5d1608e 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -13,7 +13,7 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// The Cortex-A57 is a traditional superscaler microprocessor with a
+// The Cortex-A57 is a traditional superscalar microprocessor with a
// conservative 3-wide in-order stage for decode and dispatch. Combined with the
// much wider out-of-order issue stage, this produced a need to carefully
// schedule micro-ops so that all three decoded each cycle are successfully
@@ -162,7 +162,9 @@ def : InstRW<[A57Write_2cyc_1M], (instregex "BFM")>;
// Cryptography Extensions
// -----------------------------------------------------------------------------
-def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
+def A57ReadAES : SchedReadAdvance<3, [A57Write_3cyc_1W]>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^AES[DE]")>;
+def : InstRW<[A57Write_3cyc_1W, A57ReadAES], (instregex "^AESI?MC")>;
def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
index 19a6d6f..44fd94f 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -17,10 +17,98 @@
// instruction cost model.
def FalkorModel : SchedMachineModel {
- let IssueWidth = 4; // 4-wide issue for expanded uops.
+ let IssueWidth = 8; // 8 uops are dispatched per cycle.
let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer.
let LoopMicroOpBufferSize = 16;
let LoadLatency = 3; // Optimistic load latency.
let MispredictPenalty = 11; // Minimum branch misprediction penalty.
- let CompleteModel = 0;
+ let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Falkor.
+
+let SchedModel = FalkorModel in {
+
+ def FalkorUnitB : ProcResource<1>; // Branch
+ def FalkorUnitLD : ProcResource<1>; // Load pipe
+ def FalkorUnitSD : ProcResource<1>; // Store data
+ def FalkorUnitST : ProcResource<1>; // Store pipe
+ def FalkorUnitX : ProcResource<1>; // Complex arithmetic
+ def FalkorUnitY : ProcResource<1>; // Simple arithmetic
+ def FalkorUnitZ : ProcResource<1>; // Simple arithmetic
+
+ def FalkorUnitVSD : ProcResource<1>; // Vector store data
+ def FalkorUnitVX : ProcResource<1>; // Vector X-pipe
+ def FalkorUnitVY : ProcResource<1>; // Vector Y-pipe
+
+ def FalkorUnitGTOV : ProcResource<1>; // Scalar to Vector
+ def FalkorUnitVTOG : ProcResource<1>; // Vector to Scalar
+
+ // Define the resource groups.
+ def FalkorUnitXY : ProcResGroup<[FalkorUnitX, FalkorUnitY]>;
+ def FalkorUnitXYZ : ProcResGroup<[FalkorUnitX, FalkorUnitY, FalkorUnitZ]>;
+ def FalkorUnitXYZB : ProcResGroup<[FalkorUnitX, FalkorUnitY, FalkorUnitZ,
+ FalkorUnitB]>;
+ def FalkorUnitZB : ProcResGroup<[FalkorUnitZ, FalkorUnitB]>;
+ def FalkorUnitVXVY : ProcResGroup<[FalkorUnitVX, FalkorUnitVY]>;
+
+}
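(A brief note on the groups, illustration only: a write that names a ProcResGroup reserves any one available member pipe, so a single-uop ALU write on FalkorUnitXYZ may issue on X, Y, or Z. The def quoted below appears later in this patch, in AArch64SchedFalkorDetails.td, and is repeated here only to make the group semantics concrete:)

    def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; }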
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Falkor.
+
+let SchedModel = FalkorModel in {
+
+// These WriteRes entries are not used in the Falkor sched model.
+def : WriteRes<WriteImm, []> { let Unsupported = 1; }
+def : WriteRes<WriteI, []> { let Unsupported = 1; }
+def : WriteRes<WriteISReg, []> { let Unsupported = 1; }
+def : WriteRes<WriteIEReg, []> { let Unsupported = 1; }
+def : WriteRes<WriteExtr, []> { let Unsupported = 1; }
+def : WriteRes<WriteIS, []> { let Unsupported = 1; }
+def : WriteRes<WriteID32, []> { let Unsupported = 1; }
+def : WriteRes<WriteID64, []> { let Unsupported = 1; }
+def : WriteRes<WriteIM32, []> { let Unsupported = 1; }
+def : WriteRes<WriteIM64, []> { let Unsupported = 1; }
+def : WriteRes<WriteBr, []> { let Unsupported = 1; }
+def : WriteRes<WriteBrReg, []> { let Unsupported = 1; }
+def : WriteRes<WriteLD, []> { let Unsupported = 1; }
+def : WriteRes<WriteST, []> { let Unsupported = 1; }
+def : WriteRes<WriteSTP, []> { let Unsupported = 1; }
+def : WriteRes<WriteAdr, []> { let Unsupported = 1; }
+def : WriteRes<WriteLDIdx, []> { let Unsupported = 1; }
+def : WriteRes<WriteSTIdx, []> { let Unsupported = 1; }
+def : WriteRes<WriteF, []> { let Unsupported = 1; }
+def : WriteRes<WriteFCmp, []> { let Unsupported = 1; }
+def : WriteRes<WriteFCvt, []> { let Unsupported = 1; }
+def : WriteRes<WriteFCopy, []> { let Unsupported = 1; }
+def : WriteRes<WriteFImm, []> { let Unsupported = 1; }
+def : WriteRes<WriteFMul, []> { let Unsupported = 1; }
+def : WriteRes<WriteFDiv, []> { let Unsupported = 1; }
+def : WriteRes<WriteV, []> { let Unsupported = 1; }
+def : WriteRes<WriteVLD, []> { let Unsupported = 1; }
+def : WriteRes<WriteVST, []> { let Unsupported = 1; }
+def : WriteRes<WriteSys, []> { let Unsupported = 1; }
+def : WriteRes<WriteBarrier, []> { let Unsupported = 1; }
+def : WriteRes<WriteHint, []> { let Unsupported = 1; }
+def : WriteRes<WriteLDHi, []> { let Unsupported = 1; }
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// These ReadAdvance entries are not used in the Falkor sched model.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+// Detailed Refinements
+// -----------------------------------------------------------------------------
+include "AArch64SchedFalkorDetails.td"
+
}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
new file mode 100644
index 0000000..0aeb1f3
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -0,0 +1,1288 @@
+//==- AArch64SchedFalkorDetails.td - Falkor Scheduling Defs -*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the uop and latency details for the machine model for the
+// Qualcomm Falkor subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+// Contains all of the Falkor specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming convention is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+// Prefix: FalkorWr
+// MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD)
+// Latency: #cyc
+//
+// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued
+// down one Z pipe, six SD pipes, four VX pipes and the total latency is
+// six cycles.
+//
+// Contains all of the Falkor specific ReadAdvance types for forwarding logic.
+//
+// Contains all of the Falkor specific WriteVariant types for immediate zero
+// and LSLFast.
+//===----------------------------------------------------------------------===//
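(To make the naming convention concrete, here is a sketch, written in the style of the defs that follow rather than taken from the patch, of what the FalkorWr_1Z_6SD_4VX_6cyc name cited above would expand to: each unit is listed once per micro-op, NumMicroOps gives the total, and Latency carries the cycle count from the name:)

    def FalkorWr_1Z_6SD_4VX_6cyc : SchedWriteRes<[FalkorUnitZ,
                                                  FalkorUnitSD, FalkorUnitSD,
                                                  FalkorUnitSD, FalkorUnitSD,
                                                  FalkorUnitSD, FalkorUnitSD,
                                                  FalkorUnitVX, FalkorUnitVX,
                                                  FalkorUnitVX, FalkorUnitVX]> {
      let Latency     = 6;   // total latency: six cycles
      let NumMicroOps = 11;  // 1 Z + 6 SD + 4 VX micro-ops
    }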
+
+//===----------------------------------------------------------------------===//
+// Define 0 micro-op types
+def FalkorWr_LdStInc_none_3cyc : SchedWriteRes<[]> {
+ let Latency = 3;
+ let NumMicroOps = 0;
+}
+def FalkorWr_none_3cyc : SchedWriteRes<[]> {
+ let Latency = 3;
+ let NumMicroOps = 0;
+}
+def FalkorWr_none_4cyc : SchedWriteRes<[]> {
+ let Latency = 4;
+ let NumMicroOps = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 1 micro-op types
+
+def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; }
+def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
+def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; }
+def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; }
+def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; }
+def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; }
+def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; }
+def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; }
+def FalkorWr_1XYZ_0cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 0; }
+def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; }
+def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; }
+def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; }
+def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; }
+def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; }
+
+def FalkorWr_1VXVY_0cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 0; }
+def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; }
+def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; }
+def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; }
+def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
+def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
+def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
+def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
+def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
+def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
+
+def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; }
+def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; }
+def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; }
+
+def FalkorWr_1GTOV_0cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 0; }
+def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; }
+def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; }
+def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Define 2 micro-op types
+
+def FalkorWr_2VXVY_0cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_1XYZ_1LD_4cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_2LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_4cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_12cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_14cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 14;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_21cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 21;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_2GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_2XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2, 8];
+}
+
+def FalkorWr_1X_1Z_11cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2, 11];
+}
+
+def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+
+def FalkorWr_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitVSD, FalkorUnitST]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 3 micro-op types
+
+def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD,
+ FalkorUnitLD]> {
+ let Latency = 0;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD,
+ FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_2LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitZ]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+
+def FalkorWr_1XYZ_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitSD, FalkorUnitST]> {
+ let Latency = 0;
+ let NumMicroOps = 3;
+}
+def FalkorWr_1XYZ_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitVSD, FalkorUnitST]> {
+ let Latency = 0;
+ let NumMicroOps = 3;
+}
+//===----------------------------------------------------------------------===//
+// Define 4 micro-op types
+
+def FalkorWr_2VX_2VY_14cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2VX_2VY_20cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 20;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2VX_2VY_21cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 21;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2VX_2VY_24cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+ FalkorUnitVX, FalkorUnitVY]> {
+ let Latency = 24;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_4VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 2;
+ let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_4LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2LD_1ST_1SD_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitST,
+ FalkorUnitSD, FalkorUnitLD]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+}
+
+def FalkorWr_2VSD_2ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 5 micro-op types
+
+def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+}
+def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 5;
+}
+def FalkorWr_5VXVY_7cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitVXVY]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+}
+def FalkorWr_1XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitST,
+ FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 5;
+}
+def FalkorWr_1VXVY_2ST_2VSD_0cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitST,
+ FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 5;
+}
+//===----------------------------------------------------------------------===//
+// Define 6 micro-op types
+
+def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 6;
+}
+
+def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitXYZ,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 6;
+}
+
+def FalkorWr_2VXVY_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitVXVY,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 6;
+}
+
+def FalkorWr_3VSD_3ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 6;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 8 micro-op types
+
+def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY,
+ FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 8;
+}
+
+def FalkorWr_4VSD_4ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 9 micro-op types
+
+def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitXYZ,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 9;
+}
+
+def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
+ FalkorUnitLD, FalkorUnitVXVY,
+ FalkorUnitVXVY, FalkorUnitXYZ,
+ FalkorUnitLD, FalkorUnitLD,
+ FalkorUnitVXVY, FalkorUnitVXVY]> {
+ let Latency = 4;
+ let NumMicroOps = 9;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 10 micro-op types
+
+def FalkorWr_2VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitVXVY,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 10;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 12 micro-op types
+
+def FalkorWr_4VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitVXVY,
+ FalkorUnitST, FalkorUnitVSD,
+ FalkorUnitVXVY, FalkorUnitST,
+ FalkorUnitVSD, FalkorUnitVXVY,
+ FalkorUnitST, FalkorUnitVSD]> {
+ let Latency = 0;
+ let NumMicroOps = 12;
+}
+
+// Forwarding logic is modeled for multiply add/accumulate and
+// load/store base register increment.
+// -----------------------------------------------------------------------------
+def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>;
+def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>;
+def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>;
+def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>;
+def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>;
+
+def FalkorReadIncLd : SchedReadAdvance<2, [FalkorWr_LdStInc_none_3cyc]>;
+def FalkorReadIncSt : SchedReadAdvance<1, [FalkorWr_LdStInc_none_3cyc]>;
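(For context, a reading of the defs above rather than new patch content: a SchedReadAdvance<N, [W...]> lowers the effective latency of the listed writes by N cycles for the operand it is attached to, and these reads are applied positionally in the InstRW operand lists further down. For example, the 32-bit multiply-accumulate entry later in this file keeps ReadDefault for the two multiplicands and applies the three-cycle accumulator forwarding only to the addend:)

    def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32],
                 (instregex "^M(ADD|SUB)Wrrr$")>;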
+
+// SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast
+// -----------------------------------------------------------------------------
+def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).isImm() &&
+ MI->getOperand(1).getImm() == 0}]>;
+def FalkorOp1ZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR ||
+                                      MI->getOperand(1).getReg() == AArch64::XZR}]>;
+def FalkorShiftExtFastPred : SchedPredicate<[{TII->isFalkorShiftExtFast(*MI)}]>;
+
+def FalkorWr_FMOV : SchedWriteVariant<[
+ SchedVar<FalkorOp1ZrReg, [FalkorWr_1none_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1GTOV_1cyc]>]>;
+
+def FalkorWr_MOVZ : SchedWriteVariant<[
+ SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZB_0cyc]>]>; // imm fwd
+
+def FalkorWr_ADDSUBsx : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1XYZ_1cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_2XYZ_2cyc]>]>;
+
+def FalkorWr_LDRro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1LD_3cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_4cyc]>]>;
+
+def FalkorWr_LDRSro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1LD_4cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1LD_5cyc]>]>;
+
+def FalkorWr_ORRi : SchedWriteVariant<[
+ SchedVar<FalkorOp1ZrReg, [FalkorWr_1XYZ_0cyc]>, // imm fwd
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1cyc]>]>;
+
+def FalkorWr_PRFMro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1ST_3cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1ST_4cyc]>]>;
+
+def FalkorWr_STRVro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1VSD_1ST_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1VSD_1ST_0cyc]>]>;
+
+def FalkorWr_STRQro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1XYZ_2ST_2VSD_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_2XYZ_2ST_2VSD_0cyc]>]>;
+
+def FalkorWr_STRro : SchedWriteVariant<[
+ SchedVar<FalkorShiftExtFastPred, [FalkorWr_1SD_1ST_0cyc]>,
+ SchedVar<NoSchedPred, [FalkorWr_1XYZ_1SD_1ST_0cyc]>]>;
+
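(How these variants get used, shown with entries that appear later in this file and are repeated here only to illustrate the selection mechanism: an InstRW simply names the variant, and the SchedPredicate picks the concrete SchedWriteRes per machine instruction. An ORR-immediate whose first source is WZR/XZR is therefore modeled with zero latency via immediate forwarding, while an ADD/SUB with a shifted or extended register costs one uop when the shift is classified as fast and two uops otherwise:)

    def : InstRW<[FalkorWr_ORRi],     (instregex "^ORR(W|X)ri$")>;
    def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^ADD(S)?(W|X)r(s|x|x64)$")>;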
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the model is refined, this will override most
+// of the earlier mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+// FIXME: This could be better modeled by looking at the regclasses of the operands.
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instrs COPY)>;
+
+// SIMD Floating-point Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)v2f32$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v2i32p)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FAC(GE|GT)(32|64)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|GE|GT)(32|64|v2f32|v2i32)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64|v2i32)rz$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)v2f32$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^F(MAX|MIN)(NM)?Vv4i32v$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FABD|FADD|FSUB)v2f32$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FADDP(v2i32p|v2i64p|v2f32)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v2f32)$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)v2i32(_shift)?$")>;
+
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc],
+ (instregex "^(FMUL|FMULX)(v2f32|(v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc],
+ (instrs FMULX32)>;
+
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
+ (instregex "^(FMUL|FMULX)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
+ (instrs FMULX64)>;
+
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v2i64p)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instrs FCVTLv4i16, FCVTLv2i32)>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)$")>;
+
+def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instrs FDIVv2f32)>;
+def : InstRW<[FalkorWr_1VX_1VY_12cyc],(instrs FSQRTv2f32)>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32)$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs FCVTLv8i16, FCVTLv4i32)>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32)(_shift)?$")>;
+
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc],
+ (instregex "^(FMUL|FMULX)(v2f64|v4f32|v4i32_indexed)$")>;
+
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc],
+ (instregex "^(FMUL|FMULX)v2i64_indexed$")>;
+
+def : InstRW<[FalkorWr_3VXVY_4cyc], (instrs FCVTNv4i16, FCVTNv2i32, FCVTXNv2f32)>;
+def : InstRW<[FalkorWr_3VXVY_5cyc], (instrs FCVTNv8i16, FCVTNv4i32, FCVTXNv4f32)>;
+
+def : InstRW<[FalkorWr_2VX_2VY_14cyc],(instrs FDIVv2f64)>;
+def : InstRW<[FalkorWr_2VX_2VY_20cyc],(instrs FDIVv4f32)>;
+def : InstRW<[FalkorWr_2VX_2VY_21cyc],(instrs FSQRTv2f64)>;
+def : InstRW<[FalkorWr_2VX_2VY_24cyc],(instrs FSQRTv4f32)>;
+
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA],
+ (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
+ (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32],
+ (instregex "^FML(A|S)(v2f32|(v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64],
+ (instregex "^FML(A|S)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32],
+ (instregex "^FML(A|S)(v4f32|v4i32_indexed)$")>;
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64],
+ (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>;
+
+// SIMD Integer Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^ADD(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs ADDPv2i64p)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(AND|ORR|ORN|BIC|EOR)v8i8$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIC|ORR)(v2i32|v4i16)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^NEG(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^SUB(v1i64|v2i32|v4i16|v8i8)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v2i32|v4i16|v8i8)(_v.*)?$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHLv1i64$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHR(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHRd$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS|ADDP|CM(EQ|GE|HS|GT|HI))(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v1i64|v2i32|v4i16|v8i8)rz$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CMTST(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs PMULv8i8)>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHL(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHLd$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)(ABD|ADALP)(v8i8|v4i16|v2i32)(_v.*)?$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)ADDLVv4i16v$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QSHLU?(d|s|h|b|(v8i8|v4i16|v2i32)_shift)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL)(v1i8|v1i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN)(s|h|b)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QSUB(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RHADD(v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHR(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHRd$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^R?SHRN(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)?(MAX|MIN)V(v4i16v|v4i32v)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs ADDVv4i16v)>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^S(L|R)I(d|(v8i8|v4i16|v2i32)_shift)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQABS(v1i8|v1i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc],
+ (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc],
+ (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc],
+ (instregex "^SQDMULL(i16|i32)$")>;
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA],
+ (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs ADDVv4i32v)>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs ADDVv8i16v)>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(ADD|SUB)HNv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)ABA(v2i32|v4i16|v8i8)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_5cyc], (instrs ADDVv16i8v)>;
+
+def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32)_shift?$")>;
+def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^R(ADD|SUB)HNv.*$")>;
+
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^ADD(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs ADDPv2i64)>; // sz==11
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(AND|ORR|ORN|BIC|EOR)v16i8$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIC|ORR)(v8i16|v4i32)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(NEG|SUB)(v16i8|v8i16|v4i32|v2i64)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)ADDLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v16i8|v2i64|v4i32|v8i16)(_v.*)?$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SHR(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(S|U)SUBLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS)(v16i8|v2i64|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^ADDP(v4i32|v8i16|v16i8)$")>; // sz!=11
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v16i8|v2i64|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v16i8|v2i64|v4i32|v8i16)rz$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CMTST|PMUL)(v16i8|v2i64|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^PMULL(v8i8|v16i8)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHL(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)R?SRA(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)ABD(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)ABDLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(ADALP|QADD)(v16i8|v8i16|v4i32|v2i64)(_v.*)?$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)QSHLU?(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)RSHR(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^R?SHRN(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL(v1i64|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>;
+
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc],
+ (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc],
+ (instregex "^SQDMULLv.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
+ (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>;
+
+def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(S|U)ADDLVv8i16v$")>;
+
+def : InstRW<[FalkorWr_3VXVY_6cyc], (instregex "^(S|U)ADDLVv16i8v$")>;
+
+def : InstRW<[FalkorWr_4VXVY_2cyc], (instregex "^(S|U)(ADD|SUB)Wv.*$")>;
+
+def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>;
+
+def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>;
+
+def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA],
+ (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
+ (instregex "^SQD(MLAL|MLSL)v[248].*$")>;
+
+// SIMD Load Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd], (instrs LD2i64)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instrs LD2i64_POST)>;
+
+def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd], (instregex "^LD1i(8|16|32)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD1i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instregex "^LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD3i64)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instrs LD3i64_POST)>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorReadIncLd], (instrs LD4i64)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorReadIncLd],
+ (instrs LD4i64_POST)>;
+
+def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd], (instregex "^LD2i(8|16|32)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD2i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_1none_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_1none_3cyc, FalkorReadIncLd],
+ (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instrs LD3Threev2d)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+ (instrs LD3Threev2d_POST)>;
+def : InstRW<[FalkorWr_3LD_3cyc, FalkorReadIncLd], (instregex "^LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_3LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd], (instregex "^LD3i(8|16|32)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD3i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_2none_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2none_3cyc, FalkorReadIncLd],
+ (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instrs LD4Fourv2d)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+ (instrs LD4Fourv2d_POST)>;
+def : InstRW<[FalkorWr_4LD_3cyc, FalkorReadIncLd], (instregex "^LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_4LD_3cyc, FalkorReadIncLd],
+ (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd], (instregex "^LD4i(8|16|32)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD4i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd],
+ (instregex "^LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1none_4cyc, FalkorReadIncLd],
+ (instregex "^LD3Threev(8b|4h|2s)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd],
+ (instregex "^LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2none_4cyc, FalkorReadIncLd],
+ (instregex "^LD4Fourv(8b|4h|2s)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD3Threev(16b|8h|4s)$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD4Fourv(16b|8h|4s)$")>;
+
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD3Threev(16b|8h|4s)_POST$")>;
+
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, FalkorReadIncLd],
+ (instregex "^LD4Fourv(16b|8h|4s)_POST$")>;
+
+// Arithmetic and Logical Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CCMN|CCMP)(W|X)(r|i)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADC(S)?(W|X)r$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADD(S)?(W|X)r(r|i)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CSEL|CSINC|CSINV|CSNEG)(W|X)r$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^AND(S)?(W|X)r(i|r|s)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^BIC(S)?(W|X)r(r|s)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EON(W|X)r(r|s)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EOR(W|X)r(i|r|s)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORN(W|X)r(r|s)$")>;
+def : InstRW<[FalkorWr_ORRi], (instregex "^ORR(W|X)ri$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORR(W|X)r(r|s)$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SBC(S)?(W|X)r$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SUB(S)?(W|X)r(r|i)$")>;
+def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^ADD(S)?(W|X)r(s|x|x64)$")>;
+def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^SUB(S)?(W|X)r(s|x|x64)$")>;
+
+// SIMD Miscellaneous Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^DUP(v8i8|v4i16|v2i32)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>;
+def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>;
+def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs NOTv8i8)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^REV(16|32|64)v.*$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v2i32|v4i16|v8i8)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "(S|U)QXTU?Nv.*$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64, FRECPEv2f32, FRSQRTEv2f32)>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>;
+
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc],
+ (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>;
+
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
+ (instrs FRECPS64, FRSQRTS64)>;
+
+def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],
+ (instregex "^INSv(i32|i64)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>;
+def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs TBLv16i8One)>;
+
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>;
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>;
+
+def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc],
+ (instrs FRECPSv4f32, FRSQRTSv4f32)>;
+
+def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc],
+ (instrs FRECPSv2f64, FRSQRTSv2f64)>;
+
+def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>;
+def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>;
+
+def : InstRW<[FalkorWr_4VXVY_6cyc], (instregex "^TBL(v8i8Four|v16i8Three)$")>;
+def : InstRW<[FalkorWr_4VXVY_6cyc], (instregex "^TBX(v8i8Three|v16i8Three)$")>;
+
+def : InstRW<[FalkorWr_5VXVY_7cyc], (instrs TBLv16i8Four)>;
+def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>;
+
+// SIMD Store Instructions
+// -----------------------------------------------------------------------------
+
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(Q|D|S|H|B)ui$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(Q|D|S|H|B)(post|pre)$")>;
+def : InstRW<[FalkorWr_STRVro, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(D|S|H|B)ro(W|X)$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STPQi$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STPQ(post|pre)$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STP(D|S)(i)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STP(D|S)(post|pre)$")>;
+def : InstRW<[FalkorWr_STRQro, ReadDefault, FalkorReadIncSt],
+ (instregex "^STRQro(W|X)$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STUR(Q|D|S|B|H)i$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instrs STNPDi, STNPSi)>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instrs STNPQi)>;
+
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1(One(v8b|v4h|v2s|v1d)_POST|(i8|i16|i32|i64)_POST)$")>;
+def : InstRW<[FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>;
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST2(Two(v8b|v4h|v2s)|(i8|i16|i32|i64))_POST$")>;
+
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3(i8|i16|i32|i64)$")>;
+def : InstRW<[FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4(i8|i16|i32|i64)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3(i8|i16|i32|i64)_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4(i8|i16|i32|i64)_POST$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3Three(v8b|v4h|v2s)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3Three(v8b|v4h|v2s)_POST$")>;
+
+def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instrs ST3Threev2d)>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instrs ST3Threev2d_POST)>;
+
+def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4Four(v8b|v4h|v2s)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4Four(v8b|v4h|v2s)_POST$")>;
+
+def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instrs ST4Fourv2d)>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instrs ST4Fourv2d_POST)>;
+
+def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3Three(v16b|v8h|v4s)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST3Three(v16b|v8h|v4s)_POST$")>;
+
+def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4Four(v16b|v8h|v4s)$")>;
+// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case).
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^ST4Four(v16b|v8h|v4s)_POST$")>;
+
+// Branch Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1none_0cyc], (instrs B, TCRETURNdi)>;
+def : InstRW<[FalkorWr_1Z_0cyc], (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ)(W|X))$")>;
+def : InstRW<[FalkorWr_1Z_0cyc], (instrs RET_ReallyLR, TCRETURNri)>;
+def : InstRW<[FalkorWr_1ZB_0cyc], (instrs Bcc)>;
+def : InstRW<[FalkorWr_1XYZB_0cyc], (instrs BL)>;
+def : InstRW<[FalkorWr_1Z_1XY_0cyc], (instrs BLR)>;
+
+// Cryptography Extensions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs SHA1Hrr)>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs AESIMCrr, AESMCrr)>;
+def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs AESDrr, AESErr)>;
+def : InstRW<[FalkorWr_2VXVY_2cyc], (instrs SHA1SU0rrr, SHA1SU1rr, SHA256SU0rr)>;
+def : InstRW<[FalkorWr_1VX_1VY_4cyc], (instregex "^SHA1(C|M|P)rrr$")>;
+def : InstRW<[FalkorWr_1VX_1VY_5cyc], (instrs SHA256H2rrr, SHA256Hrrr)>;
+def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>;
+
+// FP Load Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDUR(Q|D|S|H|B)i$")>;
+def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd],
+ (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instrs LDNPQi)>;
+def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instrs LDPQi)>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "LDNP(D|S)i$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "LDP(D|S)i$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "LDP(D|S)(pre|post)$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "^LDPQ(pre|post)$")>;
+
+// FP Data Processing Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCCMP(E)?(S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCMP(E)?(S|D)r(r|i)$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVT(A|M|N|P|Z)(S|U)U(W|X)(S|D)r$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(S|D)r$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCSEL(S|D)rrr$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?(S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?Pv2i(32|64)p$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs FCVTSHr, FCVTDHr)>;
+def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(S|D)r$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FABD(32|64)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FADD|FSUB)(S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTHSr, FCVTHDr)>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>;
+
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc],
+ (instregex "^F(N)?MULSrr$")>;
+
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
+ (instregex "^F(N)?MULDrr$")>;
+
+def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instrs FDIVSrr)>;
+def : InstRW<[FalkorWr_1VX_1VY_14cyc],(instrs FDIVDrr)>;
+def : InstRW<[FalkorWr_1VX_1VY_12cyc],(instrs FSQRTSr)>;
+def : InstRW<[FalkorWr_1VX_1VY_21cyc],(instrs FSQRTDr)>;
+
+def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32],
+ (instregex "^F(N)?M(ADD|SUB)Srrr$")>;
+def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64],
+ (instregex "^F(N)?M(ADD|SUB)Drrr$")>;
+
+// FP Miscellaneous Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(WS|XD|XDHigh)r$")>;
+def : InstRW<[FalkorWr_1GTOV_0cyc], (instregex "^FMOV(S|D)i$")>; // imm fwd
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)S(W|X)(D|S)ri$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(d|s)$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(SW|DX|DXHigh)r$")>;
+def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "^FMOV(Sr|Dr|v.*_ns)$")>; // imm fwd
+// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov wzr/xzr
+def : InstRW<[FalkorWr_2VXVY_0cyc], (instrs FMOVD0, FMOVS0)>; // imm fwd
+
+def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>;
+
+// Load Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>;
+def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFUMi)>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "^LDNP(W|X)i$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "^LDP(W|X)i$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "^LDP(W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDR(BB|HH|W|X)ui$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDR(BB|HH|W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_LDRro, FalkorReadIncLd],
+ (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDR(W|X)l$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDTR(B|H|W|X)i$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^LDUR(BB|HH|W|X)i$")>;
+def : InstRW<[FalkorWr_PRFMro], (instregex "^PRFMro(W|X)$")>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd],
+ (instrs LDPSWi)>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc, FalkorReadIncLd],
+ (instregex "^LDPSW(post|pre)$")>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>;
+def : InstRW<[FalkorWr_LDRSro, FalkorReadIncLd],
+ (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instrs LDRSWl)>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>;
+def : InstRW<[FalkorWr_1LD_4cyc, FalkorReadIncLd],
+ (instregex "^LDURS(BW|BX|HW|HX|W)i$")>;
+
+// Miscellaneous Data-Processing Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(S|U)?BFM(W|X)ri$")>;
+def : InstRW<[FalkorWr_1X_2cyc], (instregex "^CRC32.*$")>;
+def : InstRW<[FalkorWr_1XYZ_2cyc], (instregex "^(CLS|CLZ|RBIT|REV|REV16|REV32)(W|X)r$")>;
+def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>;
+
+// Divide and Multiply Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64],
+ (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32],
+ (instregex "^M(ADD|SUB)Wrrr$")>;
+
+def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>;
+def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64],
+ (instregex "^M(ADD|SUB)Xrrr$")>;
+
+def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>;
+def : InstRW<[FalkorWr_1X_1Z_11cyc], (instregex "^(S|U)DIVXr$")>;
+
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc],
+ (instregex "^(S|U)MULLv.*$")>;
+def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA],
+ (instregex "^(S|U)(MLAL|MLSL)v.*$")>;
+
+// Move and Shift Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV)(W|X)r$")>;
+def : InstRW<[FalkorWr_1XYZ_0cyc], (instregex "^MOVK(W|X)i$")>; // imm fwd
+def : InstRW<[FalkorWr_1XYZB_0cyc], (instregex "^ADRP?$")>; // imm fwd
+def : InstRW<[FalkorWr_1XYZB_0cyc], (instregex "^MOVN(W|X)i$")>; // imm fwd
+def : InstRW<[FalkorWr_MOVZ], (instregex "^MOVZ(W|X)i$")>;
+def : InstRW<[FalkorWr_1XYZ_0cyc], (instrs MOVi32imm, MOVi64imm)>; // imm fwd (approximation)
+def : InstRW<[WriteSequence<[FalkorWr_1XYZ_1cyc, FalkorWr_1XYZ_1cyc]>],
+ (instrs MOVaddr, MOVaddrBA, MOVaddrCP, MOVaddrEXT, MOVaddrJT, MOVaddrTLS)>;
+def : InstRW<[WriteSequence<[FalkorWr_1LD_3cyc, FalkorWr_1XYZ_1cyc]>],
+ (instrs LOADgot)>;
+
+// Other Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1LD_0cyc], (instrs CLREX, DMB, DSB)>;
+def : InstRW<[FalkorWr_1none_0cyc], (instrs BRK, DCPS1, DCPS2, DCPS3, HINT, HLT, HVC, ISB, SMC, SVC)>;
+def : InstRW<[FalkorWr_1ST_0cyc], (instrs SYSxt, SYSLxt)>;
+def : InstRW<[FalkorWr_1Z_0cyc], (instrs MSRpstateImm1, MSRpstateImm4)>;
+
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorReadIncLd],
+ (instregex "^(LDAR(B|H|W|X)|LDAXR(B|H|W|X)|LDXR(B|H|W|X))$")>;
+def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc, FalkorReadIncLd],
+ (instregex "^(LDAXP(W|X)|LDXP(W|X))$")>;
+def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS, MOVbaseTLS)>;
+
+def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>;
+
+def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instrs STNPWi, STNPXi)>;
+def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>;
+
+def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>;
+def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STLR(B|H|W|X)$")>;
+def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STXP(W|X)$")>;
+def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STXR(B|H|W|X)$")>;
+
+def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STLXP(W|X)$")>;
+def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STLXR(B|H|W|X)$")>;
+
+// Store Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STP(W|X)i$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, ReadDefault, FalkorReadIncSt],
+ (instregex "^STP(W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(BB|HH|W|X)ui$")>;
+def : InstRW<[FalkorWr_LdStInc_none_3cyc, FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(BB|HH|W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_STRro, ReadDefault, FalkorReadIncSt],
+ (instregex "^STR(BB|HH|W|X)ro(W|X)$")>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STTR(B|H|W|X)i$")>;
+def : InstRW<[FalkorWr_1SD_1ST_0cyc, ReadDefault, FalkorReadIncSt],
+ (instregex "^STUR(BB|HH|W|X)i$")>;
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
index 426ae61..cf4cdab 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
@@ -776,23 +776,29 @@ def KryoWrite_4cyc_X_X_115ln :
}
def : InstRW<[KryoWrite_4cyc_X_X_115ln],
(instregex "FCVTZ(S|U)(v2f64|v4f32|(v2i64|v4i32)(_shift)?)$")>;
-def KryoWrite_1cyc_XA_Y_noRSV_43ln :
+def KryoWrite_10cyc_XA_Y_noRSV_43ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 3;
+ let Latency = 10; let NumMicroOps = 3;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_43ln],
- (instrs FDIVDrr, FDIVSrr)>;
-def KryoWrite_1cyc_XA_Y_noRSV_121ln :
+def : InstRW<[KryoWrite_10cyc_XA_Y_noRSV_43ln],
+ (instrs FDIVSrr)>;
+def KryoWrite_14cyc_XA_Y_noRSV_43ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 3;
+ let Latency = 14; let NumMicroOps = 3;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_121ln],
+def : InstRW<[KryoWrite_14cyc_XA_Y_noRSV_43ln],
+ (instrs FDIVDrr)>;
+def KryoWrite_10cyc_XA_Y_noRSV_121ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 10; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_10cyc_XA_Y_noRSV_121ln],
(instrs FDIVv2f32)>;
-def KryoWrite_1cyc_XA_Y_XA_Y_123ln :
+def KryoWrite_14cyc_XA_Y_XA_Y_123ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 4;
+ let Latency = 14; let NumMicroOps = 4;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_123ln],
+def : InstRW<[KryoWrite_14cyc_XA_Y_XA_Y_123ln],
(instrs FDIVv2f64, FDIVv4f32)>;
def KryoWrite_5cyc_X_noRSV_55ln :
SchedWriteRes<[KryoUnitX]> {
@@ -968,24 +974,36 @@ def KryoWrite_2cyc_XY_XY_109ln :
}
def : InstRW<[KryoWrite_2cyc_XY_XY_109ln],
(instregex "FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)")>;
-def KryoWrite_1cyc_XA_Y_noRSV_42ln :
+def KryoWrite_12cyc_XA_Y_noRSV_42ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 3;
+ let Latency = 12; let NumMicroOps = 3;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_42ln],
- (instregex "FSQRT(S|D)r")>;
-def KryoWrite_1cyc_XA_Y_noRSV_120ln :
+def : InstRW<[KryoWrite_12cyc_XA_Y_noRSV_42ln],
+ (instrs FSQRTSr)>;
+def KryoWrite_21cyc_XA_Y_noRSV_42ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 3;
+ let Latency = 21; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_21cyc_XA_Y_noRSV_42ln],
+ (instrs FSQRTDr)>;
+def KryoWrite_12cyc_XA_Y_noRSV_120ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY]> {
+ let Latency = 12; let NumMicroOps = 3;
+}
+def : InstRW<[KryoWrite_12cyc_XA_Y_noRSV_120ln],
+ (instrs FSQRTv2f32)>;
+def KryoWrite_21cyc_XA_Y_XA_Y_122ln :
+ SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
+ let Latency = 21; let NumMicroOps = 4;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_120ln],
- (instregex "FSQRTv2f32")>;
-def KryoWrite_1cyc_XA_Y_XA_Y_122ln :
+def : InstRW<[KryoWrite_21cyc_XA_Y_XA_Y_122ln],
+ (instrs FSQRTv4f32)>;
+def KryoWrite_36cyc_XA_Y_XA_Y_122ln :
SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> {
- let Latency = 1; let NumMicroOps = 4;
+ let Latency = 36; let NumMicroOps = 4;
}
-def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_122ln],
- (instregex "FSQRT(v2f64|v4f32)")>;
+def : InstRW<[KryoWrite_36cyc_XA_Y_XA_Y_122ln],
+ (instrs FSQRTv2f64)>;
def KryoWrite_1cyc_X_201ln :
SchedWriteRes<[KryoUnitX]> {
let Latency = 1; let NumMicroOps = 1;
@@ -1356,7 +1374,9 @@ def KryoWrite_3cyc_LS_LS_400ln :
let Latency = 3; let NumMicroOps = 2;
}
def : InstRW<[KryoWrite_3cyc_LS_LS_400ln],
- (instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>;
+ (instregex "LDAX?R(B|H|W|X)")>;
+def : InstRW<[KryoWrite_3cyc_LS_LS_400ln, WriteLDHi],
+ (instregex "LDAXP(W|X)")>;
def KryoWrite_3cyc_LS_LS_401ln :
SchedWriteRes<[KryoUnitLS, KryoUnitLS]> {
let Latency = 3; let NumMicroOps = 2;
@@ -1547,7 +1567,7 @@ def KryoWrite_3cyc_LS_258ln :
SchedWriteRes<[KryoUnitLS]> {
let Latency = 3; let NumMicroOps = 1;
}
-def : InstRW<[KryoWrite_3cyc_LS_258ln],
+def : InstRW<[KryoWrite_3cyc_LS_258ln, WriteLDHi],
(instregex "LDXP(W|X)")>;
def KryoWrite_3cyc_LS_258_1ln :
SchedWriteRes<[KryoUnitLS]> {
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
index 14d6891..3b71cf8 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedM1.td
@@ -23,7 +23,7 @@ def ExynosM1Model : SchedMachineModel {
let LoopMicroOpBufferSize = 24; // Based on the instruction queue size.
let LoadLatency = 4; // Optimistic load cases.
let MispredictPenalty = 14; // Minimum branch misprediction penalty.
- let CompleteModel = 0; // Use the default model otherwise.
+ let CompleteModel = 1; // Use the default model otherwise.
}
//===----------------------------------------------------------------------===//
@@ -72,14 +72,14 @@ def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; }
def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; }
def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; }
-def M1WriteLA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5,
+def M1WriteLX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteL5,
M1WriteA1]>,
SchedVar<NoSchedPred, [M1WriteL5]>]>;
def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; }
def M1WriteS2 : SchedWriteRes<[M1UnitS]> { let Latency = 2; }
def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; }
-def M1WriteSA : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2,
+def M1WriteSX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteS2,
M1WriteA1]>,
SchedVar<NoSchedPred, [M1WriteS1]>]>;
@@ -125,13 +125,13 @@ def : WriteRes<WriteAdr, []> { let Latency = 0; }
// Load instructions.
def : WriteRes<WriteLD, [M1UnitL]> { let Latency = 4; }
def : WriteRes<WriteLDHi, [M1UnitALU]> { let Latency = 4; }
-def : SchedAlias<WriteLDIdx, M1WriteLA>;
+def : SchedAlias<WriteLDIdx, M1WriteLX>;
// Store instructions.
def : WriteRes<WriteST, [M1UnitS]> { let Latency = 1; }
def : WriteRes<WriteSTP, [M1UnitS]> { let Latency = 1; }
def : WriteRes<WriteSTX, [M1UnitS]> { let Latency = 1; }
-def : SchedAlias<WriteSTIdx, M1WriteSA>;
+def : SchedAlias<WriteSTIdx, M1WriteSX>;
// FP data instructions.
def : WriteRes<WriteF, [M1UnitFADD]> { let Latency = 3; }
@@ -231,6 +231,111 @@ def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; }
def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; }
def M1WriteTB : SchedWriteRes<[M1UnitC,
M1UnitALU]> { let Latency = 2; }
+def M1WriteVLDA : SchedWriteRes<[M1UnitL,
+ M1UnitL]> { let Latency = 6; }
+def M1WriteVLDB : SchedWriteRes<[M1UnitL,
+ M1UnitL,
+ M1UnitL]> { let Latency = 7; }
+def M1WriteVLDC : SchedWriteRes<[M1UnitL,
+ M1UnitL,
+ M1UnitL,
+ M1UnitL]> { let Latency = 8; }
+def M1WriteVLDD : SchedWriteRes<[M1UnitL,
+ M1UnitNALU]> { let Latency = 7;
+ let ResourceCycles = [2]; }
+def M1WriteVLDE : SchedWriteRes<[M1UnitL,
+ M1UnitNALU]> { let Latency = 6; }
+def M1WriteVLDF : SchedWriteRes<[M1UnitL,
+ M1UnitL]> { let Latency = 10;
+ let ResourceCycles = [5]; }
+def M1WriteVLDG : SchedWriteRes<[M1UnitL,
+ M1UnitNALU,
+ M1UnitNALU]> { let Latency = 7;
+ let ResourceCycles = [2]; }
+def M1WriteVLDH : SchedWriteRes<[M1UnitL,
+ M1UnitNALU,
+ M1UnitNALU]> { let Latency = 6; }
+def M1WriteVLDI : SchedWriteRes<[M1UnitL,
+ M1UnitL,
+ M1UnitL]> { let Latency = 12;
+ let ResourceCycles = [6]; }
+def M1WriteVLDJ : SchedWriteRes<[M1UnitL,
+ M1UnitNALU,
+ M1UnitNALU,
+ M1UnitNALU]> { let Latency = 9;
+ let ResourceCycles = [4]; }
+def M1WriteVLDK : SchedWriteRes<[M1UnitL,
+ M1UnitNALU,
+ M1UnitNALU,
+ M1UnitNALU,
+ M1UnitNALU]> { let Latency = 9;
+ let ResourceCycles = [4]; }
+def M1WriteVLDL : SchedWriteRes<[M1UnitL,
+ M1UnitNALU,
+ M1UnitNALU,
+ M1UnitNALU]> { let Latency = 7;
+ let ResourceCycles = [2]; }
+def M1WriteVLDM : SchedWriteRes<[M1UnitL,
+ M1UnitNALU,
+ M1UnitNALU,
+ M1UnitNALU,
+ M1UnitNALU]> { let Latency = 7;
+ let ResourceCycles = [2]; }
+def M1WriteVLDN : SchedWriteRes<[M1UnitL,
+ M1UnitL,
+ M1UnitL,
+ M1UnitL]> { let Latency = 14;
+ let ResourceCycles = [7]; }
+
+def M1WriteVSTA : WriteSequence<[WriteVST], 2>;
+def M1WriteVSTB : WriteSequence<[WriteVST], 3>;
+def M1WriteVSTC : WriteSequence<[WriteVST], 4>;
+def M1WriteVSTD : SchedWriteRes<[M1UnitS,
+ M1UnitFST,
+ M1UnitFST]> { let Latency = 7;
+ let ResourceCycles = [7]; }
+def M1WriteVSTE : SchedWriteRes<[M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitFST]> { let Latency = 8;
+ let ResourceCycles = [8]; }
+def M1WriteVSTF : SchedWriteRes<[M1UnitNALU,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitFST,
+ M1UnitFST]> { let Latency = 15;
+ let ResourceCycles = [15]; }
+def M1WriteVSTG : SchedWriteRes<[M1UnitNALU,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitFST,
+ M1UnitFST]> { let Latency = 16;
+ let ResourceCycles = [16]; }
+def M1WriteVSTH : SchedWriteRes<[M1UnitNALU,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitFST,
+ M1UnitFST]> { let Latency = 14;
+ let ResourceCycles = [14]; }
+def M1WriteVSTI : SchedWriteRes<[M1UnitNALU,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitS,
+ M1UnitFST,
+ M1UnitFST,
+ M1UnitFST]> { let Latency = 17;
+ let ResourceCycles = [17]; }
// Branch instructions
def : InstRW<[M1WriteB1], (instrs Bcc)>;
@@ -360,13 +465,239 @@ def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64
def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>;
// ASIMD load instructions.
+def : InstRW<[M1WriteVLDD], (instregex "LD1i(8|16|32)$")>;
+def : InstRW<[M1WriteVLDD,
+ WriteAdr], (instregex "LD1i(8|16|32)_POST$")>;
+def : InstRW<[M1WriteVLDE], (instregex "LD1i(64)$")>;
+def : InstRW<[M1WriteVLDE,
+ WriteAdr], (instregex "LD1i(64)_POST$")>;
+
+def : InstRW<[M1WriteL5], (instregex "LD1Rv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteL5,
+ WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteL5], (instregex "LD1Rv(1d)$")>;
+def : InstRW<[M1WriteL5,
+ WriteAdr], (instregex "LD1Rv(1d)_POST$")>;
+def : InstRW<[M1WriteL5], (instregex "LD1Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteL5,
+ WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M1WriteL5], (instregex "LD1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteL5,
+ WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteL5], (instregex "LD1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteL5,
+ WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVLDA,
+ WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVLDA], (instregex "LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVLDA,
+ WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVLDB,
+ WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVLDB], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVLDB,
+ WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVLDC,
+ WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVLDC], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVLDC,
+ WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDG], (instregex "LD2i(8|16)$")>;
+def : InstRW<[M1WriteVLDG,
+ WriteAdr], (instregex "LD2i(8|16)_POST$")>;
+def : InstRW<[M1WriteVLDG], (instregex "LD2i(32)$")>;
+def : InstRW<[M1WriteVLDG,
+ WriteAdr], (instregex "LD2i(32)_POST$")>;
+def : InstRW<[M1WriteVLDH], (instregex "LD2i(64)$")>;
+def : InstRW<[M1WriteVLDH,
+ WriteAdr], (instregex "LD2i(64)_POST$")>;
+
+def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDA,
+ WriteAdr], (instregex "LD2Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(1d)$")>;
+def : InstRW<[M1WriteVLDA,
+ WriteAdr], (instregex "LD2Rv(1d)_POST$")>;
+def : InstRW<[M1WriteVLDA], (instregex "LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVLDA,
+ WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDF,
+ WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDF,
+ WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDF], (instregex "LD2Twov(2d)$")>;
+def : InstRW<[M1WriteVLDF,
+ WriteAdr], (instregex "LD2Twov(2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDJ], (instregex "LD3i(8|16)$")>;
+def : InstRW<[M1WriteVLDJ,
+ WriteAdr], (instregex "LD3i(8|16)_POST$")>;
+def : InstRW<[M1WriteVLDJ], (instregex "LD3i(32)$")>;
+def : InstRW<[M1WriteVLDJ,
+ WriteAdr], (instregex "LD3i(32)_POST$")>;
+def : InstRW<[M1WriteVLDL], (instregex "LD3i(64)$")>;
+def : InstRW<[M1WriteVLDL,
+ WriteAdr], (instregex "LD3i(64)_POST$")>;
+
+def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDB,
+ WriteAdr], (instregex "LD3Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(1d)$")>;
+def : InstRW<[M1WriteVLDB,
+ WriteAdr], (instregex "LD3Rv(1d)_POST$")>;
+def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDB,
+ WriteAdr], (instregex "LD3Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDB], (instregex "LD3Rv(2d)$")>;
+def : InstRW<[M1WriteVLDB,
+ WriteAdr], (instregex "LD3Rv(2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDI,
+ WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDI,
+ WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDI], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[M1WriteVLDI,
+ WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDK], (instregex "LD4i(8|16)$")>;
+def : InstRW<[M1WriteVLDK,
+ WriteAdr], (instregex "LD4i(8|16)_POST$")>;
+def : InstRW<[M1WriteVLDK], (instregex "LD4i(32)$")>;
+def : InstRW<[M1WriteVLDK,
+ WriteAdr], (instregex "LD4i(32)_POST$")>;
+def : InstRW<[M1WriteVLDM], (instregex "LD4i(64)$")>;
+def : InstRW<[M1WriteVLDM,
+ WriteAdr], (instregex "LD4i(64)_POST$")>;
+
+def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDC,
+ WriteAdr], (instregex "LD4Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(1d)$")>;
+def : InstRW<[M1WriteVLDC,
+ WriteAdr], (instregex "LD4Rv(1d)_POST$")>;
+def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDC,
+ WriteAdr], (instregex "LD4Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDC], (instregex "LD4Rv(2d)$")>;
+def : InstRW<[M1WriteVLDC,
+ WriteAdr], (instregex "LD4Rv(2d)_POST$")>;
+
+def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVLDN,
+ WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVLDN,
+ WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVLDN], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[M1WriteVLDN,
+ WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
// ASIMD store instructions.
+def : InstRW<[M1WriteVSTD], (instregex "ST1i(8|16|32)$")>;
+def : InstRW<[M1WriteVSTD,
+ WriteAdr], (instregex "ST1i(8|16|32)_POST$")>;
+def : InstRW<[M1WriteVSTD], (instregex "ST1i(64)$")>;
+def : InstRW<[M1WriteVSTD,
+ WriteAdr], (instregex "ST1i(64)_POST$")>;
+
+def : InstRW<[WriteVST], (instregex "ST1Onev(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteVST,
+ WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteVST], (instregex "ST1Onev(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVST,
+ WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVSTA,
+ WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVSTA], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVSTA,
+ WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVSTB,
+ WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVSTB], (instregex "ST1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVSTB,
+ WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[M1WriteVSTC,
+ WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[M1WriteVSTC], (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[M1WriteVSTC,
+ WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[M1WriteVSTD], (instregex "ST2i(8|16|32)$")>;
+def : InstRW<[M1WriteVSTD,
+ WriteAdr], (instregex "ST2i(8|16|32)_POST$")>;
+def : InstRW<[M1WriteVSTD], (instregex "ST2i(64)$")>;
+def : InstRW<[M1WriteVSTD,
+ WriteAdr], (instregex "ST2i(64)_POST$")>;
+
+def : InstRW<[M1WriteVSTD], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVSTD,
+ WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVSTE,
+ WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVSTE], (instregex "ST2Twov(2d)$")>;
+def : InstRW<[M1WriteVSTE,
+ WriteAdr], (instregex "ST2Twov(2d)_POST$")>;
+
+def : InstRW<[M1WriteVSTH], (instregex "ST3i(8|16)$")>;
+def : InstRW<[M1WriteVSTH,
+ WriteAdr], (instregex "ST3i(8|16)_POST$")>;
+def : InstRW<[M1WriteVSTH], (instregex "ST3i(32)$")>;
+def : InstRW<[M1WriteVSTH,
+ WriteAdr], (instregex "ST3i(32)_POST$")>;
+def : InstRW<[M1WriteVSTF], (instregex "ST3i(64)$")>;
+def : InstRW<[M1WriteVSTF,
+ WriteAdr], (instregex "ST3i(64)_POST$")>;
+
+def : InstRW<[M1WriteVSTF], (instregex "ST3Threev(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVSTF,
+ WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVSTG,
+ WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVSTG], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[M1WriteVSTG,
+ WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[M1WriteVSTH], (instregex "ST4i(8|16)$")>;
+def : InstRW<[M1WriteVSTH,
+ WriteAdr], (instregex "ST4i(8|16)_POST$")>;
+def : InstRW<[M1WriteVSTH], (instregex "ST4i(32)$")>;
+def : InstRW<[M1WriteVSTH,
+ WriteAdr], (instregex "ST4i(32)_POST$")>;
+def : InstRW<[M1WriteVSTF], (instregex "ST4i(64)$")>;
+def : InstRW<[M1WriteVSTF,
+ WriteAdr], (instregex "ST4i(64)_POST$")>;
+
+def : InstRW<[M1WriteVSTF], (instregex "ST4Fourv(8b|4h|2s)$")>;
+def : InstRW<[M1WriteVSTF,
+ WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[M1WriteVSTI], (instregex "ST4Fourv(16b|8h|4s)$")>;
+def : InstRW<[M1WriteVSTI,
+ WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[M1WriteVSTI], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[M1WriteVSTI,
+ WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
// Cryptography instructions.
def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>;
-def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>;
+def : InstRW<[M1WriteAES], (instregex "^AES[DE]")>;
+def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AESI?MC")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
new file mode 100644
index 0000000..9a0cb70
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -0,0 +1,352 @@
+//==- AArch64SchedThunderX.td - Cavium ThunderX T8X Scheduling Definitions -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM ThunderX T8X
+// (T88, T81, T83) processors.
+// Loosely based on Cortex-A53 which is somewhat similar.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See llvm/MC/MCSchedule.h for details.
+
+// Cavium ThunderX T8X scheduling machine model.
+def ThunderXT8XModel : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops dispatched per cycle.
+ let MicroOpBufferSize = 0; // ThunderX T88/T81/T83 are in-order.
+ let LoadLatency = 3; // Optimistic load latency.
+ let MispredictPenalty = 8; // Branch mispredict penalty.
+ let PostRAScheduler = 1; // Use PostRA scheduler.
+ let CompleteModel = 1;
+}
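+
+// Illustrative sketch (assumption, not part of this file): a machine model
+// like this one is attached to the matching CPUs in AArch64.td, roughly:
+//   def : ProcessorModel<"thunderxt88", ThunderXT8XModel, [...]>;
+// The exact CPU names and feature lists are defined in AArch64.td.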
+
+// Modeling each pipeline with BufferSize == 0 since T8X is in-order.
+def THXT8XUnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU
+def THXT8XUnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC
+def THXT8XUnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division
+def THXT8XUnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store
+def THXT8XUnitBr : ProcResource<1> { let BufferSize = 0; } // Branch
+def THXT8XUnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU
+def THXT8XUnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mul/Div/Sqrt
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types mapping the ProcResources and
+// latencies.
+
+let SchedModel = ThunderXT8XModel in {
+
+// ALU
+def : WriteRes<WriteImm, [THXT8XUnitALU]> { let Latency = 1; }
+def : WriteRes<WriteI, [THXT8XUnitALU]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [THXT8XUnitALU]> { let Latency = 2; }
+def : WriteRes<WriteIEReg, [THXT8XUnitALU]> { let Latency = 2; }
+def : WriteRes<WriteIS, [THXT8XUnitALU]> { let Latency = 2; }
+def : WriteRes<WriteExtr, [THXT8XUnitALU]> { let Latency = 2; }
+
+// MAC
+def : WriteRes<WriteIM32, [THXT8XUnitMAC]> {
+ let Latency = 4;
+ let ResourceCycles = [1];
+}
+
+def : WriteRes<WriteIM64, [THXT8XUnitMAC]> {
+ let Latency = 4;
+ let ResourceCycles = [1];
+}
+
+// Div
+def : WriteRes<WriteID32, [THXT8XUnitDiv]> {
+ let Latency = 12;
+ let ResourceCycles = [6];
+}
+
+def : WriteRes<WriteID64, [THXT8XUnitDiv]> {
+ let Latency = 14;
+ let ResourceCycles = [8];
+}
+
+// Load
+def : WriteRes<WriteLD, [THXT8XUnitLdSt]> { let Latency = 3; }
+def : WriteRes<WriteLDIdx, [THXT8XUnitLdSt]> { let Latency = 3; }
+def : WriteRes<WriteLDHi, [THXT8XUnitLdSt]> { let Latency = 3; }
+
+// Vector Load
+def : WriteRes<WriteVLD, [THXT8XUnitLdSt]> {
+ let Latency = 8;
+ let ResourceCycles = [3];
+}
+
+def THXT8XWriteVLD1 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 6;
+ let ResourceCycles = [1];
+}
+
+def THXT8XWriteVLD2 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 11;
+ let ResourceCycles = [7];
+}
+
+def THXT8XWriteVLD3 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 12;
+ let ResourceCycles = [8];
+}
+
+def THXT8XWriteVLD4 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 13;
+ let ResourceCycles = [9];
+}
+
+def THXT8XWriteVLD5 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 13;
+ let ResourceCycles = [9];
+}
+
+// Pre/Post Indexing
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Store
+def : WriteRes<WriteST, [THXT8XUnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTP, [THXT8XUnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTIdx, [THXT8XUnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTX, [THXT8XUnitLdSt]> { let Latency = 1; }
+
+// Vector Store
+def : WriteRes<WriteVST, [THXT8XUnitLdSt]>;
+def THXT8XWriteVST1 : SchedWriteRes<[THXT8XUnitLdSt]>;
+
+def THXT8XWriteVST2 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 10;
+ let ResourceCycles = [9];
+}
+
+def THXT8XWriteVST3 : SchedWriteRes<[THXT8XUnitLdSt]> {
+ let Latency = 11;
+ let ResourceCycles = [10];
+}
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Branch
+def : WriteRes<WriteBr, [THXT8XUnitBr]>;
+def THXT8XWriteBR : SchedWriteRes<[THXT8XUnitBr]>;
+def : WriteRes<WriteBrReg, [THXT8XUnitBr]>;
+def THXT8XWriteBRR : SchedWriteRes<[THXT8XUnitBr]>;
+def THXT8XWriteRET : SchedWriteRes<[THXT8XUnitALU]>;
+def : WriteRes<WriteSys, [THXT8XUnitBr]>;
+def : WriteRes<WriteBarrier, [THXT8XUnitBr]>;
+def : WriteRes<WriteHint, [THXT8XUnitBr]>;
+
+// FP ALU
+def : WriteRes<WriteF, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCmp, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCvt, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCopy, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFImm, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteV, [THXT8XUnitFPALU]> { let Latency = 6; }
+
+// FP Mul, Div, Sqrt
+def : WriteRes<WriteFMul, [THXT8XUnitFPMDS]> { let Latency = 6; }
+def : WriteRes<WriteFDiv, [THXT8XUnitFPMDS]> {
+ let Latency = 22;
+ let ResourceCycles = [19];
+}
+
+def THXT8XWriteFMAC : SchedWriteRes<[THXT8XUnitFPMDS]> { let Latency = 10; }
+
+def THXT8XWriteFDivSP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 12;
+ let ResourceCycles = [9];
+}
+
+def THXT8XWriteFDivDP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 22;
+ let ResourceCycles = [19];
+}
+
+def THXT8XWriteFSqrtSP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 17;
+ let ResourceCycles = [14];
+}
+
+def THXT8XWriteFSqrtDP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+ let Latency = 31;
+ let ResourceCycles = [28];
+}
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+// These reads do not need their operands until one or two cycles after issue.
+def : ReadAdvance<ReadExtrHi, 1>;
+def : ReadAdvance<ReadAdrBase, 2>;
+def : ReadAdvance<ReadVLD, 2>;
+
+// FIXME: This needs more targeted benchmarking.
+// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
+// operands are needed one cycle later if and only if they are to be
+// shifted. Otherwise, they too are needed two cycles later. This same
+// ReadAdvance applies to Extended registers as well, even though there is
+// a separate SchedPredicate for them.
+def : ReadAdvance<ReadI, 2, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def THXT8XReadShifted : SchedReadAdvance<1, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def THXT8XReadNotShifted : SchedReadAdvance<2, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def THXT8XReadISReg : SchedReadVariant<[
+ SchedVar<RegShiftedPred, [THXT8XReadShifted]>,
+ SchedVar<NoSchedPred, [THXT8XReadNotShifted]>]>;
+def : SchedAlias<ReadISReg, THXT8XReadISReg>;
+
+def THXT8XReadIEReg : SchedReadVariant<[
+ SchedVar<RegExtendedPred, [THXT8XReadShifted]>,
+ SchedVar<NoSchedPred, [THXT8XReadNotShifted]>]>;
+def : SchedAlias<ReadIEReg, THXT8XReadIEReg>;
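+
+// Illustrative example (assumed, not a definition): in "add x0, x1, x2, lsl #2"
+// the shifted source read (ReadISReg) resolves through THXT8XReadISReg to
+// THXT8XReadShifted, i.e. it is needed one cycle after issue, because
+// RegShiftedPred holds; with a zero shift amount it falls through to
+// THXT8XReadNotShifted and is needed two cycles after issue.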
+
+// MAC - Operands are generally needed one cycle later in the MAC pipe.
+// Accumulator operands are needed two cycles later.
+def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadIMA, 2, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
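+
+// Illustrative example (assumed): for "madd x0, x1, x2, x3" the multiplicands
+// x1/x2 are read via ReadIM (advance 1) and the accumulator x3 via ReadIMA
+// (advance 2), so a chain of dependent multiply-accumulates sees an effective
+// accumulator-to-accumulator latency of roughly Latency(WriteIM64) - 2 cycles.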
+
+// Div
+def : ReadAdvance<ReadID, 1, [WriteImm, WriteI,
+ WriteISReg, WriteIEReg, WriteIS,
+ WriteID32, WriteID64,
+ WriteIM32, WriteIM64]>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRW.
+
+//---
+// Branch
+//---
+def : InstRW<[THXT8XWriteBR], (instregex "^B")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^BL")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^B.*")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^CBNZ")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^CBZ")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^TBNZ")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^TBZ")>;
+def : InstRW<[THXT8XWriteBRR], (instregex "^BR")>;
+def : InstRW<[THXT8XWriteBRR], (instregex "^BLR")>;
+
+//---
+// Ret
+//---
+def : InstRW<[THXT8XWriteRET], (instregex "^RET")>;
+
+//---
+// Miscellaneous
+//---
+def : InstRW<[WriteI], (instrs COPY)>;
+
+//---
+// Vector Loads
+//---
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVLD3], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
+
+//---
+// Vector Stores
+//---
+def : InstRW<[THXT8XWriteVST1], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVST1], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVST2], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVST2], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+
+//---
+// Floating Point MAC, DIV, SQRT
+//---
+def : InstRW<[THXT8XWriteFMAC], (instregex "^FN?M(ADD|SUB).*")>;
+def : InstRW<[THXT8XWriteFMAC], (instregex "^FML(A|S).*")>;
+def : InstRW<[THXT8XWriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[THXT8XWriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[THXT8XWriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[THXT8XWriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[THXT8XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[THXT8XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
new file mode 100644
index 0000000..10df50b
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -0,0 +1,1745 @@
+//=- AArch64SchedThunderX2T99.td - Cavium ThunderX2 T99 ---*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for Cavium ThunderX2T99
+// processors.
+// Based on Broadcom Vulcan.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 2. Pipeline Description.
+
+def ThunderX2T99Model : SchedMachineModel {
+ let IssueWidth = 4; // 4 micro-ops dispatched at a time.
+ let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer.
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 12; // Extra cycles for mispredicted branch.
+ // Determined via a mix of micro-arch details and experimentation.
+ let LoopMicroOpBufferSize = 32;
+ let PostRAScheduler = 1; // Using PostRA sched.
+ let CompleteModel = 1;
+}
+
+// Define the issue ports.
+
+// Port 0: ALU, FP/SIMD.
+def THX2T99P0 : ProcResource<1>;
+
+// Port 1: ALU, FP/SIMD, integer mul/div.
+def THX2T99P1 : ProcResource<1>;
+
+// Port 2: ALU, Branch.
+def THX2T99P2 : ProcResource<1>;
+
+// Port 3: Store data.
+def THX2T99P3 : ProcResource<1>;
+
+// Port 4: Load/store.
+def THX2T99P4 : ProcResource<1>;
+
+// Port 5: Load/store.
+def THX2T99P5 : ProcResource<1>;
+
+let SchedModel = ThunderX2T99Model in {
+
+// Define groups for the functional units on each issue port. Each group
+// created will be used by a WriteRes later on.
+//
+// NOTE: Some groups only contain one member. This is a way to create names for
+// the various functional units that share a single issue port. For example,
+// "THX2T99I1" for ALU ops on port 1 and "THX2T99F1" for FP ops on port 1.
+
+// Integer divide and multiply micro-ops only on port 1.
+def THX2T99I1 : ProcResGroup<[THX2T99P1]>;
+
+// Branch micro-ops only on port 2.
+def THX2T99I2 : ProcResGroup<[THX2T99P2]>;
+
+// ALU micro-ops on ports 0, 1, and 2.
+def THX2T99I012 : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2]>;
+
+// Crypto FP/SIMD micro-ops only on port 1.
+def THX2T99F1 : ProcResGroup<[THX2T99P1]>;
+
+// FP/SIMD micro-ops on ports 0 and 1.
+def THX2T99F01 : ProcResGroup<[THX2T99P0, THX2T99P1]>;
+
+// Store data micro-ops only on port 3.
+def THX2T99SD : ProcResGroup<[THX2T99P3]>;
+
+// Load/store micro-ops on ports 4 and 5.
+def THX2T99LS01 : ProcResGroup<[THX2T99P4, THX2T99P5]>;
+
+// 60 entry unified scheduler.
+def THX2T99Any : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2,
+ THX2T99P3, THX2T99P4, THX2T99P5]> {
+ let BufferSize = 60;
+}
+
+// Define commonly used write types for InstRW specializations.
+// All definitions follow the format: THX2T99Write_<NumCycles>Cyc_<Resources>.
+
+// 3 cycles on I1.
+def THX2T99Write_3Cyc_I1 : SchedWriteRes<[THX2T99I1]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+// 1 cycle on I2.
+def THX2T99Write_1Cyc_I2 : SchedWriteRes<[THX2T99I2]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on I1.
+def THX2T99Write_4Cyc_I1 : SchedWriteRes<[THX2T99I1]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// 23 cycles on I1.
+def THX2T99Write_23Cyc_I1 : SchedWriteRes<[THX2T99I1]> {
+ let Latency = 23;
+ let ResourceCycles = [13, 23];
+ let NumMicroOps = 4;
+}
+
+// 39 cycles on I1.
+def THX2T99Write_39Cyc_I1 : SchedWriteRes<[THX2T99I1]> {
+ let Latency = 39;
+ let ResourceCycles = [13, 39];
+ let NumMicroOps = 4;
+}
+
+// 1 cycle on I0, I1, or I2.
+def THX2T99Write_1Cyc_I012 : SchedWriteRes<[THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 2 cycles on I0, I1, or I2.
+def THX2T99Write_2Cyc_I012 : SchedWriteRes<[THX2T99I012]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on I0, I1, or I2.
+def THX2T99Write_4Cyc_I012 : SchedWriteRes<[THX2T99I012]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on I0, I1, or I2.
+def THX2T99Write_5Cyc_I012 : SchedWriteRes<[THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on F1.
+def THX2T99Write_5Cyc_F1 : SchedWriteRes<[THX2T99F1]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 7 cycles on F1.
+def THX2T99Write_7Cyc_F1 : SchedWriteRes<[THX2T99F1]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on F0 or F1.
+def THX2T99Write_4Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on F0 or F1.
+def THX2T99Write_5Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 6 cycles on F0 or F1.
+def THX2T99Write_6Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 7 cycles on F0 or F1.
+def THX2T99Write_7Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// 8 cycles on F0 or F1.
+def THX2T99Write_8Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// 10 cycles on F0 or F1.
+def THX2T99Write_10Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+
+// 16 cycles on F0 or F1.
+def THX2T99Write_16Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 16;
+ let NumMicroOps = 3;
+ let ResourceCycles = [8];
+}
+
+// 23 cycles on F0 or F1.
+def THX2T99Write_23Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 23;
+ let NumMicroOps = 3;
+ let ResourceCycles = [11];
+}
+
+// 1 cycle on LS0 or LS1.
+def THX2T99Write_1Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 0;
+}
+
+// 1 cycle on LS0 or LS1 and I0, I1, or I2.
+def THX2T99Write_1Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
+ let Latency = 0;
+ let NumMicroOps = 2;
+}
+
+// 1 cycle on LS0 or LS1 and 2 of I0, I1, or I2.
+def THX2T99Write_1Cyc_LS01_I012_I012 :
+ SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
+ let Latency = 0;
+ let NumMicroOps = 3;
+}
+
+// 2 cycles on LS0 or LS1.
+def THX2T99Write_2Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on LS0 or LS1.
+def THX2T99Write_4Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+// 5 cycles on LS0 or LS1.
+def THX2T99Write_5Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on LS0 or LS1.
+def THX2T99Write_6Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 4 cycles on LS0 or LS1 and I0, I1, or I2.
+def THX2T99Write_4Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 4 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def THX2T99Write_4Cyc_LS01_I012_I012 :
+ SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on LS0 or LS1 and I0, I1, or I2.
+def THX2T99Write_5Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def THX2T99Write_5Cyc_LS01_I012_I012 :
+ SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on LS0 or LS1 and I0, I1, or I2.
+def THX2T99Write_6Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def THX2T99Write_6Cyc_LS01_I012_I012 :
+ SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 1 cycle on LS0 or LS1 and F0 or F1.
+def THX2T99Write_1Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on LS0 or LS1 and F0 or F1.
+def THX2T99Write_5Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on LS0 or LS1 and F0 or F1.
+def THX2T99Write_6Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 7 cycles on LS0 or LS1 and F0 or F1.
+def THX2T99Write_7Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// 8 cycles on LS0 or LS1 and F0 or F1.
+def THX2T99Write_8Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+}
+
+//===----------------------------------------------------------------------===//
+// 3. Instruction Tables.
+
+let SchedModel = ThunderX2T99Model in {
+
+//---
+// 3.1 Branch Instructions
+//---
+
+// Branch, immed
+// Branch and link, immed
+// Compare and branch
+def : WriteRes<WriteBr, [THX2T99I2]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Branch, register
+// Branch and link, register != LR
+// Branch and link, register = LR
+def : WriteRes<WriteBrReg, [THX2T99I2]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteAtomic, []> {
+ let Unsupported = 1;
+ let NumMicroOps = 2;
+}
+
+//---
+// Branch
+//---
+def : InstRW<[THX2T99Write_1Cyc_I2], (instrs B, BL, BR, BLR)>;
+def : InstRW<[THX2T99Write_1Cyc_I2], (instrs RET)>;
+def : InstRW<[THX2T99Write_1Cyc_I2], (instregex "^B.*")>;
+def : InstRW<[THX2T99Write_1Cyc_I2],
+ (instregex "^CBZ", "^CBNZ", "^TBZ", "^TBNZ")>;
+
+//---
+// 3.2 Arithmetic and Logical Instructions
+// 3.3 Move and Shift Instructions
+//---
+
+
+// ALU, basic
+// Conditional compare
+// Conditional select
+// Address generation
+def : WriteRes<WriteI, [THX2T99I012]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 3];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[WriteI],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
+ "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
+ "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
+ "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
+ "CSNEG?(W|X)r(i|r|s|x)")>;
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// ALU, extend and/or shift
+def : WriteRes<WriteISReg, [THX2T99I012]> {
+ let Latency = 2;
+ let ResourceCycles = [2, 3];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[WriteISReg],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
+ "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
+ "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
+ "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
+ "CSNEG?(W|X)r(i|r|s|x)")>;
+
+def : WriteRes<WriteIEReg, [THX2T99I012]> {
+ let Latency = 1;
+ let ResourceCycles = [1, 3];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[WriteIEReg],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC?(W|X)r(i|r|s|x)", "ADCS?(W|X)r(i|r|s|x)",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC?(W|X)r(i|r|s|x)",
+ "SBCS?(W|X)r(i|r|s|x)", "CCMN?(W|X)r(i|r|s|x)",
+ "CCMP?(W|X)r(i|r|s|x)", "CSEL?(W|X)r(i|r|s|x)",
+ "CSINC?(W|X)r(i|r|s|x)", "CSINV?(W|X)r(i|r|s|x)",
+ "CSNEG?(W|X)r(i|r|s|x)")>;
+
+// Move immed
+def : WriteRes<WriteImm, [THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[THX2T99Write_1Cyc_I012],
+ (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
+
+def : InstRW<[THX2T99Write_1Cyc_I012],
+ (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, RORVWr, RORVXr)>;
+
+// Variable shift
+def : WriteRes<WriteIS, [THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+//---
+// 3.4 Divide and Multiply Instructions
+//---
+
+// Divide, W-form
+// Latency range of 13-23/13-39.
+def : WriteRes<WriteID32, [THX2T99I1]> {
+ let Latency = 39;
+ let ResourceCycles = [13, 39];
+ let NumMicroOps = 4;
+}
+
+// Divide, X-form
+def : WriteRes<WriteID64, [THX2T99I1]> {
+ let Latency = 23;
+ let ResourceCycles = [13, 23];
+ let NumMicroOps = 4;
+}
+
+// Multiply accumulate, W-form
+def : WriteRes<WriteIM32, [THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// Multiply accumulate, X-form
+def : WriteRes<WriteIM64, [THX2T99I012]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+//def : InstRW<[WriteIM32, ReadIM, ReadIM, ReadIMA, THX2T99Write_5Cyc_I012],
+// (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>;
+def : InstRW<[THX2T99Write_5Cyc_I012],
+ (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+
+def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>;
+def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>;
+
+// Bitfield extract, two reg
+def : WriteRes<WriteExtr, [THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Multiply high
+def : InstRW<[THX2T99Write_4Cyc_I1], (instrs SMULHrr, UMULHrr)>;
+
+// Miscellaneous Data-Processing Instructions
+// Bitfield extract
+def : InstRW<[THX2T99Write_1Cyc_I012], (instrs EXTRWrri, EXTRXrri)>;
+
+// Bitfield move - basic
+def : InstRW<[THX2T99Write_1Cyc_I012],
+ (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>;
+
+// Bitfield move, insert
+def : InstRW<[THX2T99Write_1Cyc_I012], (instregex "^BFM")>;
+def : InstRW<[THX2T99Write_1Cyc_I012], (instregex "(S|U)?BFM.*")>;
+
+// Count leading
+def : InstRW<[THX2T99Write_3Cyc_I1], (instregex "^CLS(W|X)r$",
+ "^CLZ(W|X)r$")>;
+
+// Reverse bits
+def : InstRW<[THX2T99Write_1Cyc_I012], (instrs RBITWr, RBITXr)>;
+
+// Cryptography Extensions
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AES[DE]")>;
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AESI?MC")>;
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^PMULL")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1SU0")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1(H|SU1)")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA1[CMP]")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA256SU0")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA256(H|H2|SU1)")>;
+
+// CRC Instructions
+// def : InstRW<[THX2T99Write_4Cyc_I1], (instregex "^CRC32", "^CRC32C")>;
+def : InstRW<[THX2T99Write_4Cyc_I1],
+ (instrs CRC32Brr, CRC32Hrr, CRC32Wrr, CRC32Xrr)>;
+
+def : InstRW<[THX2T99Write_4Cyc_I1],
+ (instrs CRC32CBrr, CRC32CHrr, CRC32CWrr, CRC32CXrr)>;
+
+// Reverse bits/bytes
+// NOTE: Handled by WriteI.
+
+//---
+// 3.6 Load Instructions
+// 3.10 FP Load Instructions
+//---
+
+// Load register, literal
+// Load register, unscaled immed
+// Load register, immed unprivileged
+// Load register, unsigned immed
+def : WriteRes<WriteLD, [THX2T99LS01]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+// Load register, immed post-index
+// NOTE: Handled by WriteLD, WriteI.
+// Load register, immed pre-index
+// NOTE: Handled by WriteLD, WriteAdr.
+def : WriteRes<WriteAdr, [THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
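+// WriteAdr models the base-register writeback micro-op of pre/post-indexed
+// addressing on the integer pipes, independent of the load or store itself.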
+
+// Load pair, immed offset, normal
+// Load pair, immed offset, signed words, base != SP
+// Load pair, immed offset signed words, base = SP
+// LDP only breaks into *one* LS micro-op. Thus
+// the resources are handled by WriteLD.
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+}
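+// The second destination of an LDP is assigned WriteLDHi: it sees the
+// 5-cycle load latency but reserves no pipeline resources of its own,
+// since the single LS micro-op is already accounted for by the WriteLD
+// on the first destination.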
+
+// Load register offset, basic
+// Load register, register offset, scale by 4/8
+// Load register, register offset, scale by 2
+// Load register offset, extend
+// Load register, register offset, extend, scale by 4/8
+// Load register, register offset, extend, scale by 2
+def THX2T99WriteLDIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [THX2T99Write_6Cyc_LS01_I012_I012]>,
+ SchedVar<NoSchedPred, [THX2T99Write_5Cyc_LS01_I012]>]>;
+def : SchedAlias<WriteLDIdx, THX2T99WriteLDIdx>;
+
+def THX2T99ReadAdrBase : SchedReadVariant<[
+ SchedVar<ScaledIdxPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, THX2T99ReadAdrBase>;
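+// At scheduling time the variant above resolves per instruction:
+// ScaledIdxPred selects the 6-cycle write, which adds a second I012
+// micro-op for the scaled address computation, while unscaled forms use
+// the plain 5-cycle LS01 + I012 write.  The read variant leaves the base
+// register at its default read advance in either case, and any load whose
+// write is WriteLDIdx and is not overridden by an InstRW below is priced
+// through this alias.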
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPDi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPQi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPSi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPWi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDNPXi)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPDi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPQi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPSi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPSWi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPWi)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi], (instrs LDPXi)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRBui)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRDui)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRHui)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01], (instrs LDRQui)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01], (instrs LDRSui)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRDl)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRQl)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRWl)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDRXl)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRBi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRHi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRXi)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSBWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSBXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSHWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSHXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDTRSWi)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPDpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPQpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPSpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPWpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPXpre)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRWpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteAdr], (instrs LDRXpre)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBWpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBXpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBWpost)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSBXpost)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHWpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHXpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHWpost)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRSHXpost)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRBBpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRBBpost)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRHHpre)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, WriteAdr], (instrs LDRHHpost)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPDpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPQpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPSpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPWpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteLDHi, WriteAdr],
+ (instrs LDPXpost)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRBpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRDpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRHpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRQpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRSpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRWpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012, WriteI], (instrs LDRXpost)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPDpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPQpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPSpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPWpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPXpre)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRWpre)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteAdr], (instrs LDRXpre)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPDpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPQpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPSpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPWpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteLDHi, WriteAdr],
+ (instrs LDPXpost)>;
+
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRBpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRDpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRHpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRQpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRSpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRWpost)>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_I012_I012, WriteI], (instrs LDRXpost)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRBroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRDroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHHroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRQroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHWroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHXroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRWroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRXroW)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRBroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRDroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHHroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRHroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHWroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRSHXroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRWroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012, ReadAdrBase], (instrs LDRXroX)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRBroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRBroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRDroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRHroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRHHroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRQroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSHWroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSHXroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRWroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRXroW)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRBroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRDroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRHroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRHHroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRQroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSHWroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRSHXroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRWroX)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs LDRXroX)>;
+
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURBi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURBBi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURDi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURHi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURHHi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURQi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSBWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSBXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSHWi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSHXi)>;
+def : InstRW<[THX2T99Write_4Cyc_LS01], (instrs LDURSWi)>;
+
+//---
+// Prefetch
+//---
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMl)>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFUMi)>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMui)>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMroW)>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_I012], (instrs PRFMroX)>;
+
+//--
+// 3.7 Store Instructions
+// 3.11 FP Store Instructions
+//--
+
+// Store register, unscaled immed
+// Store register, immed unprivileged
+// Store register, unsigned immed
+def : WriteRes<WriteST, [THX2T99LS01, THX2T99SD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store register, immed post-index
+// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteAdr, WriteST
+
+// Store register, register offset, basic
+// Store register, register offset, scaled by 4/8
+// Store register, register offset, scaled by 2
+// Store register, register offset, extend
+// Store register, register offset, extend, scale by 4/8
+// Store register, register offset, extend, scale by 1
+def : WriteRes<WriteSTIdx, [THX2T99LS01, THX2T99SD, THX2T99I012]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+}
+
+// Store pair, immed offset, W-form
+// Store pair, immed offset, X-form
+def : WriteRes<WriteSTP, [THX2T99LS01, THX2T99SD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-index, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteAdr, WriteSTP.
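+// In the entries below, pre/post-indexed stores such as STRXpre pair
+// WriteAdr (base-register writeback on I012) with a 1-cycle LS01 + I012
+// store write, and the register-offset forms additionally read their
+// index register through ReadAdrBase.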
+
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURBi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURBBi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURDi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURHi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURHHi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURQi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURSi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURWi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STURXi)>;
+
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRBi)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRHi)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRWi)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01], (instrs STTRXi)>;
+
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPDi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPQi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPXi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STNPWi)>;
+
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPDi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPQi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPXi)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01], (instrs STPWi)>;
+
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRBui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRBui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRDui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRDui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRHui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRHui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRQui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRQui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRXui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRXui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012_I012], (instrs STRWui)>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_I012], (instrs STRWui)>;
+
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STPXpre, STPXpost)>;
+
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRXpre, STRXpost)>;
+
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRBroW, STRBroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRBroW, STRBroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRBBroW, STRBBroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRBBroW, STRBBroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRDroW, STRDroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRDroW, STRDroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRHroW, STRHroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRHroW, STRHroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRHHroW, STRHHroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRHHroW, STRHHroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRQroW, STRQroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRQroW, STRQroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRSroW, STRSroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRSroW, STRSroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRWroW, STRWroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRWroW, STRWroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012, ReadAdrBase],
+ (instrs STRXroW, STRXroX)>;
+def : InstRW<[WriteAdr, THX2T99Write_1Cyc_LS01_I012_I012, ReadAdrBase],
+ (instrs STRXroW, STRXroX)>;
+
+//---
+// 3.8 FP Data Processing Instructions
+//---
+
+// FP absolute value
+// FP min/max
+// FP negate
+def : WriteRes<WriteF, [THX2T99F01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// FP arithmetic
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
+
+// FP compare
+def : WriteRes<WriteFCmp, [THX2T99F01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// FP Mul, Div, Sqrt
+def : WriteRes<WriteFDiv, [THX2T99F01]> {
+ let Latency = 22;
+ let ResourceCycles = [19];
+}
+
+def THX2T99XWriteFDiv : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX2T99XWriteFDivSP : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX2T99XWriteFDivDP : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 23;
+ let ResourceCycles = [12];
+ let NumMicroOps = 4;
+}
+
+def THX2T99XWriteFSqrtSP : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX2T99XWriteFSqrtDP : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 23;
+ let ResourceCycles = [12];
+ let NumMicroOps = 4;
+}
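+// The ResourceCycles above model the non-pipelined divider: one of the two
+// FP pipes stays busy for the listed number of cycles, so back-to-back
+// divides and square roots are throughput-limited rather than fully
+// pipelined.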
+
+// FP divide, S-form
+// FP square root, S-form
+def : InstRW<[THX2T99XWriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[THX2T99XWriteFSqrtSP], (instrs FSQRTSr)>;
+def : InstRW<[THX2T99XWriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[THX2T99XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSr")>;
+
+// FP divide, D-form
+// FP square root, D-form
+def : InstRW<[THX2T99XWriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[THX2T99XWriteFSqrtDP], (instrs FSQRTDr)>;
+def : InstRW<[THX2T99XWriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[THX2T99XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDr")>;
+
+// FP multiply
+// FP multiply accumulate
+def : WriteRes<WriteFMul, [THX2T99F01]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def THX2T99XWriteFMul : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def THX2T99XWriteFMulAcc : SchedWriteRes<[THX2T99F01]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def : InstRW<[THX2T99XWriteFMul], (instregex "^FMUL", "^FNMUL")>;
+def : InstRW<[THX2T99XWriteFMulAcc],
+ (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>;
+
+// FP round to integral
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
+
+// FP select
+def : InstRW<[THX2T99Write_4Cyc_F01], (instregex "^FCSEL")>;
+
+//---
+// 3.9 FP Miscellaneous Instructions
+//---
+
+// FP convert, from vec to vec reg
+// FP convert, from gen to vec reg
+// FP convert, from vec to gen reg
+def : WriteRes<WriteFCvt, [THX2T99F01]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// FP move, immed
+// FP move, register
+def : WriteRes<WriteFImm, [THX2T99F01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// FP transfer, from gen to vec reg
+// FP transfer, from vec to gen reg
+def : WriteRes<WriteFCopy, [THX2T99F01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
+
+//---
+// 3.12 ASIMD Integer Instructions
+//---
+
+// ASIMD absolute diff, D-form
+// ASIMD absolute diff, Q-form
+// ASIMD absolute diff accum, D-form
+// ASIMD absolute diff accum, Q-form
+// ASIMD absolute diff accum long
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD compare
+// ASIMD logical (AND, BIC, EOR)
+// ASIMD max/min, basic
+// ASIMD max/min, reduce, 4H/4S
+// ASIMD max/min, reduce, 8B/8H
+// ASIMD max/min, reduce, 16B
+// ASIMD multiply, D-form
+// ASIMD multiply, Q-form
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+// ASIMD multiply long
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+// ASIMD shift by immed, basic
+// ASIMD shift by immed and insert, basic, D-form
+// ASIMD shift by immed and insert, basic, Q-form
+// ASIMD shift by immed, complex
+// ASIMD shift by register, basic, D-form
+// ASIMD shift by register, basic, Q-form
+// ASIMD shift by register, complex, D-form
+// ASIMD shift by register, complex, Q-form
+def : WriteRes<WriteV, [THX2T99F01]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4, 23];
+}
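+// WriteV is the default write for ASIMD instructions; the InstRW entries
+// that follow override it with tighter per-group latencies on F0/F1.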
+
+// ASIMD arith, reduce, 4H/4S
+// ASIMD arith, reduce, 8B/8H
+// ASIMD arith, reduce, 16B
+
+// ASIMD logical (MOV, MVN, ORN, ORR)
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^ANDv", "^BICv", "^EORv", "^MOVv", "^MVNv",
+ "^ORRv", "^ORNv", "^NOTv")>;
+// ASIMD arith, reduce
+def : InstRW<[THX2T99Write_10Cyc_F01],
+ (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
+
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>;
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^PMULL(v1i64|v2i64)")>;
+
+// ASIMD absolute diff accum, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+// ASIMD absolute diff accum, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+// ASIMD absolute diff accum long
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU]ABAL")>;
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
+// ASIMD arith, reduce, 8B
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>;
+// ASIMD arith, reduce, 16B/16H
+def : InstRW<[THX2T99Write_10Cyc_F01],
+ (instregex "^[SU]?ADDL?Vv16i8v$")>;
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[THX2T99Write_10Cyc_F01],
+ (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>;
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>;
+// ASIMD max/min, reduce, 16B/16H
+def : InstRW<[THX2T99Write_10Cyc_F01],
+ (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
+// ASIMD multiply, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^(P?MUL|SQR?DMULH)" #
+ "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" #
+ "(_indexed)?$")>;
+// ASIMD multiply, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+// ASIMD shift accumulate
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "RSHRNv","SHRNv", "SQRSHRNv","SQRSHRUNv",
+ "SQSHRNv","SQSHRUNv", "UQRSHRNv",
+ "UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
+// ASIMD shift by immed, complex
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SQSHLU")>;
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+// ASIMD shift by register, complex, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU][QR]{1,2}SHL" #
+ "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD Arithmetic
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "(ADD|SUB)HNv.*")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "(RADD|RSUB)HNv.*")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD",
+ "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" #
+ "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>;
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADALP","^UADALP")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADDLPv","^UADDLPv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SADDLV","^UADDLV")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^ADDVv","^SMAXVv","^UMAXVv","^SMINVv","^UMINVv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SABAv","^UABAv","^SABALv","^UABALv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SQADDv","^SQSUBv","^UQADDv","^UQSUBv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^SUQADDv","^USQADDv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^ADDHNv","^RADDHNv", "^RSUBHNv",
+ "^SQABS", "^SQADD", "^SQNEG", "^SQSUB",
+ "^SRHADD", "^SUBHNv", "^SUQADD",
+ "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^CMEQv","^CMGEv","^CMGTv",
+ "^CMLEv","^CMLTv", "^CMHIv","^CMHSv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SMAXv","^SMINv","^UMAXv","^UMINv",
+ "^SMAXPv","^SMINPv","^UMAXPv","^UMINPv")>;
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SABDv","^UABDv", "^SABDLv","^UABDLv")>;
+
+//---
+// 3.13 ASIMD Floating-point Instructions
+//---
+
+// ASIMD FP absolute value
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FABSv")>;
+
+// ASIMD FP arith, normal, D-form
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+
+// ASIMD FP arith, pairwise, D-form
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADDPv")>;
+
+// ASIMD FP compare, D-form
+// ASIMD FP compare, Q-form
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
+ "^FCMGTv", "^FCMLEv",
+ "^FCMLTv")>;
+
+// ASIMD FP round, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^FRINT[AIMNPXZ](v2f32)")>;
+// ASIMD FP round, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
+
+// ASIMD FP convert, long
+// ASIMD FP convert, narrow
+// ASIMD FP convert, other, D-form
+// ASIMD FP convert, other, Q-form
+// NOTE: Handled by WriteV.
+
+// ASIMD FP convert, long and narrow
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^FCVT(L|N|XN)v")>;
+// ASIMD FP convert, other, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD FP convert, other, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv2f32)>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "FDIVv2f32")>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv4f32)>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instregex "FDIVv4f32")>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVv2f64)>;
+def : InstRW<[THX2T99Write_23Cyc_F01], (instregex "FDIVv2f64")>;
+
+// ASIMD FP max/min, normal, D-form
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv",
+ "^FMINv", "^FMINNMv")>;
+
+// ASIMD FP max/min, pairwise, D-form
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv",
+ "^FMINPv", "^FMINNMPv")>;
+
+// ASIMD FP max/min, reduce
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv",
+ "^FMINVv", "^FMINNMVv")>;
+
+// ASIMD FP multiply, D-form, FZ
+// ASIMD FP multiply, D-form, no FZ
+// ASIMD FP multiply, Q-form, FZ
+// ASIMD FP multiply, Q-form, no FZ
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, D-form, no FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+// ASIMD FP multiply accumulate, Q-form, no FZ
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
+def : InstRW<[THX2T99Write_6Cyc_F01],
+ (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP negate
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FNEGv")>;
+
+//--
+// 3.14 ASIMD Miscellaneous Instructions
+//--
+
+// ASIMD bit reverse
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>;
+
+// ASIMD bitwise insert, D-form
+// ASIMD bitwise insert, Q-form
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^BIFv", "^BITv", "^BSLv")>;
+
+// ASIMD count, D-form
+// ASIMD count, Q-form
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^CLSv", "^CLZv", "^CNTv")>;
+
+// ASIMD duplicate, gen reg
+// ASIMD duplicate, element
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CPY")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv.+gpr")>;
+
+// ASIMD extract
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^EXTv")>;
+
+// ASIMD extract narrow
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^XTNv")>;
+
+// ASIMD extract narrow, saturating
+def : InstRW<[THX2T99Write_7Cyc_F01],
+ (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>;
+
+// ASIMD insert, element to element
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>;
+
+// ASIMD move, integer immed
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
+
+// ASIMD move, FP immed
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMOVv")>;
+
+// ASIMD table lookup, D-form
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8One")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Two")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Three")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v8i8Four")>;
+
+// ASIMD table lookup, Q-form
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8One")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Two")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Three")>;
+def : InstRW<[THX2T99Write_7Cyc_F01], (instregex "^TB[LX]v16i8Four")>;
+
+// ASIMD transpose
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1", "^TRN2")>;
+
+// ASIMD unzip/zip
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>;
+
+// ASIMD reciprocal estimate, D-form
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
+ "^FRSQRTEv", "^URSQRTEv")>;
+
+// ASIMD reciprocal step, D-form, FZ
+// ASIMD reciprocal step, D-form, no FZ
+// ASIMD reciprocal step, Q-form, FZ
+// ASIMD reciprocal step, Q-form, no FZ
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>;
+
+// ASIMD reverse
+def : InstRW<[THX2T99Write_5Cyc_F01],
+ (instregex "^REV16v", "^REV32v", "^REV64v")>;
+
+// ASIMD table lookup, D-form
+// ASIMD table lookup, Q-form
+def : InstRW<[THX2T99Write_8Cyc_F01], (instregex "^TBLv", "^TBXv")>;
+
+// ASIMD transfer, element to word or doubleword
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "(S|U)MOVv.*")>;
+
+// ASIMD transfer gen reg to element
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>;
+
+// ASIMD transpose
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1v", "^TRN2v",
+ "^UZP1v", "^UZP2v")>;
+
+// ASIMD unzip/zip
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
+
+//--
+// 3.15 ASIMD Load Instructions
+//--
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[THX2T99Write_4Cyc_LS01],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[THX2T99Write_4Cyc_LS01],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[THX2T99Write_5Cyc_LS01],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01, WriteAdr],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[THX2T99Write_6Cyc_LS01],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_6Cyc_LS01, WriteAdr],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+//--
+// 3.16 ASIMD Store Instructions
+//--
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[THX2T99Write_1Cyc_LS01],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[THX2T99Write_1Cyc_LS01],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[THX2T99Write_1Cyc_LS01],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[THX2T99Write_1Cyc_LS01],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+ (instregex "^ST1i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+ (instregex "^ST2i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H
+// ASIMD store, 4 element, one lane, S
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr],
+ (instregex "^ST4i(8|16|32|64)_POST$")>;
+
+} // SchedModel = ThunderX2T99Model
+
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td b/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td
deleted file mode 100644
index 35a40c3..0000000
--- a/contrib/llvm/lib/Target/AArch64/AArch64SchedVulcan.td
+++ /dev/null
@@ -1,852 +0,0 @@
-//=- AArch64SchedVulcan.td - Vulcan Scheduling Defs ----------*- tablegen -*-=//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// 1. Introduction
-//
-// This file defines the machine model for Broadcom Vulcan to support
-// instruction scheduling and other instruction cost heuristics.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// 2. Pipeline Description.
-
-def VulcanModel : SchedMachineModel {
- let IssueWidth = 4; // 4 micro-ops dispatched at a time.
- let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer.
- let LoadLatency = 4; // Optimistic load latency.
- let MispredictPenalty = 12; // Extra cycles for mispredicted branch.
- // Determined via a mix of micro-arch details and experimentation.
- let LoopMicroOpBufferSize = 32;
- let PostRAScheduler = 1; // Using PostRA sched.
- let CompleteModel = 1;
-}
-
-// Define the issue ports.
-
-// Port 0: ALU, FP/SIMD.
-def VulcanP0 : ProcResource<1>;
-
-// Port 1: ALU, FP/SIMD, integer mul/div.
-def VulcanP1 : ProcResource<1>;
-
-// Port 2: ALU, Branch.
-def VulcanP2 : ProcResource<1>;
-
-// Port 3: Store data.
-def VulcanP3 : ProcResource<1>;
-
-// Port 4: Load/store.
-def VulcanP4 : ProcResource<1>;
-
-// Port 5: Load/store.
-def VulcanP5 : ProcResource<1>;
-
-let SchedModel = VulcanModel in {
-
-// Define groups for the functional units on each issue port. Each group
-// created will be used by a WriteRes later on.
-//
-// NOTE: Some groups only contain one member. This is a way to create names for
-// the various functional units that share a single issue port. For example,
-// "VulcanI1" for ALU ops on port 1 and "VulcanF1" for FP ops on port 1.
-
-// Integer divide and multiply micro-ops only on port 1.
-def VulcanI1 : ProcResGroup<[VulcanP1]>;
-
-// Branch micro-ops only on port 2.
-def VulcanI2 : ProcResGroup<[VulcanP2]>;
-
-// ALU micro-ops on ports 0, 1, and 2.
-def VulcanI012 : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2]>;
-
-// Crypto FP/SIMD micro-ops only on port 1.
-def VulcanF1 : ProcResGroup<[VulcanP1]>;
-
-// FP/SIMD micro-ops on ports 0 and 1.
-def VulcanF01 : ProcResGroup<[VulcanP0, VulcanP1]>;
-
-// Store data micro-ops only on port 3.
-def VulcanSD : ProcResGroup<[VulcanP3]>;
-
-// Load/store micro-ops on ports 4 and 5.
-def VulcanLS01 : ProcResGroup<[VulcanP4, VulcanP5]>;
-
-// 60 entry unified scheduler.
-def VulcanAny : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2,
- VulcanP3, VulcanP4, VulcanP5]> {
- let BufferSize=60;
-}
-
-// Define commonly used write types for InstRW specializations.
-// All definitions follow the format: VulcanWrite_<NumCycles>Cyc_<Resources>.
-
-// 3 cycles on I1.
-def VulcanWrite_3Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 3; }
-
-// 4 cycles on I1.
-def VulcanWrite_4Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 4; }
-
-// 1 cycle on I0, I1, or I2.
-def VulcanWrite_1Cyc_I012 : SchedWriteRes<[VulcanI012]> { let Latency = 1; }
-
-// 5 cycles on F1.
-def VulcanWrite_5Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 5; }
-
-// 7 cycles on F1.
-def VulcanWrite_7Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 7; }
-
-// 4 cycles on F0 or F1.
-def VulcanWrite_4Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 4; }
-
-// 5 cycles on F0 or F1.
-def VulcanWrite_5Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 5; }
-
-// 6 cycles on F0 or F1.
-def VulcanWrite_6Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 6; }
-
-// 7 cycles on F0 or F1.
-def VulcanWrite_7Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 7; }
-
-// 8 cycles on F0 or F1.
-def VulcanWrite_8Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 8; }
-
-// 16 cycles on F0 or F1.
-def VulcanWrite_16Cyc_F01 : SchedWriteRes<[VulcanF01]> {
- let Latency = 16;
- let ResourceCycles = [8];
-}
-
-// 23 cycles on F0 or F1.
-def VulcanWrite_23Cyc_F01 : SchedWriteRes<[VulcanF01]> {
- let Latency = 23;
- let ResourceCycles = [11];
-}
-
-// 1 cycles on LS0 or LS1.
-def VulcanWrite_1Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 1; }
-
-// 4 cycles on LS0 or LS1.
-def VulcanWrite_4Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 4; }
-
-// 5 cycles on LS0 or LS1.
-def VulcanWrite_5Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 5; }
-
-// 6 cycles on LS0 or LS1.
-def VulcanWrite_6Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 6; }
-
-// 5 cycles on LS0 or LS1 and I0, I1, or I2.
-def VulcanWrite_5Cyc_LS01_I012 : SchedWriteRes<[VulcanLS01, VulcanI012]> {
- let Latency = 5;
- let NumMicroOps = 2;
-}
-
-// 5 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
-def VulcanWrite_6Cyc_LS01_I012_I012 :
- SchedWriteRes<[VulcanLS01, VulcanI012, VulcanI012]> {
- let Latency = 6;
- let NumMicroOps = 3;
-}
-
-// 1 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_1Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
- let Latency = 1;
- let NumMicroOps = 2;
-}
-
-// 5 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_5Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
- let Latency = 5;
- let NumMicroOps = 2;
-}
-
-// 6 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_6Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
- let Latency = 6;
- let NumMicroOps = 2;
-}
-
-// 7 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_7Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
- let Latency = 7;
- let NumMicroOps = 2;
-}
-
-// 8 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_8Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
- let Latency = 8;
- let NumMicroOps = 2;
-}
-
-// Define commonly used read types.
-
-// No forwarding is provided for these types.
-def : ReadAdvance<ReadI, 0>;
-def : ReadAdvance<ReadISReg, 0>;
-def : ReadAdvance<ReadIEReg, 0>;
-def : ReadAdvance<ReadIM, 0>;
-def : ReadAdvance<ReadIMA, 0>;
-def : ReadAdvance<ReadID, 0>;
-def : ReadAdvance<ReadExtrHi, 0>;
-def : ReadAdvance<ReadAdrBase, 0>;
-def : ReadAdvance<ReadVLD, 0>;
-
-}
-
-
-//===----------------------------------------------------------------------===//
-// 3. Instruction Tables.
-
-let SchedModel = VulcanModel in {
-
-//---
-// 3.1 Branch Instructions
-//---
-
-// Branch, immed
-// Branch and link, immed
-// Compare and branch
-def : WriteRes<WriteBr, [VulcanI2]> { let Latency = 1; }
-
-def : WriteRes<WriteSys, []> { let Latency = 1; }
-def : WriteRes<WriteBarrier, []> { let Latency = 1; }
-def : WriteRes<WriteHint, []> { let Latency = 1; }
-
-def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
-
-// Branch, register
-// Branch and link, register != LR
-// Branch and link, register = LR
-def : WriteRes<WriteBrReg, [VulcanI2]> { let Latency = 1; }
-
-//---
-// 3.2 Arithmetic and Logical Instructions
-// 3.3 Move and Shift Instructions
-//---
-
-// ALU, basic
-// Conditional compare
-// Conditional select
-// Address generation
-def : WriteRes<WriteI, [VulcanI012]> { let Latency = 1; }
-def : InstRW<[WriteI], (instrs COPY)>;
-
-// ALU, extend and/or shift
-def : WriteRes<WriteISReg, [VulcanI012]> {
- let Latency = 2;
- let ResourceCycles = [2];
-}
-
-def : WriteRes<WriteIEReg, [VulcanI012]> {
- let Latency = 2;
- let ResourceCycles = [2];
-}
-
-// Move immed
-def : WriteRes<WriteImm, [VulcanI012]> { let Latency = 1; }
-
-// Variable shift
-def : WriteRes<WriteIS, [VulcanI012]> { let Latency = 1; }
-
-//---
-// 3.4 Divide and Multiply Instructions
-//---
-
-// Divide, W-form
-// Latency range of 13-23. Take the average.
-def : WriteRes<WriteID32, [VulcanI1]> {
- let Latency = 18;
- let ResourceCycles = [18];
-}
-
-// Divide, X-form
-// Latency range of 13-39. Take the average.
-def : WriteRes<WriteID64, [VulcanI1]> {
- let Latency = 26;
- let ResourceCycles = [26];
-}
-
-// Multiply accumulate, W-form
-def : WriteRes<WriteIM32, [VulcanI012]> { let Latency = 5; }
-
-// Multiply accumulate, X-form
-def : WriteRes<WriteIM64, [VulcanI012]> { let Latency = 5; }
-
-// Bitfield extract, two reg
-def : WriteRes<WriteExtr, [VulcanI012]> { let Latency = 1; }
-
-// Bitfield move, basic
-// Bitfield move, insert
-// NOTE: Handled by WriteIS.
-
-// Count leading
-def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$",
- "^CLZ(W|X)r$")>;
-
-// Reverse bits/bytes
-// NOTE: Handled by WriteI.
-
-//---
-// 3.6 Load Instructions
-// 3.10 FP Load Instructions
-//---
-
-// Load register, literal
-// Load register, unscaled immed
-// Load register, immed unprivileged
-// Load register, unsigned immed
-def : WriteRes<WriteLD, [VulcanLS01]> { let Latency = 4; }
-
-// Load register, immed post-index
-// NOTE: Handled by WriteLD, WriteI.
-// Load register, immed pre-index
-// NOTE: Handled by WriteLD, WriteAdr.
-def : WriteRes<WriteAdr, [VulcanI012]> { let Latency = 1; }
-
-// Load register offset, basic
-// Load register, register offset, scale by 4/8
-// Load register, register offset, scale by 2
-// Load register offset, extend
-// Load register, register offset, extend, scale by 4/8
-// Load register, register offset, extend, scale by 2
-def VulcanWriteLDIdx : SchedWriteVariant<[
- SchedVar<ScaledIdxPred, [VulcanWrite_6Cyc_LS01_I012_I012]>,
- SchedVar<NoSchedPred, [VulcanWrite_5Cyc_LS01_I012]>]>;
-def : SchedAlias<WriteLDIdx, VulcanWriteLDIdx>;
-
-def VulcanReadAdrBase : SchedReadVariant<[
- SchedVar<ScaledIdxPred, [ReadDefault]>,
- SchedVar<NoSchedPred, [ReadDefault]>]>;
-def : SchedAlias<ReadAdrBase, VulcanReadAdrBase>;
-
-// Load pair, immed offset, normal
-// Load pair, immed offset, signed words, base != SP
-// Load pair, immed offset signed words, base = SP
-// LDP only breaks into *one* LS micro-op. Thus
-// the resources are handled by WriteLD.
-def : WriteRes<WriteLDHi, []> {
- let Latency = 5;
-}
-
-// Load pair, immed pre-index, normal
-// Load pair, immed pre-index, signed words
-// Load pair, immed post-index, normal
-// Load pair, immed post-index, signed words
-// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
-
-//--
-// 3.7 Store Instructions
-// 3.11 FP Store Instructions
-//--
-
-// Store register, unscaled immed
-// Store register, immed unprivileged
-// Store register, unsigned immed
-def : WriteRes<WriteST, [VulcanLS01, VulcanSD]> {
- let Latency = 1;
- let NumMicroOps = 2;
-}
-
-// Store register, immed post-index
-// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
-
-// Store register, immed pre-index
-// NOTE: Handled by WriteAdr, WriteST
-
-// Store register, register offset, basic
-// Store register, register offset, scaled by 4/8
-// Store register, register offset, scaled by 2
-// Store register, register offset, extend
-// Store register, register offset, extend, scale by 4/8
-// Store register, register offset, extend, scale by 1
-def : WriteRes<WriteSTIdx, [VulcanLS01, VulcanSD, VulcanI012]> {
- let Latency = 1;
- let NumMicroOps = 3;
-}
-
-// Store pair, immed offset, W-form
-// Store pair, immed offset, X-form
-def : WriteRes<WriteSTP, [VulcanLS01, VulcanSD]> {
- let Latency = 1;
- let NumMicroOps = 2;
-}
-
-// Store pair, immed post-index, W-form
-// Store pair, immed post-index, X-form
-// Store pair, immed pre-index, W-form
-// Store pair, immed pre-index, X-form
-// NOTE: Handled by WriteAdr, WriteSTP.
-
-//---
-// 3.8 FP Data Processing Instructions
-//---
-
-// FP absolute value
-// FP min/max
-// FP negate
-def : WriteRes<WriteF, [VulcanF01]> { let Latency = 5; }
-
-// FP arithmetic
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
-
-// FP compare
-def : WriteRes<WriteFCmp, [VulcanF01]> { let Latency = 5; }
-
-// FP divide, S-form
-// FP square root, S-form
-def : WriteRes<WriteFDiv, [VulcanF01]> {
- let Latency = 16;
- let ResourceCycles = [8];
-}
-
-// FP divide, D-form
-// FP square root, D-form
-def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>;
-
-// FP multiply
-// FP multiply accumulate
-def : WriteRes<WriteFMul, [VulcanF01]> { let Latency = 6; }
-
-// FP round to integral
-def : InstRW<[VulcanWrite_7Cyc_F01],
- (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
-
-// FP select
-def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>;
-
-//---
-// 3.9 FP Miscellaneous Instructions
-//---
-
-// FP convert, from vec to vec reg
-// FP convert, from gen to vec reg
-// FP convert, from vec to gen reg
-def : WriteRes<WriteFCvt, [VulcanF01]> { let Latency = 7; }
-
-// FP move, immed
-// FP move, register
-def : WriteRes<WriteFImm, [VulcanF01]> { let Latency = 4; }
-
-// FP transfer, from gen to vec reg
-// FP transfer, from vec to gen reg
-def : WriteRes<WriteFCopy, [VulcanF01]> { let Latency = 4; }
-def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
-
-//---
-// 3.12 ASIMD Integer Instructions
-//---
-
-// ASIMD absolute diff, D-form
-// ASIMD absolute diff, Q-form
-// ASIMD absolute diff accum, D-form
-// ASIMD absolute diff accum, Q-form
-// ASIMD absolute diff accum long
-// ASIMD absolute diff long
-// ASIMD arith, basic
-// ASIMD arith, complex
-// ASIMD compare
-// ASIMD logical (AND, BIC, EOR)
-// ASIMD max/min, basic
-// ASIMD max/min, reduce, 4H/4S
-// ASIMD max/min, reduce, 8B/8H
-// ASIMD max/min, reduce, 16B
-// ASIMD multiply, D-form
-// ASIMD multiply, Q-form
-// ASIMD multiply accumulate long
-// ASIMD multiply accumulate saturating long
-// ASIMD multiply long
-// ASIMD pairwise add and accumulate
-// ASIMD shift accumulate
-// ASIMD shift by immed, basic
-// ASIMD shift by immed and insert, basic, D-form
-// ASIMD shift by immed and insert, basic, Q-form
-// ASIMD shift by immed, complex
-// ASIMD shift by register, basic, D-form
-// ASIMD shift by register, basic, Q-form
-// ASIMD shift by register, complex, D-form
-// ASIMD shift by register, complex, Q-form
-def : WriteRes<WriteV, [VulcanF01]> { let Latency = 7; }
-
-// ASIMD arith, reduce, 4H/4S
-// ASIMD arith, reduce, 8B/8H
-// ASIMD arith, reduce, 16B
-def : InstRW<[VulcanWrite_5Cyc_F01],
- (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
-
-// ASIMD logical (MOV, MVN, ORN, ORR)
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>;
-
-// ASIMD polynomial (8x8) multiply long
-def : InstRW<[VulcanWrite_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>;
-
-//---
-// 3.13 ASIMD Floating-point Instructions
-//---
-
-// ASIMD FP absolute value
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FABSv")>;
-
-// ASIMD FP arith, normal, D-form
-// ASIMD FP arith, normal, Q-form
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>;
-
-// ASIMD FP arith, pairwise, D-form
-// ASIMD FP arith, pairwise, Q-form
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADDPv")>;
-
-// ASIMD FP compare, D-form
-// ASIMD FP compare, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>;
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
- "^FCMGTv", "^FCMLEv",
- "^FCMLTv")>;
-
-// ASIMD FP convert, long
-// ASIMD FP convert, narrow
-// ASIMD FP convert, other, D-form
-// ASIMD FP convert, other, Q-form
-// NOTE: Handled by WriteV.
-
-// ASIMD FP divide, D-form, F32
-def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv2f32)>;
-
-// ASIMD FP divide, Q-form, F32
-def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv4f32)>;
-
-// ASIMD FP divide, Q-form, F64
-def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVv2f64)>;
-
-// ASIMD FP max/min, normal, D-form
-// ASIMD FP max/min, normal, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv",
- "^FMINv", "^FMINNMv")>;
-
-// ASIMD FP max/min, pairwise, D-form
-// ASIMD FP max/min, pairwise, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv",
- "^FMINPv", "^FMINNMPv")>;
-
-// ASIMD FP max/min, reduce
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv",
- "^FMINVv", "^FMINNMVv")>;
-
-// ASIMD FP multiply, D-form, FZ
-// ASIMD FP multiply, D-form, no FZ
-// ASIMD FP multiply, Q-form, FZ
-// ASIMD FP multiply, Q-form, no FZ
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>;
-
-// ASIMD FP multiply accumulate, Dform, FZ
-// ASIMD FP multiply accumulate, Dform, no FZ
-// ASIMD FP multiply accumulate, Qform, FZ
-// ASIMD FP multiply accumulate, Qform, no FZ
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>;
-
-// ASIMD FP negate
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>;
-
-// ASIMD FP round, D-form
-// ASIMD FP round, Q-form
-// NOTE: Handled by WriteV.
-
-//--
-// 3.14 ASIMD Miscellaneous Instructions
-//--
-
-// ASIMD bit reverse
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^RBITv")>;
-
-// ASIMD bitwise insert, D-form
-// ASIMD bitwise insert, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>;
-
-// ASIMD count, D-form
-// ASIMD count, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>;
-
-// ASIMD duplicate, gen reg
-// ASIMD duplicate, element
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^DUPv")>;
-
-// ASIMD extract
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^EXTv")>;
-
-// ASIMD extract narrow
-// ASIMD extract narrow, saturating
-// NOTE: Handled by WriteV.
-
-// ASIMD insert, element to element
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
-
-// ASIMD move, integer immed
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
-
-// ASIMD move, FP immed
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMOVv")>;
-
-// ASIMD reciprocal estimate, D-form
-// ASIMD reciprocal estimate, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01],
- (instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
- "^FRSQRTEv", "^URSQRTEv")>;
-
-// ASIMD reciprocal step, D-form, FZ
-// ASIMD reciprocal step, D-form, no FZ
-// ASIMD reciprocal step, Q-form, FZ
-// ASIMD reciprocal step, Q-form, no FZ
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>;
-
-// ASIMD reverse
-def : InstRW<[VulcanWrite_5Cyc_F01],
- (instregex "^REV16v", "^REV32v", "^REV64v")>;
-
-// ASIMD table lookup, D-form
-// ASIMD table lookup, Q-form
-def : InstRW<[VulcanWrite_8Cyc_F01], (instregex "^TBLv", "^TBXv")>;
-
-// ASIMD transfer, element to word or dword
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^UMOVv")>;
-
-// ASIMD transfer, element to gen reg
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>;
-
-// ASIMD transfer gen reg to element
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
-
-// ASIMD transpose
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^TRN1v", "^TRN2v",
- "^UZP1v", "^UZP2v")>;
-
-// ASIMD unzip/zip
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
-
-//--
-// 3.15 ASIMD Load Instructions
-//--
-
-// ASIMD load, 1 element, multiple, 1 reg, D-form
-// ASIMD load, 1 element, multiple, 1 reg, Q-form
-def : InstRW<[VulcanWrite_4Cyc_LS01],
- (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr],
- (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 1 element, multiple, 2 reg, D-form
-// ASIMD load, 1 element, multiple, 2 reg, Q-form
-def : InstRW<[VulcanWrite_4Cyc_LS01],
- (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr],
- (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 1 element, multiple, 3 reg, D-form
-// ASIMD load, 1 element, multiple, 3 reg, Q-form
-def : InstRW<[VulcanWrite_5Cyc_LS01],
- (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01, WriteAdr],
- (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 1 element, multiple, 4 reg, D-form
-// ASIMD load, 1 element, multiple, 4 reg, Q-form
-def : InstRW<[VulcanWrite_6Cyc_LS01],
- (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_6Cyc_LS01, WriteAdr],
- (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 1 element, one lane, B/H/S
-// ASIMD load, 1 element, one lane, D
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
- (instregex "^LD1i(8|16|32|64)_POST$")>;
-
-// ASIMD load, 1 element, all lanes, D-form, B/H/S
-// ASIMD load, 1 element, all lanes, D-form, D
-// ASIMD load, 1 element, all lanes, Q-form
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
- (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
- (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 2 element, multiple, D-form, B/H/S
-// ASIMD load, 2 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
- (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
- (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 2 element, one lane, B/H
-// ASIMD load, 2 element, one lane, S
-// ASIMD load, 2 element, one lane, D
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
- (instregex "^LD2i(8|16|32|64)_POST$")>;
-
-// ASIMD load, 2 element, all lanes, D-form, B/H/S
-// ASIMD load, 2 element, all lanes, D-form, D
-// ASIMD load, 2 element, all lanes, Q-form
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01],
- (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr],
- (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 3 element, multiple, D-form, B/H/S
-// ASIMD load, 3 element, multiple, Q-form, B/H/S
-// ASIMD load, 3 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01],
- (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr],
- (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 3 element, one lane, B/H
-// ASIMD load, 3 element, one lane, S
-// ASIMD load, 3 element, one lane, D
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr],
- (instregex "^LD3i(8|16|32|64)_POST$")>;
-
-// ASIMD load, 3 element, all lanes, D-form, B/H/S
-// ASIMD load, 3 element, all lanes, D-form, D
-// ASIMD load, 3 element, all lanes, Q-form, B/H/S
-// ASIMD load, 3 element, all lanes, Q-form, D
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01],
- (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr],
- (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 4 element, multiple, D-form, B/H/S
-// ASIMD load, 4 element, multiple, Q-form, B/H/S
-// ASIMD load, 4 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01],
- (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr],
- (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD load, 4 element, one lane, B/H
-// ASIMD load, 4 element, one lane, S
-// ASIMD load, 4 element, one lane, D
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
- (instregex "^LD4i(8|16|32|64)_POST$")>;
-
-// ASIMD load, 4 element, all lanes, D-form, B/H/S
-// ASIMD load, 4 element, all lanes, D-form, D
-// ASIMD load, 4 element, all lanes, Q-form, B/H/S
-// ASIMD load, 4 element, all lanes, Q-form, D
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01],
- (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
- (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-//--
-// 3.16 ASIMD Store Instructions
-//--
-
-// ASIMD store, 1 element, multiple, 1 reg, D-form
-// ASIMD store, 1 element, multiple, 1 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01],
- (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
- (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 1 element, multiple, 2 reg, D-form
-// ASIMD store, 1 element, multiple, 2 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01],
- (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
- (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 1 element, multiple, 3 reg, D-form
-// ASIMD store, 1 element, multiple, 3 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01],
- (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
- (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 1 element, multiple, 4 reg, D-form
-// ASIMD store, 1 element, multiple, 4 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01],
- (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr],
- (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 1 element, one lane, B/H/S
-// ASIMD store, 1 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
- (instregex "^ST1i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST1i(8|16|32|64)_POST$")>;
-
-// ASIMD store, 2 element, multiple, D-form, B/H/S
-// ASIMD store, 2 element, multiple, Q-form, B/H/S
-// ASIMD store, 2 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
- (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 2 element, one lane, B/H/S
-// ASIMD store, 2 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
- (instregex "^ST2i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST2i(8|16|32|64)_POST$")>;
-
-// ASIMD store, 3 element, multiple, D-form, B/H/S
-// ASIMD store, 3 element, multiple, Q-form, B/H/S
-// ASIMD store, 3 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
- (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 3 element, one lane, B/H
-// ASIMD store, 3 element, one lane, S
-// ASIMD store, 3 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST3i(8|16|32|64)_POST$")>;
-
-// ASIMD store, 4 element, multiple, D-form, B/H/S
-// ASIMD store, 4 element, multiple, Q-form, B/H/S
-// ASIMD store, 4 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01],
- (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
-
-// ASIMD store, 4 element, one lane, B/H
-// ASIMD store, 4 element, one lane, S
-// ASIMD store, 4 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
- (instregex "^ST4i(8|16|32|64)_POST$")>;
-
-//--
-// 3.17 Cryptography Extensions
-//--
-
-// Crypto AES ops
-def : InstRW<[VulcanWrite_5Cyc_F1], (instregex "^AES")>;
-
-// Crypto polynomial (64x64) multiply long
-def : InstRW<[VulcanWrite_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>;
-
-// Crypto SHA1 xor ops
-// Crypto SHA1 schedule acceleration ops
-// Crypto SHA256 schedule acceleration op (1 u-op)
-// Crypto SHA256 schedule acceleration op (2 u-ops)
-// Crypto SHA256 hash acceleration ops
-def : InstRW<[VulcanWrite_7Cyc_F1], (instregex "^SHA")>;
-
-//--
-// 3.18 CRC
-//--
-
-// CRC checksum ops
-def : InstRW<[VulcanWrite_4Cyc_I1], (instregex "^CRC32")>;
-
-} // SchedModel = VulcanModel
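The deleted Vulcan model above pairs each write class with a latency and micro-op count, selects between the 5-cycle and 6-cycle indexed-load classes depending on whether the register offset is scaled, and models integer division with the midpoint of the documented latency range (13-23 -> 18, 13-39 -> 26). A minimal standalone C++ sketch of that bookkeeping, using plain structs rather than LLVM's scheduling-model API:

```cpp
// Standalone sketch (not LLVM's SchedMachineModel API): roughly how the
// deleted Vulcan entries above map instruction classes to latencies.
#include <cassert>
#include <iostream>

struct WriteRes {
  unsigned Latency;     // result latency in cycles
  unsigned NumMicroOps; // issue slots consumed
};

// Indexed loads: 6 cycles / 3 uops when the register offset is scaled,
// otherwise 5 cycles / 2 uops (mirrors VulcanWriteLDIdx above).
WriteRes classifyIndexedLoad(bool ScaledIdx) {
  return ScaledIdx ? WriteRes{6, 3} : WriteRes{5, 2};
}

// Integer divides use the midpoint of the documented latency range,
// e.g. W-form 13-23 -> 18 (WriteID32) and X-form 13-39 -> 26 (WriteID64).
unsigned midpointLatency(unsigned Lo, unsigned Hi) { return (Lo + Hi) / 2; }

int main() {
  assert(classifyIndexedLoad(true).Latency == 6);
  assert(classifyIndexedLoad(false).NumMicroOps == 2);
  assert(midpointLatency(13, 23) == 18); // WriteID32
  assert(midpointLatency(13, 39) == 26); // WriteID64
  std::cout << "latency sketch ok\n";
}
```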
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 66a8f33..7f55073 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -42,10 +42,12 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
Entry.Node = Size;
Args.push_back(Entry);
TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl).setChain(Chain)
- .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
- DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args))
- .setDiscardResult();
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol(bzeroEntry, IntPtr),
+ std::move(Args))
+ .setDiscardResult();
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
return CallResult.second;
}
@@ -53,7 +55,5 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
}
bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner(
CodeGenOpt::Level OptLevel) const {
- if (OptLevel >= CodeGenOpt::Aggressive)
- return true;
- return false;
+ return OptLevel >= CodeGenOpt::Aggressive;
}
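The generateFMAsInMachineCombiner change above is the usual boolean-return simplification: a conditional that returns true/false literals collapses to returning the condition itself. A tiny standalone sketch of the before/after shape; the OptLevel enum here is an illustrative stand-in for CodeGenOpt::Level:

```cpp
// Minimal sketch of the boolean-return cleanup applied above.
#include <cassert>

enum class OptLevel { None = 0, Less = 1, Default = 2, Aggressive = 3 };

// Before: branch on the condition and return literals.
bool generateFMAsVerbose(OptLevel OL) {
  if (OL >= OptLevel::Aggressive)
    return true;
  return false;
}

// After: return the condition directly -- identical behaviour, less code.
bool generateFMAs(OptLevel OL) { return OL >= OptLevel::Aggressive; }

int main() {
  assert(generateFMAsVerbose(OptLevel::None) == generateFMAs(OptLevel::None));
  assert(generateFMAsVerbose(OptLevel::Aggressive) ==
         generateFMAs(OptLevel::Aggressive));
}
```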
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 03e0132..ea61124 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -12,8 +12,22 @@
//===----------------------------------------------------------------------===//
#include "AArch64Subtarget.h"
+
+#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64PBQPRegAlloc.h"
+#include "AArch64TargetMachine.h"
+
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "AArch64CallLowering.h"
+#include "AArch64LegalizerInfo.h"
+#include "AArch64RegisterBankInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#endif
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/TargetRegistry.h"
@@ -35,6 +49,11 @@ static cl::opt<bool>
UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of "
"an address is ignored"), cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ UseNonLazyBind("aarch64-enable-nonlazybind",
+ cl::desc("Call nonlazybind functions via direct GOT load"),
+ cl::init(false), cl::Hidden);
+
AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
StringRef CPUString) {
@@ -62,6 +81,7 @@ void AArch64Subtarget::initializeProperties() {
break;
case CortexA57:
MaxInterleaveFactor = 4;
+ PrefFunctionAlignment = 4;
break;
case ExynosM1:
MaxInterleaveFactor = 4;
@@ -71,7 +91,12 @@ void AArch64Subtarget::initializeProperties() {
break;
case Falkor:
MaxInterleaveFactor = 4;
- VectorInsertExtractBaseCost = 2;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
+ CacheLineSize = 128;
+ PrefetchDistance = 820;
+ MinPrefetchStride = 2048;
+ MaxPrefetchIterationsAhead = 8;
break;
case Kryo:
MaxInterleaveFactor = 4;
@@ -80,25 +105,99 @@ void AArch64Subtarget::initializeProperties() {
PrefetchDistance = 740;
MinPrefetchStride = 1024;
MaxPrefetchIterationsAhead = 11;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
- case Vulcan:
+ case ThunderX2T99:
+ CacheLineSize = 64;
+ PrefFunctionAlignment = 3;
+ PrefLoopAlignment = 2;
MaxInterleaveFactor = 4;
+ PrefetchDistance = 128;
+ MinPrefetchStride = 1024;
+ MaxPrefetchIterationsAhead = 4;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
+ break;
+ case ThunderX:
+ case ThunderXT88:
+ case ThunderXT81:
+ case ThunderXT83:
+ CacheLineSize = 128;
+ PrefFunctionAlignment = 3;
+ PrefLoopAlignment = 2;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
break;
case CortexA35: break;
case CortexA53: break;
- case CortexA72: break;
- case CortexA73: break;
+ case CortexA72:
+ PrefFunctionAlignment = 4;
+ break;
+ case CortexA73:
+ PrefFunctionAlignment = 4;
+ break;
case Others: break;
}
}
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+namespace {
+
+struct AArch64GISelActualAccessor : public GISelAccessor {
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+
+ const CallLowering *getCallLowering() const override {
+ return CallLoweringInfo.get();
+ }
+
+ const InstructionSelector *getInstructionSelector() const override {
+ return InstSelector.get();
+ }
+
+ const LegalizerInfo *getLegalizerInfo() const override {
+ return Legalizer.get();
+ }
+
+ const RegisterBankInfo *getRegBankInfo() const override {
+ return RegBankInfo.get();
+ }
+};
+
+} // end anonymous namespace
+#endif
+
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian)
- : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()),
+ : AArch64GenSubtargetInfo(TT, CPU, FS),
+ ReserveX18(TT.isOSDarwin() || TT.isOSWindows()),
IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(),
InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(),
- TLInfo(TM, *this), GISel() {}
+ TLInfo(TM, *this), GISel() {
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+ GISelAccessor *AArch64GISel = new GISelAccessor();
+#else
+ AArch64GISelActualAccessor *AArch64GISel = new AArch64GISelActualAccessor();
+ AArch64GISel->CallLoweringInfo.reset(
+ new AArch64CallLowering(*getTargetLowering()));
+ AArch64GISel->Legalizer.reset(new AArch64LegalizerInfo());
+
+ auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());
+
+ // FIXME: At this point, we can't rely on Subtarget having RBI.
+ // It's awkward to mix passing RBI and the Subtarget; should we pass
+ // TII/TRI as well?
+ AArch64GISel->InstSelector.reset(createAArch64InstructionSelector(
+ *static_cast<const AArch64TargetMachine *>(&TM), *this, *RBI));
+
+ AArch64GISel->RegBankInfo.reset(RBI);
+#endif
+ setGISelAccessor(*AArch64GISel);
+}
const CallLowering *AArch64Subtarget::getCallLowering() const {
assert(GISel && "Access to GlobalISel APIs not set");
@@ -133,9 +232,26 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
return AArch64II::MO_GOT;
- // The small code mode's direct accesses use ADRP, which cannot necessarily
- // produce the value 0 (if the code is above 4GB).
- if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage())
+ // The small code model's direct accesses use ADRP, which cannot
+ // necessarily produce the value 0 (if the code is above 4GB).
+ if (useSmallAddressing() && GV->hasExternalWeakLinkage())
+ return AArch64II::MO_GOT;
+
+ return AArch64II::MO_NO_FLAG;
+}
+
+unsigned char AArch64Subtarget::classifyGlobalFunctionReference(
+ const GlobalValue *GV, const TargetMachine &TM) const {
+ // MachO large model always goes via a GOT, because we don't have the
+ // relocations available to do anything else.
+ if (TM.getCodeModel() == CodeModel::Large && isTargetMachO() &&
+ !GV->hasInternalLinkage())
+ return AArch64II::MO_GOT;
+
+ // NonLazyBind goes via GOT unless we know it's available locally.
+ auto *F = dyn_cast<Function>(GV);
+ if (UseNonLazyBind && F && F->hasFnAttribute(Attribute::NonLazyBind) &&
+ !TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
return AArch64II::MO_GOT;
return AArch64II::MO_NO_FLAG;
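classifyGlobalFunctionReference above checks two GOT-forcing conditions in order: the MachO large code model without internal linkage, and nonlazybind functions that are not known to be DSO-local (gated by -aarch64-enable-nonlazybind). A standalone sketch of that decision order, with simplified stand-in types rather than LLVM's GlobalValue/TargetMachine:

```cpp
// Standalone sketch (assumed simplified types, not LLVM's) of the decision
// order in classifyGlobalFunctionReference above.
#include <cassert>

enum class CodeModel { Small, Kernel, Medium, Large };
enum class Ref { GOT, NoFlag };

struct FuncInfo {
  bool HasInternalLinkage;
  bool IsNonLazyBind; // function carries the nonlazybind attribute
  bool IsDSOLocal;    // definition known to be local to this DSO
};

Ref classifyFunctionRef(const FuncInfo &F, CodeModel CM, bool IsMachO,
                        bool UseNonLazyBind /* -aarch64-enable-nonlazybind */) {
  // MachO large code model has no suitable relocations, so go via the GOT
  // unless the symbol is internal.
  if (CM == CodeModel::Large && IsMachO && !F.HasInternalLinkage)
    return Ref::GOT;

  // nonlazybind functions go via the GOT unless known DSO-local.
  if (UseNonLazyBind && F.IsNonLazyBind && !F.IsDSOLocal)
    return Ref::GOT;

  return Ref::NoFlag;
}

int main() {
  assert(classifyFunctionRef({false, false, false}, CodeModel::Large,
                             /*IsMachO=*/true, false) == Ref::GOT);
  assert(classifyFunctionRef({false, true, false}, CodeModel::Small,
                             /*IsMachO=*/false, true) == Ref::GOT);
  assert(classifyFunctionRef({false, false, true}, CodeModel::Small,
                             /*IsMachO=*/false, true) == Ref::NoFlag);
}
```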
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
index a993402..5a1f45e 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -45,7 +45,11 @@ public:
ExynosM1,
Falkor,
Kryo,
- Vulcan
+ ThunderX2T99,
+ ThunderX,
+ ThunderXT81,
+ ThunderXT83,
+ ThunderXT88
};
protected:
@@ -61,9 +65,12 @@ protected:
bool HasCRC = false;
bool HasLSE = false;
bool HasRAS = false;
+ bool HasRDM = false;
bool HasPerfMon = false;
bool HasFullFP16 = false;
bool HasSPE = false;
+ bool HasLSLFast = false;
+ bool HasSVE = false;
// HasZeroCycleRegMove - Has zero-cycle register mov instructions.
bool HasZeroCycleRegMove = false;
@@ -73,6 +80,13 @@ protected:
// StrictAlign - Disallow unaligned memory accesses.
bool StrictAlign = false;
+
+ // NegativeImmediates - transform instructions with negative immediates
+ bool NegativeImmediates = true;
+
+ // Enable 64-bit vectorization in SLP.
+ unsigned MinVectorRegisterBitWidth = 64;
+
bool UseAA = false;
bool PredictableSelectIsExpensive = false;
bool BalanceFPOps = false;
@@ -83,6 +97,8 @@ protected:
bool UseAlternateSExtLoadCVTF32Pattern = false;
bool HasArithmeticBccFusion = false;
bool HasArithmeticCbzFusion = false;
+ bool HasFuseAES = false;
+ bool HasFuseLiterals = false;
bool DisableLatencySchedHeuristic = false;
bool UseRSqrt = false;
uint8_t MaxInterleaveFactor = 2;
@@ -94,6 +110,7 @@ protected:
unsigned PrefFunctionAlignment = 0;
unsigned PrefLoopAlignment = 0;
unsigned MaxJumpTableSize = 0;
+ unsigned WideningBaseCost = 0;
// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
@@ -176,6 +193,10 @@ public:
bool isXRaySupported() const override { return true; }
+ unsigned getMinVectorRegisterBitWidth() const {
+ return MinVectorRegisterBitWidth;
+ }
+
bool isX18Reserved() const { return ReserveX18; }
bool hasFPARMv8() const { return HasFPARMv8; }
bool hasNEON() const { return HasNEON; }
@@ -183,6 +204,7 @@ public:
bool hasCRC() const { return HasCRC; }
bool hasLSE() const { return HasLSE; }
bool hasRAS() const { return HasRAS; }
+ bool hasRDM() const { return HasRDM; }
bool balanceFPOps() const { return BalanceFPOps; }
bool predictableSelectIsExpensive() const {
return PredictableSelectIsExpensive;
@@ -195,6 +217,15 @@ public:
}
bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
+ bool hasFuseAES() const { return HasFuseAES; }
+ bool hasFuseLiterals() const { return HasFuseLiterals; }
+
+ /// \brief Return true if the CPU supports any kind of instruction fusion.
+ bool hasFusion() const {
+ return hasArithmeticBccFusion() || hasArithmeticCbzFusion() ||
+ hasFuseAES() || hasFuseLiterals();
+ }
+
bool useRSqrt() const { return UseRSqrt; }
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
unsigned getVectorInsertExtractBaseCost() const {
@@ -211,6 +242,8 @@ public:
unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; }
+ unsigned getWideningBaseCost() const { return WideningBaseCost; }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
@@ -218,6 +251,8 @@ public:
bool hasPerfMon() const { return HasPerfMon; }
bool hasFullFP16() const { return HasFullFP16; }
bool hasSPE() const { return HasSPE; }
+ bool hasLSLFast() const { return HasLSLFast; }
+ bool hasSVE() const { return HasSVE; }
bool isLittleEndian() const { return IsLittle; }
@@ -226,6 +261,7 @@ public:
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
+ bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }
bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
@@ -233,9 +269,17 @@ public:
bool useAA() const override { return UseAA; }
- /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
- /// that still makes it profitable to inline the call.
- unsigned getMaxInlineSizeThreshold() const { return 64; }
+ bool useSmallAddressing() const {
+ switch (TLInfo.getTargetMachine().getCodeModel()) {
+ case CodeModel::Kernel:
+ // Kernel is currently allowed only for Fuchsia targets,
+ // where it is the same as Small for almost all purposes.
+ case CodeModel::Small:
+ return true;
+ default:
+ return false;
+ }
+ }
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
@@ -246,6 +290,9 @@ public:
unsigned char ClassifyGlobalReference(const GlobalValue *GV,
const TargetMachine &TM) const;
+ unsigned char classifyGlobalFunctionReference(const GlobalValue *GV,
+ const TargetMachine &TM) const;
+
/// This function returns the name of a function which has an interface
/// like the non-standard bzero function, if such a function exists on
/// the current subtarget and it is considered preferable over
@@ -259,6 +306,17 @@ public:
bool enableEarlyIfConversion() const override;
std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override;
+
+ bool isCallingConvWin64(CallingConv::ID CC) const {
+ switch (CC) {
+ case CallingConv::C:
+ return isTargetWindows();
+ case CallingConv::Win64:
+ return true;
+ default:
+ return false;
+ }
+ }
};
} // End llvm namespace
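The two new predicates added to AArch64Subtarget.h above reduce to small switches: useSmallAddressing treats the Kernel code model like Small, and isCallingConvWin64 makes the C convention Win64 only on Windows targets. A standalone sketch with stand-in enums (not LLVM's CodeModel/CallingConv):

```cpp
// Standalone sketch of the two new subtarget predicates above.
#include <cassert>

enum class CM { Small, Kernel, Medium, Large };
enum class CC { C, Win64, Other };

// Kernel is treated like Small for addressing purposes (Fuchsia only).
bool useSmallAddressing(CM CodeModel) {
  switch (CodeModel) {
  case CM::Kernel:
  case CM::Small:
    return true;
  default:
    return false;
  }
}

// The C calling convention is Win64 only when targeting Windows; an
// explicit Win64 convention always is.
bool isCallingConvWin64(CC CallConv, bool IsTargetWindows) {
  switch (CallConv) {
  case CC::C:
    return IsTargetWindows;
  case CC::Win64:
    return true;
  default:
    return false;
  }
}

int main() {
  assert(useSmallAddressing(CM::Kernel));
  assert(!useSmallAddressing(CM::Large));
  assert(isCallingConvWin64(CC::C, /*IsTargetWindows=*/true));
  assert(!isCallingConvWin64(CC::C, /*IsTargetWindows=*/false));
}
```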
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index a3736c0..7c5dcb0 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/contrib/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -18,35 +18,37 @@ include "llvm/TableGen/SearchableTable.td"
// AT (address translate) instruction options.
//===----------------------------------------------------------------------===//
-class AT<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+class AT<string name, bits<3> op1, bits<4> crn, bits<4> crm,
bits<3> op2> : SearchableTable {
let SearchableFields = ["Name", "Encoding"];
let EnumValueField = "Encoding";
string Name = name;
- bits<16> Encoding;
- let Encoding{15-14} = op0;
+ bits<14> Encoding;
let Encoding{13-11} = op1;
let Encoding{10-7} = crn;
let Encoding{6-3} = crm;
let Encoding{2-0} = op2;
+ code Requires = [{ {} }];
}
-def : AT<"S1E1R", 0b01, 0b000, 0b0111, 0b1000, 0b000>;
-def : AT<"S1E2R", 0b01, 0b100, 0b0111, 0b1000, 0b000>;
-def : AT<"S1E3R", 0b01, 0b110, 0b0111, 0b1000, 0b000>;
-def : AT<"S1E1W", 0b01, 0b000, 0b0111, 0b1000, 0b001>;
-def : AT<"S1E2W", 0b01, 0b100, 0b0111, 0b1000, 0b001>;
-def : AT<"S1E3W", 0b01, 0b110, 0b0111, 0b1000, 0b001>;
-def : AT<"S1E0R", 0b01, 0b000, 0b0111, 0b1000, 0b010>;
-def : AT<"S1E0W", 0b01, 0b000, 0b0111, 0b1000, 0b011>;
-def : AT<"S12E1R", 0b01, 0b100, 0b0111, 0b1000, 0b100>;
-def : AT<"S12E1W", 0b01, 0b100, 0b0111, 0b1000, 0b101>;
-def : AT<"S12E0R", 0b01, 0b100, 0b0111, 0b1000, 0b110>;
-def : AT<"S12E0W", 0b01, 0b100, 0b0111, 0b1000, 0b111>;
-def : AT<"S1E1RP", 0b01, 0b000, 0b0111, 0b1001, 0b000>;
-def : AT<"S1E1WP", 0b01, 0b000, 0b0111, 0b1001, 0b001>;
-
+def : AT<"S1E1R", 0b000, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E2R", 0b100, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E3R", 0b110, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E1W", 0b000, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E2W", 0b100, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E3W", 0b110, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E0R", 0b000, 0b0111, 0b1000, 0b010>;
+def : AT<"S1E0W", 0b000, 0b0111, 0b1000, 0b011>;
+def : AT<"S12E1R", 0b100, 0b0111, 0b1000, 0b100>;
+def : AT<"S12E1W", 0b100, 0b0111, 0b1000, 0b101>;
+def : AT<"S12E0R", 0b100, 0b0111, 0b1000, 0b110>;
+def : AT<"S12E0W", 0b100, 0b0111, 0b1000, 0b111>;
+
+let Requires = [{ {AArch64::HasV8_2aOps} }] in {
+def : AT<"S1E1RP", 0b000, 0b0111, 0b1001, 0b000>;
+def : AT<"S1E1WP", 0b000, 0b0111, 0b1001, 0b001>;
+}
//===----------------------------------------------------------------------===//
// DMB/DSB (data barrier) instruction options.
@@ -77,28 +79,31 @@ def : DB<"sy", 0xf>;
// DC (data cache maintenance) instruction options.
//===----------------------------------------------------------------------===//
-class DC<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+class DC<string name, bits<3> op1, bits<4> crn, bits<4> crm,
bits<3> op2> : SearchableTable {
let SearchableFields = ["Name", "Encoding"];
let EnumValueField = "Encoding";
string Name = name;
- bits<16> Encoding;
- let Encoding{15-14} = op0;
+ bits<14> Encoding;
let Encoding{13-11} = op1;
let Encoding{10-7} = crn;
let Encoding{6-3} = crm;
let Encoding{2-0} = op2;
+ code Requires = [{ {} }];
}
-def : DC<"ZVA", 0b01, 0b011, 0b0111, 0b0100, 0b001>;
-def : DC<"IVAC", 0b01, 0b000, 0b0111, 0b0110, 0b001>;
-def : DC<"ISW", 0b01, 0b000, 0b0111, 0b0110, 0b010>;
-def : DC<"CVAC", 0b01, 0b011, 0b0111, 0b1010, 0b001>;
-def : DC<"CSW", 0b01, 0b000, 0b0111, 0b1010, 0b010>;
-def : DC<"CVAU", 0b01, 0b011, 0b0111, 0b1011, 0b001>;
-def : DC<"CIVAC", 0b01, 0b011, 0b0111, 0b1110, 0b001>;
-def : DC<"CISW", 0b01, 0b000, 0b0111, 0b1110, 0b010>;
+def : DC<"ZVA", 0b011, 0b0111, 0b0100, 0b001>;
+def : DC<"IVAC", 0b000, 0b0111, 0b0110, 0b001>;
+def : DC<"ISW", 0b000, 0b0111, 0b0110, 0b010>;
+def : DC<"CVAC", 0b011, 0b0111, 0b1010, 0b001>;
+def : DC<"CSW", 0b000, 0b0111, 0b1010, 0b010>;
+def : DC<"CVAU", 0b011, 0b0111, 0b1011, 0b001>;
+def : DC<"CIVAC", 0b011, 0b0111, 0b1110, 0b001>;
+def : DC<"CISW", 0b000, 0b0111, 0b1110, 0b010>;
+
+let Requires = [{ {AArch64::HasV8_2aOps} }] in
+def : DC<"CVAP", 0b011, 0b0111, 0b1100, 0b001>;
//===----------------------------------------------------------------------===//
// IC (instruction cache maintenance) instruction options.
@@ -120,7 +125,7 @@ class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2,
def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>;
def : IC<"IALLU", 0b000, 0b0111, 0b0101, 0b000, 0>;
-def : IC<"IVAU", 0b000, 0b0111, 0b0001, 0b000, 1>;
+def : IC<"IVAU", 0b011, 0b0111, 0b0101, 0b001, 1>;
//===----------------------------------------------------------------------===//
// ISB (instruction-fetch barrier) instruction options.
@@ -213,14 +218,13 @@ def : PSB<"csync", 0x11>;
// TLBI (translation lookaside buffer invalidate) instruction options.
//===----------------------------------------------------------------------===//
-class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+class TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm,
bits<3> op2, bit needsreg = 1> : SearchableTable {
let SearchableFields = ["Name", "Encoding"];
let EnumValueField = "Encoding";
string Name = name;
- bits<16> Encoding;
- let Encoding{15-14} = op0;
+ bits<14> Encoding;
let Encoding{13-11} = op1;
let Encoding{10-7} = crn;
let Encoding{6-3} = crm;
@@ -228,38 +232,38 @@ class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
bit NeedsReg = needsreg;
}
-def : TLBI<"IPAS2E1IS", 0b01, 0b100, 0b1000, 0b0000, 0b001>;
-def : TLBI<"IPAS2LE1IS", 0b01, 0b100, 0b1000, 0b0000, 0b101>;
-def : TLBI<"VMALLE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"ALLE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"ALLE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"VAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b001>;
-def : TLBI<"VAE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b001>;
-def : TLBI<"VAE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b001>;
-def : TLBI<"ASIDE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b010>;
-def : TLBI<"VAAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b011>;
-def : TLBI<"ALLE1IS", 0b01, 0b100, 0b1000, 0b0011, 0b100, 0>;
-def : TLBI<"VALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VALE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VALE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VMALLS12E1IS", 0b01, 0b100, 0b1000, 0b0011, 0b110, 0>;
-def : TLBI<"VAALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b111>;
-def : TLBI<"IPAS2E1", 0b01, 0b100, 0b1000, 0b0100, 0b001>;
-def : TLBI<"IPAS2LE1", 0b01, 0b100, 0b1000, 0b0100, 0b101>;
-def : TLBI<"VMALLE1", 0b01, 0b000, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"ALLE2", 0b01, 0b100, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"ALLE3", 0b01, 0b110, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"VAE1", 0b01, 0b000, 0b1000, 0b0111, 0b001>;
-def : TLBI<"VAE2", 0b01, 0b100, 0b1000, 0b0111, 0b001>;
-def : TLBI<"VAE3", 0b01, 0b110, 0b1000, 0b0111, 0b001>;
-def : TLBI<"ASIDE1", 0b01, 0b000, 0b1000, 0b0111, 0b010>;
-def : TLBI<"VAAE1", 0b01, 0b000, 0b1000, 0b0111, 0b011>;
-def : TLBI<"ALLE1", 0b01, 0b100, 0b1000, 0b0111, 0b100, 0>;
-def : TLBI<"VALE1", 0b01, 0b000, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VALE2", 0b01, 0b100, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VALE3", 0b01, 0b110, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VMALLS12E1", 0b01, 0b100, 0b1000, 0b0111, 0b110, 0>;
-def : TLBI<"VAALE1", 0b01, 0b000, 0b1000, 0b0111, 0b111>;
+def : TLBI<"IPAS2E1IS", 0b100, 0b1000, 0b0000, 0b001>;
+def : TLBI<"IPAS2LE1IS", 0b100, 0b1000, 0b0000, 0b101>;
+def : TLBI<"VMALLE1IS", 0b000, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE2IS", 0b100, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE3IS", 0b110, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"VAE1IS", 0b000, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE2IS", 0b100, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE3IS", 0b110, 0b1000, 0b0011, 0b001>;
+def : TLBI<"ASIDE1IS", 0b000, 0b1000, 0b0011, 0b010>;
+def : TLBI<"VAAE1IS", 0b000, 0b1000, 0b0011, 0b011>;
+def : TLBI<"ALLE1IS", 0b100, 0b1000, 0b0011, 0b100, 0>;
+def : TLBI<"VALE1IS", 0b000, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE2IS", 0b100, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE3IS", 0b110, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VMALLS12E1IS", 0b100, 0b1000, 0b0011, 0b110, 0>;
+def : TLBI<"VAALE1IS", 0b000, 0b1000, 0b0011, 0b111>;
+def : TLBI<"IPAS2E1", 0b100, 0b1000, 0b0100, 0b001>;
+def : TLBI<"IPAS2LE1", 0b100, 0b1000, 0b0100, 0b101>;
+def : TLBI<"VMALLE1", 0b000, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE2", 0b100, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE3", 0b110, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"VAE1", 0b000, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE2", 0b100, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE3", 0b110, 0b1000, 0b0111, 0b001>;
+def : TLBI<"ASIDE1", 0b000, 0b1000, 0b0111, 0b010>;
+def : TLBI<"VAAE1", 0b000, 0b1000, 0b0111, 0b011>;
+def : TLBI<"ALLE1", 0b100, 0b1000, 0b0111, 0b100, 0>;
+def : TLBI<"VALE1", 0b000, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE2", 0b100, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE3", 0b110, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VMALLS12E1", 0b100, 0b1000, 0b0111, 0b110, 0>;
+def : TLBI<"VAALE1", 0b000, 0b1000, 0b0111, 0b111>;
//===----------------------------------------------------------------------===//
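With the constant op0 field dropped, the AT/DC/TLBI operands above pack op1 (3 bits), CRn (4), CRm (4), and op2 (3) into a 14-bit encoding, exactly as the Encoding{...} bit slices describe. A standalone C++ sketch of that packing, checked against two of the entries above (binary literals with digit separators need C++14 or later):

```cpp
// Standalone sketch of the narrowed system-operand encoding above.
#include <cassert>
#include <cstdint>

uint16_t encodeSysOp(unsigned Op1, unsigned CRn, unsigned CRm, unsigned Op2) {
  // Field widths: op1 = 3 bits, CRn = 4, CRm = 4, op2 = 3 -> 14 bits total.
  return static_cast<uint16_t>((Op1 & 0x7) << 11 | (CRn & 0xF) << 7 |
                               (CRm & 0xF) << 3 | (Op2 & 0x7));
}

int main() {
  // AT S1E1R above: op1=0b000, CRn=0b0111, CRm=0b1000, op2=0b000.
  assert(encodeSysOp(0b000, 0b0111, 0b1000, 0b000) == 0b000'0111'1000'000);
  // TLBI VAE3 above: op1=0b110, CRn=0b1000, CRm=0b0111, op2=0b001.
  assert(encodeSysOp(0b110, 0b1000, 0b0111, 0b001) == 0b110'1000'0111'001);
}
```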
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index d288394..ba28c01 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -10,23 +10,20 @@
//
//===----------------------------------------------------------------------===//
+#include "AArch64TargetMachine.h"
#include "AArch64.h"
-#include "AArch64CallLowering.h"
-#include "AArch64InstructionSelector.h"
-#include "AArch64LegalizerInfo.h"
-#include "AArch64RegisterBankInfo.h"
+#include "AArch64MacroFusion.h"
#include "AArch64Subtarget.h"
-#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
@@ -50,6 +47,11 @@ static cl::opt<bool> EnableCCMP("aarch64-enable-ccmp",
cl::desc("Enable the CCMP formation pass"),
cl::init(true), cl::Hidden);
+static cl::opt<bool>
+ EnableCondBrTuning("aarch64-enable-cond-br-tune",
+ cl::desc("Enable the conditional branch tuning pass"),
+ cl::init(true), cl::Hidden);
+
static cl::opt<bool> EnableMCR("aarch64-enable-mcr",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);
@@ -113,11 +115,6 @@ EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden,
cl::init(false));
static cl::opt<bool>
- EnableAddressTypePromotion("aarch64-enable-type-promotion", cl::Hidden,
- cl::desc("Enable the type promotion pass"),
- cl::init(true));
-
-static cl::opt<bool>
EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden,
cl::desc("Enable optimizations on complex GEPs"),
cl::init(false));
@@ -136,6 +133,14 @@ static cl::opt<bool>
cl::desc("Enable the loop data prefetch pass"),
cl::init(true));
+static cl::opt<int> EnableGlobalISelAtO(
+ "aarch64-enable-global-isel-at-O", cl::Hidden,
+ cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
+ cl::init(-1));
+
+static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
+ cl::init(true), cl::Hidden);
+
extern "C" void LLVMInitializeAArch64Target() {
// Register the target.
RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@@ -145,7 +150,6 @@ extern "C" void LLVMInitializeAArch64Target() {
initializeGlobalISel(*PR);
initializeAArch64A53Fix835769Pass(*PR);
initializeAArch64A57FPLoadBalancingPass(*PR);
- initializeAArch64AddressTypePromotionPass(*PR);
initializeAArch64AdvSIMDScalarPass(*PR);
initializeAArch64CollectLOHPass(*PR);
initializeAArch64ConditionalComparesPass(*PR);
@@ -157,6 +161,8 @@ extern "C" void LLVMInitializeAArch64Target() {
initializeAArch64PromoteConstantPass(*PR);
initializeAArch64RedundantCopyEliminationPass(*PR);
initializeAArch64StorePairSuppressPass(*PR);
+ initializeFalkorHWPFFixPass(*PR);
+ initializeFalkorMarkStridedAccessesLegacyPass(*PR);
initializeLDTLSCleanupPass(*PR);
}
@@ -166,6 +172,8 @@ extern "C" void LLVMInitializeAArch64Target() {
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
if (TT.isOSBinFormatMachO())
return llvm::make_unique<AArch64_MachoTargetObjectFile>();
+ if (TT.isOSBinFormatCOFF())
+ return llvm::make_unique<AArch64_COFFTargetObjectFile>();
return llvm::make_unique<AArch64_ELFTargetObjectFile>();
}
@@ -178,6 +186,8 @@ static std::string computeDataLayout(const Triple &TT,
return "e-m:e-p:32:32-i8:8-i16:16-i64:64-S128";
if (TT.isOSBinFormatMachO())
return "e-m:o-i64:64-i128:128-n32:64-S128";
+ if (TT.isOSBinFormatCOFF())
+ return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128";
if (LittleEndian)
return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
@@ -215,35 +225,6 @@ AArch64TargetMachine::AArch64TargetMachine(
AArch64TargetMachine::~AArch64TargetMachine() = default;
-#ifdef LLVM_BUILD_GLOBAL_ISEL
-namespace {
-
-struct AArch64GISelActualAccessor : public GISelAccessor {
- std::unique_ptr<CallLowering> CallLoweringInfo;
- std::unique_ptr<InstructionSelector> InstSelector;
- std::unique_ptr<LegalizerInfo> Legalizer;
- std::unique_ptr<RegisterBankInfo> RegBankInfo;
-
- const CallLowering *getCallLowering() const override {
- return CallLoweringInfo.get();
- }
-
- const InstructionSelector *getInstructionSelector() const override {
- return InstSelector.get();
- }
-
- const LegalizerInfo *getLegalizerInfo() const override {
- return Legalizer.get();
- }
-
- const RegisterBankInfo *getRegBankInfo() const override {
- return RegBankInfo.get();
- }
-};
-
-} // end anonymous namespace
-#endif
-
const AArch64Subtarget *
AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
Attribute CPUAttr = F.getFnAttribute("target-cpu");
@@ -264,25 +245,6 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
resetTargetOptions(F);
I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this,
isLittle);
-#ifndef LLVM_BUILD_GLOBAL_ISEL
- GISelAccessor *GISel = new GISelAccessor();
-#else
- AArch64GISelActualAccessor *GISel =
- new AArch64GISelActualAccessor();
- GISel->CallLoweringInfo.reset(
- new AArch64CallLowering(*I->getTargetLowering()));
- GISel->Legalizer.reset(new AArch64LegalizerInfo());
-
- auto *RBI = new AArch64RegisterBankInfo(*I->getRegisterInfo());
-
- // FIXME: At this point, we can't rely on Subtarget having RBI.
- // It's awkward to mix passing RBI and the Subtarget; should we pass
- // TII/TRI as well?
- GISel->InstSelector.reset(new AArch64InstructionSelector(*this, *I, *RBI));
-
- GISel->RegBankInfo.reset(RBI);
-#endif
- I->setGISelAccessor(*GISel);
}
return I.get();
}
@@ -308,9 +270,9 @@ namespace {
/// AArch64 Code Generator Pass Configuration Options.
class AArch64PassConfig : public TargetPassConfig {
public:
- AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM)
+ AArch64PassConfig(AArch64TargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {
- if (TM->getOptLevel() != CodeGenOpt::None)
+ if (TM.getOptLevel() != CodeGenOpt::None)
substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
}
@@ -320,13 +282,29 @@ public:
ScheduleDAGInstrs *
createMachineScheduler(MachineSchedContext *C) const override {
+ const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>();
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+ if (ST.hasFusion())
+ DAG->addMutation(createAArch64MacroFusionDAGMutation());
return DAG;
}
+ ScheduleDAGInstrs *
+ createPostMachineScheduler(MachineSchedContext *C) const override {
+ const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>();
+ if (ST.hasFusion()) {
+ // Run the Macro Fusion after RA again since literals are expanded from
+ // pseudos then (see addPreSched2()).
+ ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+ DAG->addMutation(createAArch64MacroFusionDAGMutation());
+ return DAG;
+ }
+
+ return nullptr;
+ }
+
void addIRPasses() override;
bool addPreISel() override;
bool addInstSelector() override;
@@ -334,6 +312,7 @@ public:
bool addIRTranslator() override;
bool addLegalizeMachineIR() override;
bool addRegBankSelect() override;
+ void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
#endif
bool addILPOpts() override;
@@ -341,6 +320,8 @@ public:
void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
+
+ bool isGlobalISelEnabled() const override;
};
} // end anonymous namespace
@@ -352,13 +333,13 @@ TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() {
}
TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
- return new AArch64PassConfig(this, PM);
+ return new AArch64PassConfig(*this, PM);
}
void AArch64PassConfig::addIRPasses() {
// Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
// ourselves.
- addPass(createAtomicExpandPass(TM));
+ addPass(createAtomicExpandPass());
// Cmpxchg instructions are often used with a subsequent comparison to
// determine whether it succeeded. We can exploit existing control-flow in
@@ -370,14 +351,18 @@ void AArch64PassConfig::addIRPasses() {
//
// Run this before LSR to remove the multiplies involved in computing the
// pointer values N iterations ahead.
- if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch)
- addPass(createLoopDataPrefetchPass());
+ if (TM->getOptLevel() != CodeGenOpt::None) {
+ if (EnableLoopDataPrefetch)
+ addPass(createLoopDataPrefetchPass());
+ if (EnableFalkorHWPFFix)
+ addPass(createFalkorMarkStridedAccessesPass());
+ }
TargetPassConfig::addIRPasses();
// Match interleaved memory accesses to ldN/stN intrinsics.
if (TM->getOptLevel() != CodeGenOpt::None)
- addPass(createInterleavedAccessPass(TM));
+ addPass(createInterleavedAccessPass());
if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
// Call SeparateConstOffsetFromGEP pass to extract constants within indices
@@ -410,9 +395,6 @@ bool AArch64PassConfig::addPreISel() {
addPass(createGlobalMergePass(TM, 4095, OnlyOptimizeForSize));
}
- if (TM->getOptLevel() != CodeGenOpt::None && EnableAddressTypePromotion)
- addPass(createAArch64AddressTypePromotionPass());
-
return false;
}
@@ -444,12 +426,22 @@ bool AArch64PassConfig::addRegBankSelect() {
return false;
}
+void AArch64PassConfig::addPreGlobalInstructionSelect() {
+ // Work around the deficiency of the fast register allocator.
+ if (TM->getOptLevel() == CodeGenOpt::None)
+ addPass(new Localizer());
+}
+
bool AArch64PassConfig::addGlobalInstructionSelect() {
addPass(new InstructionSelect());
return false;
}
#endif
+bool AArch64PassConfig::isGlobalISelEnabled() const {
+ return TM->getOptLevel() <= EnableGlobalISelAtO;
+}
+
bool AArch64PassConfig::addILPOpts() {
if (EnableCondOpt)
addPass(createAArch64ConditionOptimizerPass());
@@ -457,6 +449,8 @@ bool AArch64PassConfig::addILPOpts() {
addPass(createAArch64ConditionalCompares());
if (EnableMCR)
addPass(&MachineCombinerID);
+ if (EnableCondBrTuning)
+ addPass(createAArch64CondBrTuning());
if (EnableEarlyIfConversion)
addPass(&EarlyIfConverterID);
if (EnableStPairSuppress)
@@ -493,8 +487,12 @@ void AArch64PassConfig::addPreSched2() {
// Expand some pseudo instructions to allow proper scheduling.
addPass(createAArch64ExpandPseudoPass());
// Use load/store pair instructions when possible.
- if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt)
- addPass(createAArch64LoadStoreOptimizationPass());
+ if (TM->getOptLevel() != CodeGenOpt::None) {
+ if (EnableLoadStoreOpt)
+ addPass(createAArch64LoadStoreOptimizationPass());
+ if (EnableFalkorHWPFFix)
+ addPass(createFalkorHWPFFixPass());
+ }
}
void AArch64PassConfig::addPreEmitPass() {
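The new isGlobalISelEnabled hook above compares the optimization level against the -aarch64-enable-global-isel-at-O threshold, whose default of -1 keeps GlobalISel disabled. A standalone sketch of that gate using plain ints in place of CodeGenOpt::Level and cl::opt:

```cpp
// Standalone sketch of the GlobalISel opt-level gate added above.
#include <cassert>

bool isGlobalISelEnabled(int OptLevel, int EnableGlobalISelAtO) {
  // GlobalISel runs only at or below the requested threshold; the default
  // threshold of -1 disables it at every optimization level.
  return OptLevel <= EnableGlobalISelAtO;
}

int main() {
  assert(!isGlobalISelEnabled(/*OptLevel=*/0, /*EnableGlobalISelAtO=*/-1));
  assert(isGlobalISelEnabled(/*OptLevel=*/0, /*EnableGlobalISelAtO=*/0));
  assert(!isGlobalISelEnabled(/*OptLevel=*/2, /*EnableGlobalISelAtO=*/0));
}
```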
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
index 6fa5e83..85de02e 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -21,6 +21,8 @@
namespace llvm {
+class AArch64RegisterBankInfo;
+
class AArch64TargetMachine : public LLVMTargetMachine {
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
@@ -34,6 +36,9 @@ public:
~AArch64TargetMachine() override;
const AArch64Subtarget *getSubtargetImpl(const Function &F) const override;
+ // The no argument getSubtargetImpl, while it exists on some targets, is
+ // deprecated and should not be used.
+ const AArch64Subtarget *getSubtargetImpl() const = delete;
// Pass Pipeline Configuration
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
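The deleted no-argument getSubtargetImpl above uses "= delete" so that any remaining caller becomes a compile error while the per-function overload stays available. A minimal standalone sketch of the idiom; the types here are illustrative, not LLVM's:

```cpp
// Minimal sketch of deleting a legacy overload, as done for
// getSubtargetImpl() above.
struct Subtarget {};
struct Function {};

class TargetMachineSketch {
  Subtarget ST;

public:
  // Per-function lookup is the supported interface.
  const Subtarget *getSubtargetImpl(const Function &) const { return &ST; }
  // The legacy no-argument form no longer compiles if anyone calls it.
  const Subtarget *getSubtargetImpl() const = delete;
};

int main() {
  TargetMachineSketch TM;
  Function F;
  (void)TM.getSubtargetImpl(F); // OK
  // (void)TM.getSubtargetImpl(); // error: use of deleted function
}
```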
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index 8875f9b..4bc2c06 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -9,12 +9,12 @@
#include "AArch64TargetObjectFile.h"
#include "AArch64TargetMachine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Dwarf.h"
using namespace llvm;
using namespace dwarf;
@@ -70,3 +70,11 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
return MCBinaryExpr::createSub(Res, PC, getContext());
}
+
+void AArch64_MachoTargetObjectFile::getNameWithPrefix(
+ SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+ const TargetMachine &TM) const {
+ // AArch64 does not use section-relative relocations so any global symbol must
+ // be accessed via at least a linker-private symbol.
+ getMangler().getNameWithPrefix(OutName, GV, /* CannotUsePrivateLabel */ true);
+}
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
index 05e1dfa..9077eb7 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -40,8 +40,14 @@ public:
const MCValue &MV, int64_t Offset,
MachineModuleInfo *MMI,
MCStreamer &Streamer) const override;
+
+ void getNameWithPrefix(SmallVectorImpl<char> &OutName, const GlobalValue *GV,
+ const TargetMachine &TM) const override;
};
+/// This implementation is used for AArch64 COFF targets.
+class AArch64_COFFTargetObjectFile : public TargetLoweringObjectFileCOFF {};
+
} // end namespace llvm
#endif
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b8833e5..a76f080 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -9,8 +9,8 @@
#include "AArch64TargetTransformInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
@@ -20,6 +20,23 @@ using namespace llvm;
#define DEBUG_TYPE "aarch64tti"
+static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
+ cl::init(true), cl::Hidden);
+
+bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+ // Inline a callee if its target-features are a subset of the caller's
+ // target-features.
+ return (CallerBits & CalleeBits) == CalleeBits;
+}
+
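
The subset test above is plain bitwise containment on the two feature sets. A minimal standalone sketch, using std::bitset as a stand-in for LLVM's FeatureBitset (the feature names below are illustrative, not real subtarget features):

#include <bitset>
#include <cassert>

int main() {
  enum { FeatNEON, FeatCRC, FeatLSE, NumFeats };      // illustrative features
  std::bitset<NumFeats> Caller, Callee;

  Caller.set(FeatNEON); Caller.set(FeatCRC); Caller.set(FeatLSE);
  Callee.set(FeatNEON); Callee.set(FeatCRC);

  // Inlining is allowed when the callee's features are a subset of the
  // caller's, i.e. (Caller & Callee) == Callee.
  assert((Caller & Callee) == Callee);

  Callee.set(FeatLSE); Caller.reset(FeatLSE);         // callee now needs more
  assert((Caller & Callee) != Callee);                // not inline-compatible
  return 0;
}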
/// \brief Calculate the cost of materializing a 64-bit value. This helper
/// method might only calculate a fraction of a larger immediate. Therefore it
/// is valid to return a cost of ZERO.
@@ -176,10 +193,95 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
-int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
+ ArrayRef<const Value *> Args) {
+
+ // A helper that returns a vector type from the given type. The number of
+ // elements in DstTy determines the vector width.
+ auto toVectorTy = [&](Type *ArgTy) {
+ return VectorType::get(ArgTy->getScalarType(),
+ DstTy->getVectorNumElements());
+ };
+
+ // Exit early if DstTy is not a vector type whose elements are at least
+ // 16 bits wide.
+ if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16)
+ return false;
+
+ // Determine if the operation has a widening variant. We consider both the
+ // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
+ // instructions.
+ //
+ // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we
+ // verify that their extending operands are eliminated during code
+ // generation.
+ switch (Opcode) {
+ case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
+ case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
+ break;
+ default:
+ return false;
+ }
+
+ // To be a widening instruction (either the "wide" or "long" version), the
+ // second operand must be a sign- or zero-extend with a single user. We only
+ // consider extends with a single user because they may otherwise not be
+ // eliminated.
+ if (Args.size() != 2 ||
+ (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) ||
+ !Args[1]->hasOneUse())
+ return false;
+ auto *Extend = cast<CastInst>(Args[1]);
+
+ // Legalize the destination type and ensure it can be used in a widening
+ // operation.
+ auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy);
+ unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
+ if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
+ return false;
+
+ // Legalize the source type and ensure it can be used in a widening
+ // operation.
+ Type *SrcTy = toVectorTy(Extend->getSrcTy());
+ auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
+ unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
+ if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
+ return false;
+
+ // Get the total number of vector elements in the legalized types.
+ unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements();
+ unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements();
+
+ // Return true if the legalized types have the same number of vector elements
+ // and the destination element type size is twice that of the source type.
+ return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
+}
+
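
The final check above reduces to arithmetic on the legalized types: the total element counts must match and the destination element must be exactly twice as wide as the source element. A standalone sketch with concrete numbers (an add of <4 x i32> fed by a sext from <4 x i16>, which maps to saddl/saddw; the helper name is illustrative, not LLVM API):

#include <cassert>

// Plain-arithmetic stand-in for the tail of isWideningInstruction.
static bool looksWidening(unsigned DstParts, unsigned DstVecEls, unsigned DstElBits,
                          unsigned SrcParts, unsigned SrcVecEls, unsigned SrcElBits) {
  unsigned NumDstEls = DstParts * DstVecEls;
  unsigned NumSrcEls = SrcParts * SrcVecEls;
  return NumDstEls == NumSrcEls && 2 * SrcElBits == DstElBits;
}

int main() {
  // add <4 x i32>, second operand sext from <4 x i16>: one legal register
  // each, 16 -> 32 bit elements, so it qualifies (saddl/saddw).
  assert(looksWidening(1, 4, 32, 1, 4, 16));
  // sext from <4 x i8> is a 4x extension, not a single widening step.
  assert(!looksWidening(1, 4, 32, 1, 4, 8));
  return 0;
}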
+int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ // If the cast is observable, and it is used by a widening instruction (e.g.,
+ // uaddl, saddw, etc.), it may be free.
+ if (I && I->hasOneUse()) {
+ auto *SingleUser = cast<Instruction>(*I->user_begin());
+ SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
+ if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
+ // If the cast is the second operand, it is free. We will generate either
+ // a "wide" or "long" version of the widening instruction.
+ if (I == SingleUser->getOperand(1))
+ return 0;
+ // If the cast is not the second operand, it will be free if it looks the
+ // same as the second operand. In this case, we will generate a "long"
+ // version of the widening instruction.
+ if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
+ if (I->getOpcode() == Cast->getOpcode() &&
+ cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
+ return 0;
+ }
+ }
+
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
@@ -378,6 +480,16 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.),
+ // add in the widening overhead specified by the sub-target. Since the
+ // extends feeding widening instructions are performed automatically, they
+ // aren't present in the generated code and have a zero cost. By adding a
+ // widening overhead here, we attach the total cost of the combined operation
+ // to the widening instruction.
+ int Cost = 0;
+ if (isWideningInstruction(Ty, Opcode, Args))
+ Cost += ST->getWideningBaseCost();
+
int ISD = TLI->InstructionOpcodeToISD(Opcode);
if (ISD == ISD::SDIV &&
@@ -387,9 +499,9 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties may not be the same as those of the previous
// operation; conservatively assume OP_None.
- int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
- TargetTransformInfo::OP_None,
- TargetTransformInfo::OP_None);
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
@@ -404,8 +516,8 @@ int AArch64TTIImpl::getArithmeticInstrCost(
switch (ISD) {
default:
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
- Opd1PropInfo, Opd2PropInfo);
+ return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
case ISD::ADD:
case ISD::MUL:
case ISD::XOR:
@@ -413,7 +525,7 @@ int AArch64TTIImpl::getArithmeticInstrCost(
case ISD::AND:
// These nodes are marked as 'custom' for combining purposes only.
// We know that they are legal. See LowerAdd in ISelLowering.
- return 1 * LT.first;
+ return (Cost + 1) * LT.first;
}
}
@@ -436,7 +548,7 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
}
int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) {
+ Type *CondTy, const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// We don't lower some vector selects well that are wider than the register
@@ -463,11 +575,12 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return Entry->Cost;
}
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
}
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
- unsigned Alignment, unsigned AddressSpace) {
+ unsigned Alignment, unsigned AddressSpace,
+ const Instruction *I) {
auto LT = TLI->getTypeLegalizationCost(DL, Ty);
if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
@@ -505,12 +618,14 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumElts = VecTy->getVectorNumElements();
- Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
- unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+ auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
// ldN/stN only support legal vector types of size 64 or 128 in bits.
- if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
- return Factor;
+ // Accesses whose vector type size is a multiple of 128 bits can be
+ // matched to more than one ldN/stN instruction.
+ if (NumElts % Factor == 0 &&
+ TLI->isLegalInterleavedAccessType(SubVecTy, DL))
+ return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -533,10 +648,62 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
return ST->getMaxInterleaveFactor();
}
-void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
+// For Falkor, we want to avoid having too many strided loads in a loop since
+// that can exhaust the HW prefetcher resources. We adjust the unroller
+// MaxCount preference below to attempt to ensure unrolling doesn't create too
+// many strided loads.
+static void
+getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ enum { MaxStridedLoads = 7 };
+ auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
+ int StridedLoads = 0;
+ // FIXME? We could make this more precise by looking at the CFG and
+ // e.g. not counting loads in each side of an if-then-else diamond.
+ for (const auto BB : L->blocks()) {
+ for (auto &I : *BB) {
+ LoadInst *LMemI = dyn_cast<LoadInst>(&I);
+ if (!LMemI)
+ continue;
+
+ Value *PtrValue = LMemI->getPointerOperand();
+ if (L->isLoopInvariant(PtrValue))
+ continue;
+
+ const SCEV *LSCEV = SE.getSCEV(PtrValue);
+ const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+ if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
+ continue;
+
+ // FIXME? We could take pairing of unrolled load copies into account
+ // by looking at the AddRec, but we would probably have to limit this
+ // to loops with no stores or other memory optimization barriers.
+ ++StridedLoads;
+ // We've seen enough strided loads that seeing more won't make a
+ // difference.
+ if (StridedLoads > MaxStridedLoads / 2)
+ return StridedLoads;
+ }
+ }
+ return StridedLoads;
+ };
+
+ int StridedLoads = countStridedLoads(L, SE);
+ DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
+ << " strided loads\n");
+ // Pick the largest power of 2 unroll count that won't result in too many
+ // strided loads.
+ if (StridedLoads) {
+ UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
+ DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " << UP.MaxCount
+ << '\n');
+ }
+}
+
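
The MaxCount formula above is a power-of-two budget: with MaxStridedLoads = 7, a loop containing three strided loads gets Log2_32(7/3) = 1, i.e. an unroll cap of 2, keeping the unrolled body at or below seven strided loads. A worked standalone check (floorLog2 is a plain stand-in for llvm::Log2_32):

#include <cassert>

static unsigned floorLog2(unsigned V) {        // stand-in for llvm::Log2_32
  unsigned R = 0;
  while (V >>= 1)
    ++R;
  return R;
}

int main() {
  const unsigned MaxStridedLoads = 7;
  // strided loads in the loop -> resulting UP.MaxCount
  assert((1u << floorLog2(MaxStridedLoads / 1)) == 4);  // 1 load  -> unroll by 4
  assert((1u << floorLog2(MaxStridedLoads / 3)) == 2);  // 3 loads -> unroll by 2
  assert((1u << floorLog2(MaxStridedLoads / 7)) == 1);  // 7 loads -> no unrolling
  return 0;
}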
+void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Enable partial unrolling and runtime unrolling.
- BaseT::getUnrollingPreferences(L, UP);
+ BaseT::getUnrollingPreferences(L, SE, UP);
// For inner loop, it is more likely to be a hot one, and the runtime check
// can be promoted out from LICM pass, so the overhead is less, let's try
@@ -546,6 +713,10 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
// Disable partial & runtime unrolling on -Os.
UP.PartialOptSizeThreshold = 0;
+
+ if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
+ EnableFalkorHWPFUnrollFix)
+ getFalkorUnrollingPreferences(L, SE, UP);
}
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
@@ -594,8 +765,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::aarch64_neon_ld4:
Info.ReadMem = true;
Info.WriteMem = false;
- Info.IsSimple = true;
- Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(0);
break;
case Intrinsic::aarch64_neon_st2:
@@ -603,8 +772,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
case Intrinsic::aarch64_neon_st4:
Info.ReadMem = false;
Info.WriteMem = true;
- Info.IsSimple = true;
- Info.NumMemRefs = 1;
Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
break;
}
@@ -628,6 +795,38 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
return true;
}
+/// See if \p I should be considered for address type promotion. We check if \p
+/// I is a sext with the right type that is used in memory accesses. If it is
+/// used in a "complex" getelementptr, we allow it to be promoted without
+/// finding other sext instructions that sign-extended the same initial value.
+/// A getelementptr is considered "complex" if it has more than 2 operands.
+bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
+ const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
+ bool Considerable = false;
+ AllowPromotionWithoutCommonHeader = false;
+ if (!isa<SExtInst>(&I))
+ return false;
+ Type *ConsideredSExtType =
+ Type::getInt64Ty(I.getParent()->getParent()->getContext());
+ if (I.getType() != ConsideredSExtType)
+ return false;
+ // See if the sext is the one with the right type and used in at least one
+ // GetElementPtrInst.
+ for (const User *U : I.users()) {
+ if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
+ Considerable = true;
+ // A getelementptr is considered as "complex" if it has more than 2
+ // operands. We will promote a SExt used in such complex GEP as we
+ // expect some computation to be merged if they are done on 64 bits.
+ if (GEPInst->getNumOperands() > 2) {
+ AllowPromotionWithoutCommonHeader = true;
+ break;
+ }
+ }
+ }
+ return Considerable;
+}
+
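
Only the operand count of the using GEP decides the AllowPromotionWithoutCommonHeader flag. A hedged stand-in predicate, with the two IR shapes shown as comments (the helper is illustrative, not part of the LLVM API):

#include <cassert>

// A GEP with more than two operands is treated as "complex": promoting the
// sext'ed index to 64 bits is expected to let address computation be merged.
static bool isComplexGEP(unsigned NumOperands) { return NumOperands > 2; }

int main() {
  // %p = getelementptr i32, i32* %base, i64 %idx                 ; 2 operands
  assert(!isComplexGEP(2));
  // %q = getelementptr [8 x i32], [8 x i32]* %b, i64 %i, i64 %j  ; 3 operands
  assert(isComplexGEP(3));
  return 0;
}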
unsigned AArch64TTIImpl::getCacheLineSize() {
return ST->getCacheLineSize();
}
@@ -643,3 +842,28 @@ unsigned AArch64TTIImpl::getMinPrefetchStride() {
unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
return ST->getMaxPrefetchIterationsAhead();
}
+
+bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
+ unsigned ScalarBits = Ty->getScalarSizeInBits();
+ switch (Opcode) {
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Mul:
+ return false;
+ case Instruction::Add:
+ return ScalarBits * Ty->getVectorNumElements() >= 128;
+ case Instruction::ICmp:
+ return (ScalarBits < 64) &&
+ (ScalarBits * Ty->getVectorNumElements() >= 128);
+ case Instruction::FCmp:
+ return Flags.NoNaN;
+ default:
+ llvm_unreachable("Unhandled reduction opcode");
+ }
+ return false;
+}
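
For integer adds the rule above boils down to "the vector must fill at least one 128-bit NEON register", and integer compares additionally require elements narrower than 64 bits. A standalone sketch with concrete sizes (the helpers mirror only the Add and ICmp arms):

#include <cassert>

static bool useAddReduction(unsigned ScalarBits, unsigned NumElts) {
  return ScalarBits * NumElts >= 128;
}
static bool useICmpReduction(unsigned ScalarBits, unsigned NumElts) {
  return ScalarBits < 64 && ScalarBits * NumElts >= 128;
}

int main() {
  assert(useAddReduction(32, 4));     // v4i32: 128 bits, use the intrinsic
  assert(!useAddReduction(16, 4));    // v4i16: only 64 bits
  assert(useICmpReduction(32, 4));    // v4i32 compare reduction
  assert(!useICmpReduction(64, 2));   // v2i64: 64-bit elements excluded
  return 0;
}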
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 18287ed..31c0373 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -34,10 +34,6 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
const AArch64Subtarget *ST;
const AArch64TargetLowering *TLI;
- /// Estimate the overhead of scalarizing an instruction. Insert and Extract
- /// are set if the result needs to be inserted and/or extracted from vectors.
- unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
-
const AArch64Subtarget *getST() const { return ST; }
const AArch64TargetLowering *getTLI() const { return TLI; }
@@ -47,11 +43,17 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
VECTOR_LDST_FOUR_ELEMENTS
};
+ bool isWideningInstruction(Type *Ty, unsigned Opcode,
+ ArrayRef<const Value *> Args);
+
public:
explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const;
+
/// \name Scalar TTI Implementations
/// @{
@@ -79,7 +81,7 @@ public:
return 31;
}
- unsigned getRegisterBitWidth(bool Vector) {
+ unsigned getRegisterBitWidth(bool Vector) const {
if (Vector) {
if (ST->hasNEON())
return 128;
@@ -88,9 +90,14 @@ public:
return 64;
}
+ unsigned getMinVectorRegisterBitWidth() {
+ return ST->getMinVectorRegisterBitWidth();
+ }
+
unsigned getMaxInterleaveFactor(unsigned VF);
- int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+ int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ const Instruction *I = nullptr);
int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
unsigned Index);
@@ -107,14 +114,16 @@ public:
int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
- int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+ int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ const Instruction *I = nullptr);
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned AddressSpace, const Instruction *I = nullptr);
int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
- void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
+ void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::UnrollingPreferences &UP);
Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType);
@@ -125,6 +134,10 @@ public:
ArrayRef<unsigned> Indices, unsigned Alignment,
unsigned AddressSpace);
+ bool
+ shouldConsiderAddressTypePromotion(const Instruction &I,
+ bool &AllowPromotionWithoutCommonHeader);
+
unsigned getCacheLineSize();
unsigned getPrefetchDistance();
@@ -132,6 +145,13 @@ public:
unsigned getMinPrefetchStride();
unsigned getMaxPrefetchIterationsAhead();
+
+ bool shouldExpandReduction(const IntrinsicInst *II) const {
+ return false;
+ }
+
+ bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const;
/// @}
};
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp b/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
index e3b1d7c..f53af23 100644
--- a/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
@@ -19,13 +19,27 @@
// is rewritten into
// dup v3.4s, v2.s[1]
// fmla v0.4s, v1.4s, v3.4s
+//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <map>
using namespace llvm;
@@ -41,14 +55,15 @@ namespace {
struct AArch64VectorByElementOpt : public MachineFunctionPass {
static char ID;
- AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
- initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
- }
const TargetInstrInfo *TII;
MachineRegisterInfo *MRI;
TargetSchedModel SchedModel;
+ AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
+ initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
+ }
+
/// Based only on latency of instructions, determine if it is cost efficient
/// to replace the instruction InstDesc by the two instructions InstDescRep1
/// and InstDescRep2.
@@ -90,8 +105,10 @@ struct AArch64VectorByElementOpt : public MachineFunctionPass {
return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
}
};
+
char AArch64VectorByElementOpt::ID = 0;
-} // namespace
+
+} // end anonymous namespace
INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index b86a283..a79d518 100644
--- a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -15,8 +15,8 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
@@ -74,6 +74,7 @@ private:
SMLoc getLoc() const { return getParser().getTok().getLoc(); }
bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
+ void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S);
AArch64CC::CondCode parseCondCodeString(StringRef Cond);
bool parseCondCode(OperandVector &Operands, bool invertCondCode);
unsigned matchRegisterNameAlias(StringRef Name, bool isVector);
@@ -85,7 +86,7 @@ private:
bool parseOperand(OperandVector &Operands, bool isCondCode,
bool invertCondCode);
- bool showMatchError(SMLoc Loc, unsigned ErrCode);
+ bool showMatchError(SMLoc Loc, unsigned ErrCode, OperandVector &Operands);
bool parseDirectiveArch(SMLoc L);
bool parseDirectiveCPU(SMLoc L);
@@ -537,154 +538,15 @@ public:
return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000;
}
- bool isImm0_1() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 2);
- }
-
- bool isImm0_7() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 8);
- }
-
- bool isImm1_8() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val > 0 && Val < 9);
- }
-
- bool isImm0_15() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 16);
- }
-
- bool isImm1_16() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val > 0 && Val < 17);
- }
-
- bool isImm0_31() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 32);
- }
-
- bool isImm1_31() const {
+ template <int N, int M>
+ bool isImmInRange() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
if (!MCE)
return false;
int64_t Val = MCE->getValue();
- return (Val >= 1 && Val < 32);
- }
-
- bool isImm1_32() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 1 && Val < 33);
- }
-
- bool isImm0_63() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 64);
- }
-
- bool isImm1_63() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 1 && Val < 64);
- }
-
- bool isImm1_64() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 1 && Val < 65);
- }
-
- bool isImm0_127() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 128);
- }
-
- bool isImm0_255() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 256);
- }
-
- bool isImm0_65535() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 0 && Val < 65536);
- }
-
- bool isImm32_63() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return false;
- int64_t Val = MCE->getValue();
- return (Val >= 32 && Val < 64);
+ return (Val >= N && Val <= M);
}
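
The template collapses the dozen hand-written checks into one inclusive [N, M] predicate, so a check like the old isImm0_7 can be written as isImmInRange<0, 7> and isImm1_64 as isImmInRange<1, 64> (the operand definitions are expected to supply the bounds). A tiny standalone sketch of the same inclusive-range idiom:

#include <cassert>
#include <cstdint>

template <int N, int M> static bool isImmInRange(int64_t Val) {
  return Val >= N && Val <= M;          // inclusive on both ends
}

int main() {
  assert(isImmInRange<0, 7>(7));        // old isImm0_7: [0, 8) == [0, 7]
  assert(!isImmInRange<0, 7>(8));
  assert(isImmInRange<1, 64>(64));      // old isImm1_64: (0, 65) == [1, 64]
  assert(!isImmInRange<32, 63>(31));    // old isImm32_63
  return 0;
}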
bool isLogicalImm32() const {
@@ -804,19 +666,8 @@ public:
return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue());
}
- bool isBranchTarget26() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return true;
- int64_t Val = MCE->getValue();
- if (Val & 0x3)
- return false;
- return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2));
- }
-
- bool isPCRelLabel19() const {
+ template<int N>
+ bool isBranchTarget() const {
if (!isImm())
return false;
const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
@@ -825,19 +676,8 @@ public:
int64_t Val = MCE->getValue();
if (Val & 0x3)
return false;
- return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2));
- }
-
- bool isBranchTarget14() const {
- if (!isImm())
- return false;
- const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
- if (!MCE)
- return true;
- int64_t Val = MCE->getValue();
- if (Val & 0x3)
- return false;
- return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2));
+ assert(N > 0 && "Branch target immediate cannot be 0 bits!");
+ return (Val >= -((1<<(N-1)) << 2) && Val <= (((1<<(N-1))-1) << 2));
}
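
The templated bound is the usual signed, word-aligned PC-relative range: an N-bit branch immediate accepts offsets in [-(2^(N-1)) << 2, ((2^(N-1)) - 1) << 2], in multiples of 4. For N = 26 that is [-0x8000000, 0x7FFFFFC], matching the old isBranchTarget26. A quick standalone check of the arithmetic:

#include <cassert>
#include <cstdint>

template <int N> static bool isBranchTarget(int64_t Val) {
  if (Val & 0x3)                                   // must be word aligned
    return false;
  return Val >= -((INT64_C(1) << (N - 1)) << 2) &&
         Val <= (((INT64_C(1) << (N - 1)) - 1) << 2);
}

int main() {
  assert(isBranchTarget<26>(-(INT64_C(0x2000000) << 2)));   // old lower bound
  assert(isBranchTarget<26>(INT64_C(0x1ffffff) << 2));      // old upper bound
  assert(!isBranchTarget<26>((INT64_C(0x1ffffff) << 2) + 4));
  assert(!isBranchTarget<14>(2));                           // misaligned
  return 0;
}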
bool
@@ -2260,27 +2100,9 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
bool isNegative = parseOptionalToken(AsmToken::Minus);
const AsmToken &Tok = Parser.getTok();
- if (Tok.is(AsmToken::Real)) {
- APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
- if (isNegative)
- RealVal.changeSign();
-
- uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
- int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
- Parser.Lex(); // Eat the token.
- // Check for out of range values. As an exception, we let Zero through,
- // as we handle that special case in post-processing before matching in
- // order to use the zero register for it.
- if (Val == -1 && !RealVal.isPosZero()) {
- TokError("expected compatible register or floating-point constant");
- return MatchOperand_ParseFail;
- }
- Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
- return MatchOperand_Success;
- }
- if (Tok.is(AsmToken::Integer)) {
+ if (Tok.is(AsmToken::Real) || Tok.is(AsmToken::Integer)) {
int64_t Val;
- if (!isNegative && Tok.getString().startswith("0x")) {
+ if (Tok.is(AsmToken::Integer) && !isNegative && Tok.getString().startswith("0x")) {
Val = Tok.getIntVal();
if (Val > 255 || Val < 0) {
TokError("encoded floating point value out of range");
@@ -2288,10 +2110,24 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
}
} else {
APFloat RealVal(APFloat::IEEEdouble(), Tok.getString());
+ if (isNegative)
+ RealVal.changeSign();
+
uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
- // If we had a '-' in front, toggle the sign bit.
- IntVal ^= (uint64_t)isNegative << 63;
Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
+
+ // Check for out of range values. As an exception we let Zero through,
+ // but as tokens instead of an FPImm so that it can be matched by the
+ // appropriate alias if one exists.
+ if (RealVal.isPosZero()) {
+ Parser.Lex(); // Eat the token.
+ Operands.push_back(AArch64Operand::CreateToken("#0", false, S, getContext()));
+ Operands.push_back(AArch64Operand::CreateToken(".0", false, S, getContext()));
+ return MatchOperand_Success;
+ } else if (Val == -1) {
+ TokError("expected compatible register or floating-point constant");
+ return MatchOperand_ParseFail;
+ }
}
Parser.Lex(); // Eat the token.
Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
@@ -2494,6 +2330,35 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
return MatchOperand_Success;
}
+static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
+ if (FBS[AArch64::HasV8_1aOps])
+ Str += "ARMv8.1a";
+ else if (FBS[AArch64::HasV8_2aOps])
+ Str += "ARMv8.2a";
+ else
+ Str += "(unknown)";
+}
+
+void AArch64AsmParser::createSysAlias(uint16_t Encoding, OperandVector &Operands,
+ SMLoc S) {
+ const uint16_t Op2 = Encoding & 7;
+ const uint16_t Cm = (Encoding & 0x78) >> 3;
+ const uint16_t Cn = (Encoding & 0x780) >> 7;
+ const uint16_t Op1 = (Encoding & 0x3800) >> 11;
+
+ const MCExpr *Expr = MCConstantExpr::create(Op1, getContext());
+
+ Operands.push_back(
+ AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));
+ Operands.push_back(
+ AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));
+ Expr = MCConstantExpr::create(Op2, getContext());
+ Operands.push_back(
+ AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));
+}
+
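
The 14-bit encoding used above packs op1:Cn:Cm:op2 as op2 | (Cm << 3) | (Cn << 7) | (op1 << 11), so IC IVAU (SYS #3, C7, C5, #1, per the removed handling below) round-trips through 0x1BA9. A standalone sketch of the pack/unpack arithmetic (the encoding value is derived here, not read from the TableGen tables):

#include <cassert>
#include <cstdint>

static uint16_t pack(uint16_t Op1, uint16_t Cn, uint16_t Cm, uint16_t Op2) {
  return Op2 | (Cm << 3) | (Cn << 7) | (Op1 << 11);
}

int main() {
  // IC IVAU is the alias for SYS #3, C7, C5, #1.
  uint16_t Encoding = pack(3, 7, 5, 1);
  assert(Encoding == 0x1BA9);

  // createSysAlias-style unpacking recovers the original fields.
  assert((Encoding & 7) == 1);              // Op2
  assert(((Encoding & 0x78) >> 3) == 5);    // Cm
  assert(((Encoding & 0x780) >> 7) == 7);   // Cn
  assert(((Encoding & 0x3800) >> 11) == 3); // Op1
  return 0;
}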
/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
@@ -2510,228 +2375,48 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
StringRef Op = Tok.getString();
SMLoc S = Tok.getLoc();
- const MCExpr *Expr = nullptr;
-
-#define SYS_ALIAS(op1, Cn, Cm, op2) \
- do { \
- Expr = MCConstantExpr::create(op1, getContext()); \
- Operands.push_back( \
- AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \
- Operands.push_back( \
- AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext())); \
- Operands.push_back( \
- AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext())); \
- Expr = MCConstantExpr::create(op2, getContext()); \
- Operands.push_back( \
- AArch64Operand::CreateImm(Expr, S, getLoc(), getContext())); \
- } while (false)
-
if (Mnemonic == "ic") {
- if (!Op.compare_lower("ialluis")) {
- // SYS #0, C7, C1, #0
- SYS_ALIAS(0, 7, 1, 0);
- } else if (!Op.compare_lower("iallu")) {
- // SYS #0, C7, C5, #0
- SYS_ALIAS(0, 7, 5, 0);
- } else if (!Op.compare_lower("ivau")) {
- // SYS #3, C7, C5, #1
- SYS_ALIAS(3, 7, 5, 1);
- } else {
+ const AArch64IC::IC *IC = AArch64IC::lookupICByName(Op);
+ if (!IC)
return TokError("invalid operand for IC instruction");
+ else if (!IC->haveFeatures(getSTI().getFeatureBits())) {
+ std::string Str("IC " + std::string(IC->Name) + " requires ");
+ setRequiredFeatureString(IC->getRequiredFeatures(), Str);
+ return TokError(Str.c_str());
}
+ createSysAlias(IC->Encoding, Operands, S);
} else if (Mnemonic == "dc") {
- if (!Op.compare_lower("zva")) {
- // SYS #3, C7, C4, #1
- SYS_ALIAS(3, 7, 4, 1);
- } else if (!Op.compare_lower("ivac")) {
- // SYS #3, C7, C6, #1
- SYS_ALIAS(0, 7, 6, 1);
- } else if (!Op.compare_lower("isw")) {
- // SYS #0, C7, C6, #2
- SYS_ALIAS(0, 7, 6, 2);
- } else if (!Op.compare_lower("cvac")) {
- // SYS #3, C7, C10, #1
- SYS_ALIAS(3, 7, 10, 1);
- } else if (!Op.compare_lower("csw")) {
- // SYS #0, C7, C10, #2
- SYS_ALIAS(0, 7, 10, 2);
- } else if (!Op.compare_lower("cvau")) {
- // SYS #3, C7, C11, #1
- SYS_ALIAS(3, 7, 11, 1);
- } else if (!Op.compare_lower("civac")) {
- // SYS #3, C7, C14, #1
- SYS_ALIAS(3, 7, 14, 1);
- } else if (!Op.compare_lower("cisw")) {
- // SYS #0, C7, C14, #2
- SYS_ALIAS(0, 7, 14, 2);
- } else if (!Op.compare_lower("cvap")) {
- if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
- // SYS #3, C7, C12, #1
- SYS_ALIAS(3, 7, 12, 1);
- } else {
- return TokError("DC CVAP requires ARMv8.2a");
- }
- } else {
+ const AArch64DC::DC *DC = AArch64DC::lookupDCByName(Op);
+ if (!DC)
return TokError("invalid operand for DC instruction");
+ else if (!DC->haveFeatures(getSTI().getFeatureBits())) {
+ std::string Str("DC " + std::string(DC->Name) + " requires ");
+ setRequiredFeatureString(DC->getRequiredFeatures(), Str);
+ return TokError(Str.c_str());
}
+ createSysAlias(DC->Encoding, Operands, S);
} else if (Mnemonic == "at") {
- if (!Op.compare_lower("s1e1r")) {
- // SYS #0, C7, C8, #0
- SYS_ALIAS(0, 7, 8, 0);
- } else if (!Op.compare_lower("s1e2r")) {
- // SYS #4, C7, C8, #0
- SYS_ALIAS(4, 7, 8, 0);
- } else if (!Op.compare_lower("s1e3r")) {
- // SYS #6, C7, C8, #0
- SYS_ALIAS(6, 7, 8, 0);
- } else if (!Op.compare_lower("s1e1w")) {
- // SYS #0, C7, C8, #1
- SYS_ALIAS(0, 7, 8, 1);
- } else if (!Op.compare_lower("s1e2w")) {
- // SYS #4, C7, C8, #1
- SYS_ALIAS(4, 7, 8, 1);
- } else if (!Op.compare_lower("s1e3w")) {
- // SYS #6, C7, C8, #1
- SYS_ALIAS(6, 7, 8, 1);
- } else if (!Op.compare_lower("s1e0r")) {
- // SYS #0, C7, C8, #3
- SYS_ALIAS(0, 7, 8, 2);
- } else if (!Op.compare_lower("s1e0w")) {
- // SYS #0, C7, C8, #3
- SYS_ALIAS(0, 7, 8, 3);
- } else if (!Op.compare_lower("s12e1r")) {
- // SYS #4, C7, C8, #4
- SYS_ALIAS(4, 7, 8, 4);
- } else if (!Op.compare_lower("s12e1w")) {
- // SYS #4, C7, C8, #5
- SYS_ALIAS(4, 7, 8, 5);
- } else if (!Op.compare_lower("s12e0r")) {
- // SYS #4, C7, C8, #6
- SYS_ALIAS(4, 7, 8, 6);
- } else if (!Op.compare_lower("s12e0w")) {
- // SYS #4, C7, C8, #7
- SYS_ALIAS(4, 7, 8, 7);
- } else if (!Op.compare_lower("s1e1rp")) {
- if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
- // SYS #0, C7, C9, #0
- SYS_ALIAS(0, 7, 9, 0);
- } else {
- return TokError("AT S1E1RP requires ARMv8.2a");
- }
- } else if (!Op.compare_lower("s1e1wp")) {
- if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
- // SYS #0, C7, C9, #1
- SYS_ALIAS(0, 7, 9, 1);
- } else {
- return TokError("AT S1E1WP requires ARMv8.2a");
- }
- } else {
+ const AArch64AT::AT *AT = AArch64AT::lookupATByName(Op);
+ if (!AT)
return TokError("invalid operand for AT instruction");
+ else if (!AT->haveFeatures(getSTI().getFeatureBits())) {
+ std::string Str("AT " + std::string(AT->Name) + " requires ");
+ setRequiredFeatureString(AT->getRequiredFeatures(), Str);
+ return TokError(Str.c_str());
}
+ createSysAlias(AT->Encoding, Operands, S);
} else if (Mnemonic == "tlbi") {
- if (!Op.compare_lower("vmalle1is")) {
- // SYS #0, C8, C3, #0
- SYS_ALIAS(0, 8, 3, 0);
- } else if (!Op.compare_lower("alle2is")) {
- // SYS #4, C8, C3, #0
- SYS_ALIAS(4, 8, 3, 0);
- } else if (!Op.compare_lower("alle3is")) {
- // SYS #6, C8, C3, #0
- SYS_ALIAS(6, 8, 3, 0);
- } else if (!Op.compare_lower("vae1is")) {
- // SYS #0, C8, C3, #1
- SYS_ALIAS(0, 8, 3, 1);
- } else if (!Op.compare_lower("vae2is")) {
- // SYS #4, C8, C3, #1
- SYS_ALIAS(4, 8, 3, 1);
- } else if (!Op.compare_lower("vae3is")) {
- // SYS #6, C8, C3, #1
- SYS_ALIAS(6, 8, 3, 1);
- } else if (!Op.compare_lower("aside1is")) {
- // SYS #0, C8, C3, #2
- SYS_ALIAS(0, 8, 3, 2);
- } else if (!Op.compare_lower("vaae1is")) {
- // SYS #0, C8, C3, #3
- SYS_ALIAS(0, 8, 3, 3);
- } else if (!Op.compare_lower("alle1is")) {
- // SYS #4, C8, C3, #4
- SYS_ALIAS(4, 8, 3, 4);
- } else if (!Op.compare_lower("vale1is")) {
- // SYS #0, C8, C3, #5
- SYS_ALIAS(0, 8, 3, 5);
- } else if (!Op.compare_lower("vaale1is")) {
- // SYS #0, C8, C3, #7
- SYS_ALIAS(0, 8, 3, 7);
- } else if (!Op.compare_lower("vmalle1")) {
- // SYS #0, C8, C7, #0
- SYS_ALIAS(0, 8, 7, 0);
- } else if (!Op.compare_lower("alle2")) {
- // SYS #4, C8, C7, #0
- SYS_ALIAS(4, 8, 7, 0);
- } else if (!Op.compare_lower("vale2is")) {
- // SYS #4, C8, C3, #5
- SYS_ALIAS(4, 8, 3, 5);
- } else if (!Op.compare_lower("vale3is")) {
- // SYS #6, C8, C3, #5
- SYS_ALIAS(6, 8, 3, 5);
- } else if (!Op.compare_lower("alle3")) {
- // SYS #6, C8, C7, #0
- SYS_ALIAS(6, 8, 7, 0);
- } else if (!Op.compare_lower("vae1")) {
- // SYS #0, C8, C7, #1
- SYS_ALIAS(0, 8, 7, 1);
- } else if (!Op.compare_lower("vae2")) {
- // SYS #4, C8, C7, #1
- SYS_ALIAS(4, 8, 7, 1);
- } else if (!Op.compare_lower("vae3")) {
- // SYS #6, C8, C7, #1
- SYS_ALIAS(6, 8, 7, 1);
- } else if (!Op.compare_lower("aside1")) {
- // SYS #0, C8, C7, #2
- SYS_ALIAS(0, 8, 7, 2);
- } else if (!Op.compare_lower("vaae1")) {
- // SYS #0, C8, C7, #3
- SYS_ALIAS(0, 8, 7, 3);
- } else if (!Op.compare_lower("alle1")) {
- // SYS #4, C8, C7, #4
- SYS_ALIAS(4, 8, 7, 4);
- } else if (!Op.compare_lower("vale1")) {
- // SYS #0, C8, C7, #5
- SYS_ALIAS(0, 8, 7, 5);
- } else if (!Op.compare_lower("vale2")) {
- // SYS #4, C8, C7, #5
- SYS_ALIAS(4, 8, 7, 5);
- } else if (!Op.compare_lower("vale3")) {
- // SYS #6, C8, C7, #5
- SYS_ALIAS(6, 8, 7, 5);
- } else if (!Op.compare_lower("vaale1")) {
- // SYS #0, C8, C7, #7
- SYS_ALIAS(0, 8, 7, 7);
- } else if (!Op.compare_lower("ipas2e1")) {
- // SYS #4, C8, C4, #1
- SYS_ALIAS(4, 8, 4, 1);
- } else if (!Op.compare_lower("ipas2le1")) {
- // SYS #4, C8, C4, #5
- SYS_ALIAS(4, 8, 4, 5);
- } else if (!Op.compare_lower("ipas2e1is")) {
- // SYS #4, C8, C4, #1
- SYS_ALIAS(4, 8, 0, 1);
- } else if (!Op.compare_lower("ipas2le1is")) {
- // SYS #4, C8, C4, #5
- SYS_ALIAS(4, 8, 0, 5);
- } else if (!Op.compare_lower("vmalls12e1")) {
- // SYS #4, C8, C7, #6
- SYS_ALIAS(4, 8, 7, 6);
- } else if (!Op.compare_lower("vmalls12e1is")) {
- // SYS #4, C8, C3, #6
- SYS_ALIAS(4, 8, 3, 6);
- } else {
+ const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByName(Op);
+ if (!TLBI)
return TokError("invalid operand for TLBI instruction");
+ else if (!TLBI->haveFeatures(getSTI().getFeatureBits())) {
+ std::string Str("TLBI " + std::string(TLBI->Name) + " requires ");
+ setRequiredFeatureString(TLBI->getRequiredFeatures(), Str);
+ return TokError(Str.c_str());
}
+ createSysAlias(TLBI->Encoding, Operands, S);
}
-#undef SYS_ALIAS
-
Parser.Lex(); // Eat operand.
bool ExpectRegister = (Op.lower().find("all") == StringRef::npos);
@@ -2744,12 +2429,10 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
HasRegister = true;
}
- if (ExpectRegister && !HasRegister) {
+ if (ExpectRegister && !HasRegister)
return TokError("specified " + Mnemonic + " op requires a register");
- }
- else if (!ExpectRegister && HasRegister) {
+ else if (!ExpectRegister && HasRegister)
return TokError("specified " + Mnemonic + " op does not use a register");
- }
if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
return true;
@@ -2790,16 +2473,14 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- auto DB = AArch64DB::lookupDBByName(Tok.getString());
- if (!DB) {
- TokError("invalid barrier option name");
- return MatchOperand_ParseFail;
- }
-
// The only valid named option for ISB is 'sy'
- if (Mnemonic == "isb" && DB->Encoding != AArch64DB::sy) {
+ auto DB = AArch64DB::lookupDBByName(Tok.getString());
+ if (Mnemonic == "isb" && (!DB || DB->Encoding != AArch64DB::sy)) {
TokError("'sy' or #imm operand expected");
return MatchOperand_ParseFail;
+ } else if (!DB) {
+ TokError("invalid barrier option name");
+ return MatchOperand_ParseFail;
}
Operands.push_back(AArch64Operand::CreateBarrier(
@@ -2884,7 +2565,6 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
/// parseRegister - Parse a non-vector register operand.
bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
- MCAsmParser &Parser = getParser();
SMLoc S = getLoc();
// Try for a vector register.
if (!tryParseVectorRegister(Operands))
@@ -2897,30 +2577,6 @@ bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
Operands.push_back(
AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
- // A small number of instructions (FMOVXDhighr, for example) have "[1]"
- // as a string token in the instruction itself.
- SMLoc LBracS = getLoc();
- const AsmToken &Tok = Parser.getTok();
- if (parseOptionalToken(AsmToken::LBrac)) {
- if (Tok.is(AsmToken::Integer)) {
- SMLoc IntS = getLoc();
- int64_t Val = Tok.getIntVal();
- if (Val == 1) {
- Parser.Lex();
- SMLoc RBracS = getLoc();
- if (parseOptionalToken(AsmToken::RBrac)) {
- Operands.push_back(
- AArch64Operand::CreateToken("[", false, LBracS, getContext()));
- Operands.push_back(
- AArch64Operand::CreateToken("1", false, IntS, getContext()));
- Operands.push_back(
- AArch64Operand::CreateToken("]", false, RBracS, getContext()));
- return false;
- }
- }
- }
- }
-
return false;
}
@@ -3601,7 +3257,10 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst,
}
}
-bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
+std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS);
+
+bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
+ OperandVector &Operands) {
switch (ErrCode) {
case Match_MissingFeature:
return Error(Loc,
@@ -3696,6 +3355,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
return Error(Loc, "immediate must be an integer in range [0, 63].");
case Match_InvalidImm0_127:
return Error(Loc, "immediate must be an integer in range [0, 127].");
+ case Match_InvalidImm0_255:
+ return Error(Loc, "immediate must be an integer in range [0, 255].");
case Match_InvalidImm0_65535:
return Error(Loc, "immediate must be an integer in range [0, 65535].");
case Match_InvalidImm1_8:
@@ -3722,8 +3383,12 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
return Error(Loc, "expected readable system register");
case Match_MSR:
return Error(Loc, "expected writable system register or pstate");
- case Match_MnemonicFail:
- return Error(Loc, "unrecognized instruction mnemonic");
+ case Match_MnemonicFail: {
+ std::string Suggestion = AArch64MnemonicSpellCheck(
+ ((AArch64Operand &)*Operands[0]).getToken(),
+ ComputeAvailableFeatures(STI->getFeatureBits()));
+ return Error(Loc, "unrecognized instruction mnemonic" + Suggestion);
+ }
default:
llvm_unreachable("unexpected error code!");
}
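
AArch64MnemonicSpellCheck is only declared here; the definition is emitted by TableGen's assembly matcher. Conceptually it picks the known mnemonic closest in edit distance to the bad token and returns a ", did you mean ...?" hint to append to the error. A rough, hedged stand-in (Levenshtein distance over a hard-coded mnemonic list; none of this is the generated code):

#include <algorithm>
#include <string>
#include <vector>

// Classic single-row Levenshtein distance.
static size_t editDistance(const std::string &A, const std::string &B) {
  std::vector<size_t> Row(B.size() + 1);
  for (size_t j = 0; j <= B.size(); ++j)
    Row[j] = j;
  for (size_t i = 1; i <= A.size(); ++i) {
    size_t Prev = Row[0];
    Row[0] = i;
    for (size_t j = 1; j <= B.size(); ++j) {
      size_t Cur = Row[j];
      Row[j] = std::min({Row[j] + 1, Row[j - 1] + 1,
                         Prev + (A[i - 1] != B[j - 1])});
      Prev = Cur;
    }
  }
  return Row[B.size()];
}

static std::string suggest(const std::string &Tok) {
  static const char *Mnemonics[] = {"add", "adds", "and", "ldr", "str", "subs"};
  std::string Best;
  size_t BestD = 3;                  // only suggest reasonably close matches
  for (const char *M : Mnemonics) {
    size_t D = editDistance(Tok, M);
    if (D < BestD) { BestD = D; Best = M; }
  }
  return Best.empty() ? "" : ", did you mean: " + Best + "?";
}

int main() { return suggest("sbus") == ", did you mean: subs?" ? 0 : 1; }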
@@ -3991,21 +3656,6 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
}
- // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR.
- if (NumOperands == 3 && Tok == "fmov") {
- AArch64Operand &RegOp = static_cast<AArch64Operand &>(*Operands[1]);
- AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]);
- if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) {
- unsigned zreg =
- !AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains(
- RegOp.getReg())
- ? AArch64::WZR
- : AArch64::XZR;
- Operands[2] = AArch64Operand::CreateReg(zreg, false, Op.getStartLoc(),
- Op.getEndLoc(), getContext());
- }
- }
-
MCInst Inst;
// First try to match against the secondary set of tables containing the
// short-form NEON instructions (e.g. "fadd.2s v0, v1, v2").
@@ -4064,7 +3714,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return Error(IDLoc, Msg);
}
case Match_MnemonicFail:
- return showMatchError(IDLoc, MatchResult);
+ return showMatchError(IDLoc, MatchResult, Operands);
case Match_InvalidOperand: {
SMLoc ErrorLoc = IDLoc;
@@ -4083,7 +3733,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix())
MatchResult = Match_InvalidSuffix;
- return showMatchError(ErrorLoc, MatchResult);
+ return showMatchError(ErrorLoc, MatchResult, Operands);
}
case Match_InvalidMemoryIndexed1:
case Match_InvalidMemoryIndexed2:
@@ -4120,6 +3770,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidImm0_31:
case Match_InvalidImm0_63:
case Match_InvalidImm0_127:
+ case Match_InvalidImm0_255:
case Match_InvalidImm0_65535:
case Match_InvalidImm1_8:
case Match_InvalidImm1_16:
@@ -4140,7 +3791,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc();
if (ErrorLoc == SMLoc())
ErrorLoc = IDLoc;
- return showMatchError(ErrorLoc, MatchResult);
+ return showMatchError(ErrorLoc, MatchResult, Operands);
}
}
@@ -4260,10 +3911,14 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) {
return false;
}
+static SMLoc incrementLoc(SMLoc L, int Offset) {
+ return SMLoc::getFromPointer(L.getPointer() + Offset);
+}
+
/// parseDirectiveCPU
/// ::= .cpu id
bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
- SMLoc CPULoc = getLoc();
+ SMLoc CurLoc = getLoc();
StringRef CPU, ExtensionString;
std::tie(CPU, ExtensionString) =
@@ -4279,15 +3934,19 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
// FIXME This is using tablegen data, but should be moved to ARMTargetParser
// once that is tablegen'ed
if (!getSTI().isCPUStringValid(CPU)) {
- Error(CPULoc, "unknown CPU name");
+ Error(CurLoc, "unknown CPU name");
return false;
}
MCSubtargetInfo &STI = copySTI();
STI.setDefaultFeatures(CPU, "");
+ CurLoc = incrementLoc(CurLoc, CPU.size());
FeatureBitset Features = STI.getFeatureBits();
for (auto Name : RequestedExtensions) {
+ // Advance source location past '+'.
+ CurLoc = incrementLoc(CurLoc, 1);
+
bool EnableFeature = true;
if (Name.startswith_lower("no")) {
@@ -4295,6 +3954,7 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
Name = Name.substr(2);
}
+ bool FoundExtension = false;
for (const auto &Extension : ExtensionMap) {
if (Extension.Name != Name)
continue;
@@ -4308,9 +3968,15 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) {
uint64_t Features =
ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures));
setAvailableFeatures(Features);
+ FoundExtension = true;
break;
}
+
+ if (!FoundExtension)
+ Error(CurLoc, "unsupported architectural extension");
+
+ CurLoc = incrementLoc(CurLoc, Name.size());
}
return false;
}
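
The diagnostics now point into the directive itself: CurLoc starts at the CPU name, is advanced by CPU.size() to the first '+', and then by 1 + Name.size() per extension, so an unsupported extension is reported at its own column. A small standalone sketch of that offset bookkeeping over the operand "generic+crypto+bogus" (plain pointer arithmetic standing in for SMLoc):

#include <cassert>
#include <cstring>
#include <string>

int main() {
  const char *Operand = "generic+crypto+bogus";     // text after ".cpu "
  const char *CurLoc = Operand;                     // stands in for SMLoc

  std::string CPU(Operand, std::strchr(Operand, '+') - Operand);
  CurLoc += CPU.size();                             // now at the first '+'
  assert(*CurLoc == '+');

  const char *Exts[] = {"crypto", "bogus"};
  for (const char *Name : Exts) {
    CurLoc += 1;                                    // step past '+'
    // An "unsupported architectural extension" error would point here.
    assert(std::strncmp(CurLoc, Name, std::strlen(Name)) == 0);
    CurLoc += std::strlen(Name);                    // step past the extension
  }
  assert(*CurLoc == '\0');
  return 0;
}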
diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 0d860a7..7870dce 100644
--- a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -756,7 +756,7 @@ static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
// if shift == '11' then ReservedValue()
if (shiftHi == 0x3)
return Fail;
- // Deliberate fallthrough
+ LLVM_FALLTHROUGH;
case AArch64::ANDWrs:
case AArch64::ANDSWrs:
case AArch64::BICWrs:
@@ -780,7 +780,7 @@ static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
// if shift == '11' then ReservedValue()
if (shiftHi == 0x3)
return Fail;
- // Deliberate fallthrough
+ LLVM_FALLTHROUGH;
case AArch64::ANDXrs:
case AArch64::ANDSXrs:
case AArch64::BICXrs:
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index b4f8520..fc89657 100644
--- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -16,12 +16,21 @@
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+
using namespace llvm;
#define DEBUG_TYPE "asm-printer"
@@ -267,6 +276,12 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
}
}
+ if (Opcode == AArch64::CompilerBarrier) {
+ O << '\t' << MAI.getCommentString() << " COMPILER BARRIER";
+ printAnnotation(O, Annot);
+ return;
+ }
+
if (!printAliasInstr(MI, STI, O))
printInstruction(MI, STI, O);
@@ -451,8 +466,8 @@ static const LdStNInstrDesc LdStNInstInfo[] = {
{ AArch64::LD3i64, "ld3", ".d", 1, true, 0 },
{ AArch64::LD3i8_POST, "ld3", ".b", 2, true, 3 },
{ AArch64::LD3i16_POST, "ld3", ".h", 2, true, 6 },
- { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 },
- { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 },
+ { AArch64::LD3i32_POST, "ld3", ".s", 2, true, 12 },
+ { AArch64::LD3i64_POST, "ld3", ".d", 2, true, 24 },
{ AArch64::LD3Rv16b, "ld3r", ".16b", 0, false, 0 },
{ AArch64::LD3Rv8h, "ld3r", ".8h", 0, false, 0 },
{ AArch64::LD3Rv4s, "ld3r", ".4s", 0, false, 0 },
@@ -731,7 +746,6 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!");
#endif
- const char *Asm = nullptr;
const MCOperand &Op1 = MI->getOperand(0);
const MCOperand &Cn = MI->getOperand(1);
const MCOperand &Cm = MI->getOperand(2);
@@ -742,230 +756,74 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
unsigned CmVal = Cm.getImm();
unsigned Op2Val = Op2.getImm();
+ uint16_t Encoding = Op2Val;
+ Encoding |= CmVal << 3;
+ Encoding |= CnVal << 7;
+ Encoding |= Op1Val << 11;
+
+ bool NeedsReg;
+ std::string Ins;
+ std::string Name;
+
if (CnVal == 7) {
switch (CmVal) {
- default:
- break;
-
+ default: return false;
// IC aliases
- case 1:
- if (Op1Val == 0 && Op2Val == 0)
- Asm = "ic\tialluis";
- break;
- case 5:
- if (Op1Val == 0 && Op2Val == 0)
- Asm = "ic\tiallu";
- else if (Op1Val == 3 && Op2Val == 1)
- Asm = "ic\tivau";
- break;
-
+ case 1: case 5: {
+ const AArch64IC::IC *IC = AArch64IC::lookupICByEncoding(Encoding);
+ if (!IC || !IC->haveFeatures(STI.getFeatureBits()))
+ return false;
+
+ NeedsReg = IC->NeedsReg;
+ Ins = "ic\t";
+ Name = std::string(IC->Name);
+ }
+ break;
// DC aliases
- case 4:
- if (Op1Val == 3 && Op2Val == 1)
- Asm = "dc\tzva";
- break;
- case 6:
- if (Op1Val == 0 && Op2Val == 1)
- Asm = "dc\tivac";
- if (Op1Val == 0 && Op2Val == 2)
- Asm = "dc\tisw";
- break;
- case 10:
- if (Op1Val == 3 && Op2Val == 1)
- Asm = "dc\tcvac";
- else if (Op1Val == 0 && Op2Val == 2)
- Asm = "dc\tcsw";
- break;
- case 11:
- if (Op1Val == 3 && Op2Val == 1)
- Asm = "dc\tcvau";
- break;
- case 12:
- if (Op1Val == 3 && Op2Val == 1 &&
- (STI.getFeatureBits()[AArch64::HasV8_2aOps]))
- Asm = "dc\tcvap";
- break;
- case 14:
- if (Op1Val == 3 && Op2Val == 1)
- Asm = "dc\tcivac";
- else if (Op1Val == 0 && Op2Val == 2)
- Asm = "dc\tcisw";
- break;
-
+ case 4: case 6: case 10: case 11: case 12: case 14:
+ {
+ const AArch64DC::DC *DC = AArch64DC::lookupDCByEncoding(Encoding);
+ if (!DC || !DC->haveFeatures(STI.getFeatureBits()))
+ return false;
+
+ NeedsReg = true;
+ Ins = "dc\t";
+ Name = std::string(DC->Name);
+ }
+ break;
// AT aliases
- case 8:
- switch (Op1Val) {
- default:
- break;
- case 0:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "at\ts1e1r"; break;
- case 1: Asm = "at\ts1e1w"; break;
- case 2: Asm = "at\ts1e0r"; break;
- case 3: Asm = "at\ts1e0w"; break;
- }
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "at\ts1e2r"; break;
- case 1: Asm = "at\ts1e2w"; break;
- case 4: Asm = "at\ts12e1r"; break;
- case 5: Asm = "at\ts12e1w"; break;
- case 6: Asm = "at\ts12e0r"; break;
- case 7: Asm = "at\ts12e0w"; break;
- }
- break;
- case 6:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "at\ts1e3r"; break;
- case 1: Asm = "at\ts1e3w"; break;
- }
- break;
- }
- break;
- case 9:
- switch (Op1Val) {
- default:
- break;
- case 0:
- if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) {
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "at\ts1e1rp"; break;
- case 1: Asm = "at\ts1e1wp"; break;
- }
- }
- break;
- }
+ case 8: case 9: {
+ const AArch64AT::AT *AT = AArch64AT::lookupATByEncoding(Encoding);
+ if (!AT || !AT->haveFeatures(STI.getFeatureBits()))
+ return false;
+
+ NeedsReg = true;
+ Ins = "at\t";
+ Name = std::string(AT->Name);
+ }
+ break;
}
} else if (CnVal == 8) {
// TLBI aliases
- switch (CmVal) {
- default:
- break;
- case 3:
- switch (Op1Val) {
- default:
- break;
- case 0:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\tvmalle1is"; break;
- case 1: Asm = "tlbi\tvae1is"; break;
- case 2: Asm = "tlbi\taside1is"; break;
- case 3: Asm = "tlbi\tvaae1is"; break;
- case 5: Asm = "tlbi\tvale1is"; break;
- case 7: Asm = "tlbi\tvaale1is"; break;
- }
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\talle2is"; break;
- case 1: Asm = "tlbi\tvae2is"; break;
- case 4: Asm = "tlbi\talle1is"; break;
- case 5: Asm = "tlbi\tvale2is"; break;
- case 6: Asm = "tlbi\tvmalls12e1is"; break;
- }
- break;
- case 6:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\talle3is"; break;
- case 1: Asm = "tlbi\tvae3is"; break;
- case 5: Asm = "tlbi\tvale3is"; break;
- }
- break;
- }
- break;
- case 0:
- switch (Op1Val) {
- default:
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 1: Asm = "tlbi\tipas2e1is"; break;
- case 5: Asm = "tlbi\tipas2le1is"; break;
- }
- break;
- }
- break;
- case 4:
- switch (Op1Val) {
- default:
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 1: Asm = "tlbi\tipas2e1"; break;
- case 5: Asm = "tlbi\tipas2le1"; break;
- }
- break;
- }
- break;
- case 7:
- switch (Op1Val) {
- default:
- break;
- case 0:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\tvmalle1"; break;
- case 1: Asm = "tlbi\tvae1"; break;
- case 2: Asm = "tlbi\taside1"; break;
- case 3: Asm = "tlbi\tvaae1"; break;
- case 5: Asm = "tlbi\tvale1"; break;
- case 7: Asm = "tlbi\tvaale1"; break;
- }
- break;
- case 4:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\talle2"; break;
- case 1: Asm = "tlbi\tvae2"; break;
- case 4: Asm = "tlbi\talle1"; break;
- case 5: Asm = "tlbi\tvale2"; break;
- case 6: Asm = "tlbi\tvmalls12e1"; break;
- }
- break;
- case 6:
- switch (Op2Val) {
- default:
- break;
- case 0: Asm = "tlbi\talle3"; break;
- case 1: Asm = "tlbi\tvae3"; break;
- case 5: Asm = "tlbi\tvale3"; break;
- }
- break;
- }
- break;
- }
+ const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByEncoding(Encoding);
+ if (!TLBI || !TLBI->haveFeatures(STI.getFeatureBits()))
+ return false;
+
+ NeedsReg = TLBI->NeedsReg;
+ Ins = "tlbi\t";
+ Name = std::string(TLBI->Name);
}
+ else
+ return false;
- if (Asm) {
- unsigned Reg = MI->getOperand(4).getReg();
+ std::string Str = Ins + Name;
+ std::transform(Str.begin(), Str.end(), Str.begin(), ::tolower);
- O << '\t' << Asm;
- if (StringRef(Asm).lower().find("all") == StringRef::npos)
- O << ", " << getRegisterName(Reg);
- }
+ O << '\t' << Str;
+ if (NeedsReg)
+ O << ", " << getRegisterName(MI->getOperand(4).getReg());
- return Asm != nullptr;
+ return true;
}
void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 65dca99..a45258c 100644
--- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -37,9 +38,11 @@ public:
unsigned PrintMethodIdx,
const MCSubtargetInfo &STI,
raw_ostream &O);
+
virtual StringRef getRegName(unsigned RegNo) const {
return getRegisterName(RegNo);
}
+
static const char *getRegisterName(unsigned RegNo,
unsigned AltIdx = AArch64::NoRegAltName);
@@ -177,12 +180,15 @@ public:
unsigned PrintMethodIdx,
const MCSubtargetInfo &STI,
raw_ostream &O) override;
+
StringRef getRegName(unsigned RegNo) const override {
return getRegisterName(RegNo);
}
+
static const char *getRegisterName(unsigned RegNo,
unsigned AltIdx = AArch64::NoRegAltName);
};
-}
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 14c0327..2bd0cbf 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -11,8 +11,9 @@
#include "AArch64RegisterInfo.h"
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCAssembler.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCELFObjectWriter.h"
@@ -22,7 +23,6 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachO.h"
using namespace llvm;
namespace {
@@ -43,26 +43,25 @@ public:
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = {
- // This table *must* be in the order that the fixup_* kinds are defined in
- // AArch64FixupKinds.h.
- //
- // Name Offset (bits) Size (bits) Flags
- { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal },
- { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal },
- { "fixup_aarch64_add_imm12", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 },
- { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 },
- { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal },
- { "fixup_aarch64_movw", 5, 16, 0 },
- { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal },
- { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal },
- { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal },
- { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal },
- { "fixup_aarch64_tlsdesc_call", 0, 0, 0 }
- };
+ // This table *must* be in the order that the fixup_* kinds are defined
+ // in AArch64FixupKinds.h.
+ //
+ // Name Offset (bits) Size (bits) Flags
+ {"fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal},
+ {"fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal},
+ {"fixup_aarch64_add_imm12", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale1", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale2", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale4", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale8", 10, 12, 0},
+ {"fixup_aarch64_ldst_imm12_scale16", 10, 12, 0},
+ {"fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal},
+ {"fixup_aarch64_movw", 5, 16, 0},
+ {"fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal},
+ {"fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal},
+ {"fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal},
+ {"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal},
+ {"fixup_aarch64_tlsdesc_call", 0, 0, 0}};
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -72,8 +71,9 @@ public:
return Infos[Kind - FirstTargetFixupKind];
}
- void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
- uint64_t Value, bool IsPCRel) const override;
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved) const override;
bool mayNeedRelaxation(const MCInst &Inst) const override;
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -104,8 +104,9 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case FK_Data_1:
return 1;
- case FK_Data_2:
case AArch64::fixup_aarch64_movw:
+ case FK_Data_2:
+ case FK_SecRel_2:
return 2;
case AArch64::fixup_aarch64_pcrel_branch14:
@@ -124,6 +125,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case AArch64::fixup_aarch64_pcrel_branch26:
case AArch64::fixup_aarch64_pcrel_call26:
case FK_Data_4:
+ case FK_SecRel_4:
return 4;
case FK_Data_8:
@@ -138,15 +140,15 @@ static unsigned AdrImmBits(unsigned Value) {
}
static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
- MCContext *Ctx) {
+ MCContext &Ctx) {
unsigned Kind = Fixup.getKind();
int64_t SignedValue = static_cast<int64_t>(Value);
switch (Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
case AArch64::fixup_aarch64_pcrel_adr_imm21:
- if (Ctx && (SignedValue > 2097151 || SignedValue < -2097152))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (SignedValue > 2097151 || SignedValue < -2097152)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
return AdrImmBits(Value & 0x1fffffULL);
case AArch64::fixup_aarch64_pcrel_adrp_imm21:
return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
@@ -154,71 +156,72 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
case AArch64::fixup_aarch64_pcrel_branch19:
// Signed 21-bit immediate
if (SignedValue > 2097151 || SignedValue < -2097152)
- if (Ctx) Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0x3))
- Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0x3)
+ Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
// Low two bits are not encoded.
return (Value >> 2) & 0x7ffff;
case AArch64::fixup_aarch64_add_imm12:
case AArch64::fixup_aarch64_ldst_imm12_scale1:
// Unsigned 12-bit immediate
- if (Ctx && Value >= 0x1000)
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value >= 0x1000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
return Value;
case AArch64::fixup_aarch64_ldst_imm12_scale2:
// Unsigned 12-bit immediate which gets multiplied by 2
- if (Ctx && (Value >= 0x2000))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0x1))
- Ctx->reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
+ if (Value >= 0x2000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0x1)
+ Ctx.reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
return Value >> 1;
case AArch64::fixup_aarch64_ldst_imm12_scale4:
// Unsigned 12-bit immediate which gets multiplied by 4
- if (Ctx && (Value >= 0x4000))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0x3))
- Ctx->reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
+ if (Value >= 0x4000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0x3)
+ Ctx.reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
return Value >> 2;
case AArch64::fixup_aarch64_ldst_imm12_scale8:
// Unsigned 12-bit immediate which gets multiplied by 8
- if (Ctx && (Value >= 0x8000))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0x7))
- Ctx->reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
+ if (Value >= 0x8000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0x7)
+ Ctx.reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
return Value >> 3;
case AArch64::fixup_aarch64_ldst_imm12_scale16:
// Unsigned 12-bit immediate which gets multiplied by 16
- if (Ctx && (Value >= 0x10000))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
- if (Ctx && (Value & 0xf))
- Ctx->reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
+ if (Value >= 0x10000)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+ if (Value & 0xf)
+ Ctx.reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
return Value >> 4;
case AArch64::fixup_aarch64_movw:
- if (Ctx)
- Ctx->reportError(Fixup.getLoc(),
- "no resolvable MOVZ/MOVK fixups supported yet");
+ Ctx.reportError(Fixup.getLoc(),
+ "no resolvable MOVZ/MOVK fixups supported yet");
return Value;
case AArch64::fixup_aarch64_pcrel_branch14:
// Signed 16-bit immediate
- if (Ctx && (SignedValue > 32767 || SignedValue < -32768))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (SignedValue > 32767 || SignedValue < -32768)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
// Low two bits are not encoded (4-byte alignment assumed).
- if (Ctx && (Value & 0x3))
- Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+ if (Value & 0x3)
+ Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
return (Value >> 2) & 0x3fff;
case AArch64::fixup_aarch64_pcrel_branch26:
case AArch64::fixup_aarch64_pcrel_call26:
// Signed 28-bit immediate
- if (Ctx && (SignedValue > 134217727 || SignedValue < -134217728))
- Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+ if (SignedValue > 134217727 || SignedValue < -134217728)
+ Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
// Low two bits are not encoded (4-byte alignment assumed).
- if (Ctx && (Value & 0x3))
- Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+ if (Value & 0x3)
+ Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
return (Value >> 2) & 0x3ffffff;
case FK_Data_1:
case FK_Data_2:
case FK_Data_4:
case FK_Data_8:
+ case FK_SecRel_2:
+ case FK_SecRel_4:
return Value;
}
}
@@ -262,21 +265,23 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con
}
}
-void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
- unsigned DataSize, uint64_t Value,
- bool IsPCRel) const {
+void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data, uint64_t Value,
+ bool IsResolved) const {
unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
if (!Value)
return; // Doesn't change encoding.
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+ MCContext &Ctx = Asm.getContext();
// Apply any target-specific value adjustments.
- Value = adjustFixupValue(Fixup, Value, nullptr);
+ Value = adjustFixupValue(Fixup, Value, Ctx);
// Shift the value into position.
Value <<= Info.TargetOffset;
unsigned Offset = Fixup.getOffset();
- assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
// Used to point to big endian bytes.
unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind());
@@ -290,7 +295,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
}
} else {
// Handle as big-endian
- assert((Offset + FulleSizeInBytes) <= DataSize && "Invalid fixup size!");
+ assert((Offset + FulleSizeInBytes) <= Data.size() && "Invalid fixup size!");
assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!");
for (unsigned i = 0; i != NumBytes; ++i) {
unsigned Idx = FulleSizeInBytes - 1 - i;
@@ -521,17 +526,6 @@ public:
return CompactUnwindEncoding;
}
-
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override {
- // Try to get the encoded value for the fixup as-if we're mapping it into
- // the instruction. This allows adjustFixupValue() to issue a diagnostic
- // if the value is invalid.
- if (IsResolved)
- (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
- }
};
} // end anonymous namespace
@@ -551,16 +545,13 @@ public:
return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian, IsILP32);
}
- void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFixup &Fixup, const MCFragment *DF,
- const MCValue &Target, uint64_t &Value,
- bool &IsResolved) override;
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override;
};
-void ELFAArch64AsmBackend::processFixupValue(
- const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup,
- const MCFragment *DF, const MCValue &Target, uint64_t &Value,
- bool &IsResolved) {
+bool ELFAArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm,
+ const MCFixup &Fixup,
+ const MCValue &Target) {
// The ADRP instruction adds some multiple of 0x1000 to the current PC &
// ~0xfff. This means that the required offset to reach a symbol can vary by
// up to one step depending on where the ADRP is in memory. For example:
@@ -574,15 +565,22 @@ void ELFAArch64AsmBackend::processFixupValue(
// section isn't 0x1000-aligned, we therefore need to delegate this decision
// to the linker -- a relocation!
if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
- IsResolved = false;
+ return true;
+ return false;
+}
- // Try to get the encoded value for the fixup as-if we're mapping it into
- // the instruction. This allows adjustFixupValue() to issue a diagnostic
- // if the value is invalid.
- if (IsResolved)
- (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
}
+namespace {
+class COFFAArch64AsmBackend : public AArch64AsmBackend {
+public:
+ COFFAArch64AsmBackend(const Target &T, const Triple &TheTriple)
+ : AArch64AsmBackend(T, /*IsLittleEndian*/true) {}
+
+ MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ return createAArch64WinCOFFObjectWriter(OS);
+ }
+};
}
MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
@@ -593,7 +591,11 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
if (TheTriple.isOSBinFormatMachO())
return new DarwinAArch64AsmBackend(T, MRI);
- assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
+ if (TheTriple.isOSBinFormatCOFF())
+ return new COFFAArch64AsmBackend(T, TheTriple);
+
+ assert(TheTriple.isOSBinFormatELF() && "Invalid target");
+
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
bool IsILP32 = Options.getABIName() == "ilp32";
return new ELFAArch64AsmBackend(T, OSABI, /*IsLittleEndian=*/true, IsILP32);
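A minimal standalone C++ sketch (not LLVM code; the offset/size constants are taken from the fixup table in the hunk above) of what applyFixup does once adjustFixupValue can always report through an MCContext: validate the value, shift it by the fixup's bit offset, and OR it into the little-endian instruction bytes. fixup_aarch64_pcrel_branch19 (offset 5, size 19) is used as the example, with error reporting reduced to a printf.

// Standalone illustration only; applyBranch19 and the sample encoding are
// assumptions for this sketch, not functions or data from the patch.
#include <cstdint>
#include <cstdio>

static bool applyBranch19(uint8_t Insn[4], int64_t Value) {
  if (Value > 2097151 || Value < -2097152) {         // same range check as the patch
    std::puts("error: fixup value out of range");
    return false;
  }
  if (Value & 0x3) {                                 // must be 4-byte aligned
    std::puts("error: fixup not sufficiently aligned");
    return false;
  }
  uint64_t Encoded = (static_cast<uint64_t>(Value) >> 2) & 0x7ffff; // 19 bits kept
  Encoded <<= 5;                                     // TargetOffset from the fixup table
  for (unsigned i = 0; i != 4; ++i)                  // little-endian byte patch
    Insn[i] |= uint8_t((Encoded >> (i * 8)) & 0xff);
  return true;
}

int main() {
  uint8_t CBZ[4] = {0x00, 0x00, 0x00, 0xb4};         // cbz x0, #0 (imm19 == 0)
  if (applyBranch19(CBZ, 0x20))                      // branch forward 32 bytes
    std::printf("%02x %02x %02x %02x\n", CBZ[0], CBZ[1], CBZ[2], CBZ[3]);
  return 0;
}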
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index c954c0e..89c3e5b 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -15,11 +15,11 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCExpr.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
#include <cstdint>
@@ -49,10 +49,11 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI,
/*HasRelocationAddend*/ true),
IsILP32(IsILP32) {}
-#define R_CLS(rtype) \
- IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype
-#define BAD_ILP32_MOV(lp64rtype) "ILP32 absolute MOV relocation not "\
- "supported (LP64 eqv: " #lp64rtype ")"
+#define R_CLS(rtype) \
+ IsILP32 ? ELF::R_AARCH64_P32_##rtype : ELF::R_AARCH64_##rtype
+#define BAD_ILP32_MOV(lp64rtype) \
+ "ILP32 absolute MOV relocation not " \
+ "supported (LP64 eqv: " #lp64rtype ")"
// assumes IsILP32 is true
static bool isNonILP32reloc(const MCFixup &Fixup,
@@ -60,44 +61,45 @@ static bool isNonILP32reloc(const MCFixup &Fixup,
MCContext &Ctx) {
if ((unsigned)Fixup.getKind() != AArch64::fixup_aarch64_movw)
return false;
- switch(RefKind) {
- case AArch64MCExpr::VK_ABS_G3:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G3));
- return true;
- case AArch64MCExpr::VK_ABS_G2:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2));
- return true;
- case AArch64MCExpr::VK_ABS_G2_S:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G2));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_ABS_G2_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2_NC));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_ABS_G1_S:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G1));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_ABS_G1_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G1_NC));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_DTPREL_G2:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_DTPREL_G1_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_TPREL_G2:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_TPREL_G1_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_GOTTPREL_G1:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1));
- return ELF::R_AARCH64_NONE;
- case AArch64MCExpr::VK_GOTTPREL_G0_NC:
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC));
- return ELF::R_AARCH64_NONE;
- default: return false;
+ switch (RefKind) {
+ case AArch64MCExpr::VK_ABS_G3:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G3));
+ return true;
+ case AArch64MCExpr::VK_ABS_G2:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2));
+ return true;
+ case AArch64MCExpr::VK_ABS_G2_S:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G2));
+ return true;
+ case AArch64MCExpr::VK_ABS_G2_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G2_NC));
+ return true;
+ case AArch64MCExpr::VK_ABS_G1_S:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_SABS_G1));
+ return true;
+ case AArch64MCExpr::VK_ABS_G1_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(MOVW_UABS_G1_NC));
+ return true;
+ case AArch64MCExpr::VK_DTPREL_G2:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G2));
+ return true;
+ case AArch64MCExpr::VK_DTPREL_G1_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLD_MOVW_DTPREL_G1_NC));
+ return true;
+ case AArch64MCExpr::VK_TPREL_G2:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G2));
+ return true;
+ case AArch64MCExpr::VK_TPREL_G1_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSLE_MOVW_TPREL_G1_NC));
+ return true;
+ case AArch64MCExpr::VK_GOTTPREL_G1:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G1));
+ return true;
+ case AArch64MCExpr::VK_GOTTPREL_G0_NC:
+ Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(TLSIE_MOVW_GOTTPREL_G0_NC));
+ return true;
+ default:
+ return false;
}
return false;
}
@@ -130,7 +132,8 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(PREL32);
case FK_Data_8:
if (IsILP32) {
- Ctx.reportError(Fixup.getLoc(), "ILP32 8 byte PC relative data "
+ Ctx.reportError(Fixup.getLoc(),
+ "ILP32 8 byte PC relative data "
"relocation not supported (LP64 eqv: PREL64)");
return ELF::R_AARCH64_NONE;
} else
@@ -141,6 +144,16 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
case AArch64::fixup_aarch64_pcrel_adrp_imm21:
if (SymLoc == AArch64MCExpr::VK_ABS && !IsNC)
return R_CLS(ADR_PREL_PG_HI21);
+ if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) {
+ if (IsILP32) {
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 32-bit pcrel ADRP instruction "
+ "VK_ABS VK_NC");
+ return ELF::R_AARCH64_NONE;
+ } else {
+ return ELF::R_AARCH64_ADR_PREL_PG_HI21_NC;
+ }
+ }
if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC)
return R_CLS(ADR_GOT_PAGE);
if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC)
@@ -168,7 +181,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
}
} else {
if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx))
- return ELF::R_AARCH64_NONE;
+ return ELF::R_AARCH64_NONE;
switch ((unsigned)Fixup.getKind()) {
case FK_Data_1:
Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
@@ -179,7 +192,9 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(ABS32);
case FK_Data_8:
if (IsILP32) {
- Ctx.reportError(Fixup.getLoc(), BAD_ILP32_MOV(ABS64));
+ Ctx.reportError(Fixup.getLoc(),
+ "ILP32 8 byte absolute data "
+ "relocation not supported (LP64 eqv: ABS64)");
return ELF::R_AARCH64_NONE;
} else
return ELF::R_AARCH64_ABS64;
@@ -197,7 +212,7 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
if (RefKind == AArch64MCExpr::VK_TPREL_LO12)
return R_CLS(TLSLE_ADD_TPREL_LO12);
if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12)
- return R_CLS(TLSDESC_ADD_LO12_NC);
+ return R_CLS(TLSDESC_ADD_LO12);
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return R_CLS(ADD_ABS_LO12_NC);
@@ -245,15 +260,68 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(TLSLE_LDST32_TPREL_LO12);
if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
return R_CLS(TLSLE_LDST32_TPREL_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) {
+ if (IsILP32) {
+ return ELF::R_AARCH64_P32_LD32_GOT_LO12_NC;
+ } else {
+ Ctx.reportError(Fixup.getLoc(),
+ "LP64 4 byte unchecked GOT load/store relocation "
+ "not supported (ILP32 eqv: LD32_GOT_LO12_NC");
+ return ELF::R_AARCH64_NONE;
+ }
+ }
+ if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC) {
+ if (IsILP32) {
+ Ctx.reportError(Fixup.getLoc(),
+ "ILP32 4 byte checked GOT load/store relocation "
+ "not supported (unchecked eqv: LD32_GOT_LO12_NC)");
+ } else {
+ Ctx.reportError(Fixup.getLoc(),
+ "LP64 4 byte checked GOT load/store relocation "
+ "not supported (unchecked/ILP32 eqv: "
+ "LD32_GOT_LO12_NC)");
+ }
+ return ELF::R_AARCH64_NONE;
+ }
+ if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC) {
+ if (IsILP32) {
+ return ELF::R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC;
+ } else {
+ Ctx.reportError(Fixup.getLoc(),
+ "LP64 32-bit load/store "
+ "relocation not supported (ILP32 eqv: "
+ "TLSIE_LD32_GOTTPREL_LO12_NC)");
+ return ELF::R_AARCH64_NONE;
+ }
+ }
+ if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) {
+ if (IsILP32) {
+ return ELF::R_AARCH64_P32_TLSDESC_LD32_LO12;
+ } else {
+ Ctx.reportError(Fixup.getLoc(),
+ "LP64 4 byte TLSDESC load/store relocation "
+ "not supported (ILP32 eqv: TLSDESC_LD64_LO12)");
+ return ELF::R_AARCH64_NONE;
+ }
+ }
Ctx.reportError(Fixup.getLoc(),
- "invalid fixup for 32-bit load/store instruction");
+ "invalid fixup for 32-bit load/store instruction "
+ "fixup_aarch64_ldst_imm12_scale4");
return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale8:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return R_CLS(LDST64_ABS_LO12_NC);
- if (SymLoc == AArch64MCExpr::VK_GOT && IsNC)
- return R_CLS(LD64_GOT_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_GOT && IsNC) {
+ if (!IsILP32) {
+ return ELF::R_AARCH64_LD64_GOT_LO12_NC;
+ } else {
+ Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store "
+ "relocation not supported (LP64 eqv: "
+ "LD64_GOT_LO12_NC)");
+ return ELF::R_AARCH64_NONE;
+ }
+ }
if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
return R_CLS(TLSLD_LDST64_DTPREL_LO12);
if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
@@ -262,19 +330,40 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
return R_CLS(TLSLE_LDST64_TPREL_LO12);
if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
return R_CLS(TLSLE_LDST64_TPREL_LO12_NC);
- if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC)
- return IsILP32 ? ELF::R_AARCH64_P32_TLSIE_LD32_GOTTPREL_LO12_NC
- : ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
- if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC)
- return IsILP32 ? ELF::R_AARCH64_P32_TLSDESC_LD32_LO12_NC
- : ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
-
+ if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC) {
+ if (!IsILP32) {
+ return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
+ } else {
+ Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store "
+ "relocation not supported (LP64 eqv: "
+ "TLSIE_LD64_GOTTPREL_LO12_NC)");
+ return ELF::R_AARCH64_NONE;
+ }
+ }
+ if (SymLoc == AArch64MCExpr::VK_TLSDESC) {
+ if (!IsILP32) {
+ return ELF::R_AARCH64_TLSDESC_LD64_LO12;
+ } else {
+ Ctx.reportError(Fixup.getLoc(), "ILP32 64-bit load/store "
+ "relocation not supported (LP64 eqv: "
+ "TLSDESC_LD64_LO12)");
+ return ELF::R_AARCH64_NONE;
+ }
+ }
Ctx.reportError(Fixup.getLoc(),
"invalid fixup for 64-bit load/store instruction");
return ELF::R_AARCH64_NONE;
case AArch64::fixup_aarch64_ldst_imm12_scale16:
if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
return R_CLS(LDST128_ABS_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+ return R_CLS(TLSLD_LDST128_DTPREL_LO12);
+ if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+ return R_CLS(TLSLD_LDST128_DTPREL_LO12_NC);
+ if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+ return R_CLS(TLSLE_LDST128_TPREL_LO12);
+ if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+ return R_CLS(TLSLE_LDST128_TPREL_LO12_NC);
Ctx.reportError(Fixup.getLoc(),
"invalid fixup for 128-bit load/store instruction");
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 685907a..a0de3c3 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -14,27 +14,25 @@
//===----------------------------------------------------------------------===//
#include "AArch64TargetStreamer.h"
-#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringExtras.h"
+#include "AArch64WinCOFFStreamer.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
@@ -106,8 +104,8 @@ public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- void EmitInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI) override {
+ void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+ bool) override {
EmitA64MappingSymbol();
MCELFStreamer::EmitInstruction(Inst, STI);
}
@@ -180,6 +178,7 @@ private:
DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
ElfMappingSymbol LastEMS;
};
+
} // end anonymous namespace
AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
@@ -191,6 +190,7 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
}
namespace llvm {
+
MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
MCInstPrinter *InstPrint,
@@ -212,6 +212,9 @@ createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
const Triple &TT = STI.getTargetTriple();
if (TT.isOSBinFormatELF())
return new AArch64TargetELFStreamer(S);
+ if (TT.isOSBinFormatCOFF())
+ return new AArch64TargetWinCOFFStreamer(S);
return nullptr;
}
-}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
index 0f5b765..4293dcb 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -16,53 +16,47 @@ namespace llvm {
namespace AArch64 {
enum Fixups {
- // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into
- // an ADR instruction.
+ // A 21-bit pc-relative immediate inserted into an ADR instruction.
fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind,
- // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into
- // an ADRP instruction.
+ // A 21-bit pc-relative immediate inserted into an ADRP instruction.
fixup_aarch64_pcrel_adrp_imm21,
- // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions.
- // No alignment adjustment. All value bits are encoded.
+ // 12-bit fixup for add/sub instructions. No alignment adjustment. All value
+ // bits are encoded.
fixup_aarch64_add_imm12,
- // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and
- // store instructions.
+ // unsigned 12-bit fixups for load and store instructions.
fixup_aarch64_ldst_imm12_scale1,
fixup_aarch64_ldst_imm12_scale2,
fixup_aarch64_ldst_imm12_scale4,
fixup_aarch64_ldst_imm12_scale8,
fixup_aarch64_ldst_imm12_scale16,
- // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative
- // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by
- // pc-relative loads and generates relocations directly when necessary.
+ // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as
+ // fixup_aarch64_pcrel_adrhi, except this is used by pc-relative loads and
+ // generates relocations directly when necessary.
fixup_aarch64_ldr_pcrel_imm19,
// FIXME: comment
fixup_aarch64_movw,
- // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative
- // immediate.
+ // The high 14 bits of a 21-bit pc-relative immediate.
fixup_aarch64_pcrel_branch14,
- // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative
- // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by
- // b.cc and generates relocations directly when necessary.
+ // The high 19 bits of a 21-bit pc-relative immediate. Same encoding as
+ // fixup_aarch64_pcrel_adrhi, except this is use by b.cc and generates
+ // relocations directly when necessary.
fixup_aarch64_pcrel_branch19,
- // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative
- // immediate.
+ // The high 26 bits of a 28-bit pc-relative immediate.
fixup_aarch64_pcrel_branch26,
- // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative
- // immediate. Distinguished from branch26 only on ELF.
+ // The high 26 bits of a 28-bit pc-relative immediate. Distinguished from
+ // branch26 only on ELF.
fixup_aarch64_pcrel_call26,
- // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF
- // R_AARCH64_TLSDESC_CALL relocation.
+ // zero-space placeholder for the ELF R_AARCH64_TLSDESC_CALL relocation.
fixup_aarch64_tlsdesc_call,
// Marker
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 8fc8223..c25bd8c 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -32,14 +32,15 @@ static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly")));
AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
- // We prefer NEON instructions to be printed in the short form.
- AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
+ // We prefer NEON instructions to be printed in the short, Apple-specific
+ // form when targeting Darwin.
+ AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant;
PrivateGlobalPrefix = "L";
PrivateLabelPrefix = "L";
SeparatorString = "%%";
CommentString = ";";
- PointerSize = CalleeSaveStackSlotSize = 8;
+ CodePointerSize = CalleeSaveStackSlotSize = 8;
AlignmentIsInBytes = false;
UsesELFSectionDirectiveForBSS = true;
@@ -68,10 +69,11 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
if (T.getArch() == Triple::aarch64_be)
IsLittleEndian = false;
- // We prefer NEON instructions to be printed in the short form.
- AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant;
+ // We prefer NEON instructions to be printed in the generic form when
+ // targeting ELF.
+ AssemblerDialect = AsmWriterVariant == Default ? Generic : AsmWriterVariant;
- PointerSize = 8;
+ CodePointerSize = 8;
// ".comm align is in bytes but .align is pow-2."
AlignmentIsInBytes = false;
@@ -98,3 +100,9 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
HasIdentDirective = true;
}
+
+AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() {
+ CommentString = ";";
+ PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
+}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 253cd30..2d7107a 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -14,6 +14,7 @@
#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCASMINFO_H
+#include "llvm/MC/MCAsmInfoCOFF.h"
#include "llvm/MC/MCAsmInfoDarwin.h"
#include "llvm/MC/MCAsmInfoELF.h"
@@ -33,6 +34,10 @@ struct AArch64MCAsmInfoELF : public MCAsmInfoELF {
explicit AArch64MCAsmInfoELF(const Triple &T);
};
+struct AArch64MCAsmInfoCOFF : public MCAsmInfoCOFF {
+ explicit AArch64MCAsmInfoCOFF();
+};
+
} // namespace llvm
#endif
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 62dfa59..33698d2 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -565,6 +565,9 @@ void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call);
Fixups.push_back(MCFixup::create(0, MI.getOperand(0).getExpr(), Fixup));
return;
+ } else if (MI.getOpcode() == AArch64::CompilerBarrier) {
+ // This just prevents the compiler from reordering accesses, no actual code.
+ return;
}
uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index a540f49..97c92fa 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -62,6 +62,7 @@ StringRef AArch64MCExpr::getVariantKindName() const {
case VK_TPREL_LO12_NC: return ":tprel_lo12_nc:";
case VK_TLSDESC_LO12: return ":tlsdesc_lo12:";
case VK_ABS_PAGE: return "";
+ case VK_ABS_PAGE_NC: return ":pg_hi21_nc:";
case VK_GOT_PAGE: return ":got:";
case VK_GOT_LO12: return ":got_lo12:";
case VK_GOTTPREL_PAGE: return ":gottprel:";
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index db36a65..3dbf0f8 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -62,6 +62,7 @@ public:
// since a user would write ":lo12:").
VK_CALL = VK_ABS,
VK_ABS_PAGE = VK_ABS | VK_PAGE,
+ VK_ABS_PAGE_NC = VK_ABS | VK_PAGE | VK_NC,
VK_ABS_G3 = VK_ABS | VK_G3,
VK_ABS_G2 = VK_ABS | VK_G2,
VK_ABS_G2_S = VK_SABS | VK_G2,
@@ -95,7 +96,7 @@ public:
VK_TPREL_HI12 = VK_TPREL | VK_HI12,
VK_TPREL_LO12 = VK_TPREL | VK_PAGEOFF,
VK_TPREL_LO12_NC = VK_TPREL | VK_PAGEOFF | VK_NC,
- VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF | VK_NC,
+ VK_TLSDESC_LO12 = VK_TLSDESC | VK_PAGEOFF,
VK_TLSDESC_PAGE = VK_TLSDESC | VK_PAGE,
VK_INVALID = 0xfff
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index e9d38d3..a255549 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -14,6 +14,7 @@
#include "AArch64MCTargetDesc.h"
#include "AArch64ELFStreamer.h"
#include "AArch64MCAsmInfo.h"
+#include "AArch64WinCOFFStreamer.h"
#include "InstPrinter/AArch64InstPrinter.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -59,8 +60,10 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
MCAsmInfo *MAI;
if (TheTriple.isOSBinFormatMachO())
MAI = new AArch64MCAsmInfoDarwin();
+ else if (TheTriple.isOSBinFormatCOFF())
+ MAI = new AArch64MCAsmInfoCOFF();
else {
- assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF");
+ assert(TheTriple.isOSBinFormatELF() && "Invalid target");
MAI = new AArch64MCAsmInfoELF(TheTriple);
}
@@ -74,8 +77,8 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
CodeModel::Model &CM) {
- assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) &&
- "Only expect Darwin and ELF targets");
+ assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO() ||
+ TT.isOSBinFormatCOFF()) && "Invalid target");
if (CM == CodeModel::Default)
CM = CodeModel::Small;
@@ -84,9 +87,14 @@ static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
// no matter how far away they are.
else if (CM == CodeModel::JITDefault)
CM = CodeModel::Large;
- else if (CM != CodeModel::Small && CM != CodeModel::Large)
- report_fatal_error(
- "Only small and large code models are allowed on AArch64");
+ else if (CM != CodeModel::Small && CM != CodeModel::Large) {
+ if (!TT.isOSFuchsia())
+ report_fatal_error(
+ "Only small and large code models are allowed on AArch64");
+ else if (CM != CodeModel::Kernel)
+ report_fatal_error(
+ "Only small, kernel, and large code models are allowed on AArch64");
+ }
}
static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T,
@@ -117,6 +125,14 @@ static MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB,
/*LabelSections*/ true);
}
+static MCStreamer *createWinCOFFStreamer(MCContext &Ctx, MCAsmBackend &TAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll,
+ bool IncrementalLinkerCompatible) {
+ return createAArch64WinCOFFStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
+ IncrementalLinkerCompatible);
+}
+
static MCInstrAnalysis *createAArch64InstrAnalysis(const MCInstrInfo *Info) {
return new MCInstrAnalysis(Info);
}
@@ -149,6 +165,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() {
// Register the obj streamers.
TargetRegistry::RegisterELFStreamer(*T, createELFStreamer);
TargetRegistry::RegisterMachOStreamer(*T, createMachOStreamer);
+ TargetRegistry::RegisterCOFFStreamer(*T, createWinCOFFStreamer);
// Register the obj target streamer.
TargetRegistry::RegisterObjectTargetStreamer(
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 615d7da..1404926 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -60,6 +60,8 @@ MCObjectWriter *createAArch64MachObjectWriter(raw_pwrite_stream &OS,
uint32_t CPUType,
uint32_t CPUSubtype);
+MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS);
+
MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
formatted_raw_ostream &OS,
MCInstPrinter *InstPrint,
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 53a6852..19b2576 100644
--- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -10,20 +10,28 @@
#include "MCTargetDesc/AArch64FixupKinds.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachO.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+
using namespace llvm;
namespace {
+
class AArch64MachObjectWriter : public MCMachObjectTargetWriter {
bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
const MCSymbolRefExpr *Sym,
@@ -38,7 +46,8 @@ public:
const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue) override;
};
-}
+
+} // end anonymous namespace
bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym,
@@ -51,18 +60,18 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
return false;
case FK_Data_1:
- Log2Size = llvm::Log2_32(1);
+ Log2Size = Log2_32(1);
return true;
case FK_Data_2:
- Log2Size = llvm::Log2_32(2);
+ Log2Size = Log2_32(2);
return true;
case FK_Data_4:
- Log2Size = llvm::Log2_32(4);
+ Log2Size = Log2_32(4);
if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
return true;
case FK_Data_8:
- Log2Size = llvm::Log2_32(8);
+ Log2Size = Log2_32(8);
if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
return true;
@@ -72,7 +81,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
case AArch64::fixup_aarch64_ldst_imm12_scale4:
case AArch64::fixup_aarch64_ldst_imm12_scale8:
case AArch64::fixup_aarch64_ldst_imm12_scale16:
- Log2Size = llvm::Log2_32(4);
+ Log2Size = Log2_32(4);
switch (Sym->getKind()) {
default:
return false;
@@ -87,14 +96,13 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
return true;
}
case AArch64::fixup_aarch64_pcrel_adrp_imm21:
- Log2Size = llvm::Log2_32(4);
+ Log2Size = Log2_32(4);
// This encompasses the relocation for the whole 21-bit value.
switch (Sym->getKind()) {
- default: {
+ default:
Asm.getContext().reportError(Fixup.getLoc(),
"ADR/ADRP relocations must be GOT relative");
return false;
- }
case MCSymbolRefExpr::VK_PAGE:
RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
return true;
@@ -108,7 +116,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
return true;
case AArch64::fixup_aarch64_pcrel_branch26:
case AArch64::fixup_aarch64_pcrel_call26:
- Log2Size = llvm::Log2_32(4);
+ Log2Size = Log2_32(4);
RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26);
return true;
}
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
new file mode 100644
index 0000000..31762b9
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp
@@ -0,0 +1,104 @@
+//= AArch64WinCOFFObjectWriter.cpp - AArch64 Windows COFF Object Writer C++ =//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace llvm;
+
+namespace {
+
+class AArch64WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+public:
+ AArch64WinCOFFObjectWriter()
+ : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) {}
+
+ ~AArch64WinCOFFObjectWriter() override = default;
+
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsCrossSection,
+ const MCAsmBackend &MAB) const override;
+
+ bool recordRelocation(const MCFixup &) const override;
+};
+
+} // end anonymous namespace
+
+unsigned AArch64WinCOFFObjectWriter::getRelocType(
+ MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup,
+ bool IsCrossSection, const MCAsmBackend &MAB) const {
+ auto Modifier = Target.isAbsolute() ? MCSymbolRefExpr::VK_None
+ : Target.getSymA()->getKind();
+
+ switch (static_cast<unsigned>(Fixup.getKind())) {
+ default: {
+ const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind());
+ report_fatal_error(Twine("unsupported relocation type: ") + Info.Name);
+ }
+
+ case FK_Data_4:
+ switch (Modifier) {
+ default:
+ return COFF::IMAGE_REL_ARM64_ADDR32;
+ case MCSymbolRefExpr::VK_COFF_IMGREL32:
+ return COFF::IMAGE_REL_ARM64_ADDR32NB;
+ case MCSymbolRefExpr::VK_SECREL:
+ return COFF::IMAGE_REL_ARM64_SECREL;
+ }
+
+ case FK_Data_8:
+ return COFF::IMAGE_REL_ARM64_ADDR64;
+
+ case FK_SecRel_2:
+ return COFF::IMAGE_REL_ARM64_SECTION;
+
+ case FK_SecRel_4:
+ return COFF::IMAGE_REL_ARM64_SECREL;
+
+ case AArch64::fixup_aarch64_add_imm12:
+ return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12A;
+
+ case AArch64::fixup_aarch64_ldst_imm12_scale1:
+ case AArch64::fixup_aarch64_ldst_imm12_scale2:
+ case AArch64::fixup_aarch64_ldst_imm12_scale4:
+ case AArch64::fixup_aarch64_ldst_imm12_scale8:
+ case AArch64::fixup_aarch64_ldst_imm12_scale16:
+ return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12L;
+
+ case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+ return COFF::IMAGE_REL_ARM64_PAGEBASE_REL21;
+
+ case AArch64::fixup_aarch64_pcrel_branch26:
+ case AArch64::fixup_aarch64_pcrel_call26:
+ return COFF::IMAGE_REL_ARM64_BRANCH26;
+ }
+}
+
+bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
+ return true;
+}
+
+namespace llvm {
+
+MCObjectWriter *createAArch64WinCOFFObjectWriter(raw_pwrite_stream &OS) {
+ MCWinCOFFObjectTargetWriter *MOTW = new AArch64WinCOFFObjectWriter();
+ return createWinCOFFObjectWriter(MOTW, OS);
+}
+
+} // end namespace llvm
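The only modifier-sensitive case in the new COFF writer is FK_Data_4, which becomes ADDR32, ADDR32NB (:imgrel:) or SECREL (:secrel:) depending on the symbol reference kind. A standalone sketch of that dispatch, with plain strings standing in for the VK_* and IMAGE_REL_ARM64_* enumerators.

#include <cstdio>
#include <cstring>

// Hypothetical helper for illustration; the real code switches on
// MCSymbolRefExpr kinds and returns COFF enumerators.
static const char *relocForData4(const char *Modifier) {
  if (std::strcmp(Modifier, "VK_COFF_IMGREL32") == 0)
    return "IMAGE_REL_ARM64_ADDR32NB"; // image-relative (:imgrel:)
  if (std::strcmp(Modifier, "VK_SECREL") == 0)
    return "IMAGE_REL_ARM64_SECREL";   // section-relative (:secrel:)
  return "IMAGE_REL_ARM64_ADDR32";     // plain 32-bit address
}

int main() {
  std::printf("%s\n", relocForData4("VK_SECREL"));
}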
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
new file mode 100644
index 0000000..6c8da27
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -0,0 +1,37 @@
+//===-- AArch64WinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64WinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+
+class AArch64WinCOFFStreamer : public MCWinCOFFStreamer {
+public:
+ friend class AArch64TargetWinCOFFStreamer;
+
+ AArch64WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter &CE,
+ raw_pwrite_stream &OS)
+ : MCWinCOFFStreamer(C, AB, CE, OS) {}
+};
+} // end anonymous namespace
+
+namespace llvm {
+MCWinCOFFStreamer
+*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll,
+ bool IncrementalLinkerCompatible) {
+ auto *S = new AArch64WinCOFFStreamer(Context, MAB, *Emitter, OS);
+ S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
+ return S;
+}
+
+} // end llvm namespace
diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
new file mode 100644
index 0000000..1b4fcd6
--- /dev/null
+++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.h
@@ -0,0 +1,43 @@
+//===-- AArch64WinCOFFStreamer.h - WinCOFF Streamer for AArch64 -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements WinCOFF streamer information for the AArch64 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64WINCOFFSTREAMER_H
+#define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64WINCOFFSTREAMER_H
+
+#include "AArch64TargetStreamer.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+namespace {
+class AArch64WinCOFFStreamer;
+
+class AArch64TargetWinCOFFStreamer : public llvm::AArch64TargetStreamer {
+private:
+ AArch64WinCOFFStreamer &getStreamer();
+
+public:
+ AArch64TargetWinCOFFStreamer(llvm::MCStreamer &S)
+ : AArch64TargetStreamer(S) {}
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+MCWinCOFFStreamer
+*createAArch64WinCOFFStreamer(MCContext &Context, MCAsmBackend &TAB,
+ raw_pwrite_stream &OS,
+ MCCodeEmitter *Emitter, bool RelaxAll,
+ bool IncrementalLinkerCompatible);
+} // end llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index dcc3917..5d76681 100644
--- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -266,82 +266,86 @@ inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
}
} // end namespace AArch64CC
+struct SysAlias {
+ const char *Name;
+ uint16_t Encoding;
+ FeatureBitset FeaturesRequired;
+
+ SysAlias (const char *N, uint16_t E) : Name(N), Encoding(E) {};
+ SysAlias (const char *N, uint16_t E, FeatureBitset F) :
+ Name(N), Encoding(E), FeaturesRequired(F) {};
+
+ bool haveFeatures(FeatureBitset ActiveFeatures) const {
+ return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+ }
+
+ FeatureBitset getRequiredFeatures() const { return FeaturesRequired; }
+};
+
+struct SysAliasReg : SysAlias {
+ bool NeedsReg;
+ SysAliasReg(const char *N, uint16_t E, bool R) : SysAlias(N, E), NeedsReg(R) {};
+};
+
namespace AArch64AT{
- struct AT {
- const char *Name;
- uint16_t Encoding;
+ struct AT : SysAlias {
+ using SysAlias::SysAlias;
};
-
#define GET_AT_DECL
#include "AArch64GenSystemOperands.inc"
-
}
+
namespace AArch64DB {
- struct DB {
- const char *Name;
- uint16_t Encoding;
+ struct DB : SysAlias {
+ using SysAlias::SysAlias;
};
-
#define GET_DB_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64DC {
- struct DC {
- const char *Name;
- uint16_t Encoding;
+ struct DC : SysAlias {
+ using SysAlias::SysAlias;
};
-
#define GET_DC_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64IC {
- struct IC {
- const char *Name;
- uint16_t Encoding;
- bool NeedsReg;
+ struct IC : SysAliasReg {
+ using SysAliasReg::SysAliasReg;
};
#define GET_IC_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64ISB {
- struct ISB {
- const char *Name;
- uint16_t Encoding;
+ struct ISB : SysAlias {
+ using SysAlias::SysAlias;
};
#define GET_ISB_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64PRFM {
- struct PRFM {
- const char *Name;
- uint16_t Encoding;
+ struct PRFM : SysAlias {
+ using SysAlias::SysAlias;
};
#define GET_PRFM_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64PState {
- struct PState {
- const char *Name;
- uint16_t Encoding;
- FeatureBitset FeaturesRequired;
-
- bool haveFeatures(FeatureBitset ActiveFeatures) const {
- return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
- }
+ struct PState : SysAlias{
+ using SysAlias::SysAlias;
};
#define GET_PSTATE_DECL
#include "AArch64GenSystemOperands.inc"
}
namespace AArch64PSBHint {
- struct PSB {
- const char *Name;
- uint16_t Encoding;
+ struct PSB : SysAlias {
+ using SysAlias::SysAlias;
};
#define GET_PSB_DECL
#include "AArch64GenSystemOperands.inc"
@@ -451,10 +455,8 @@ namespace AArch64SysReg {
}
namespace AArch64TLBI {
- struct TLBI {
- const char *Name;
- uint16_t Encoding;
- bool NeedsReg;
+ struct TLBI : SysAliasReg {
+ using SysAliasReg::SysAliasReg;
};
#define GET_TLBI_DECL
#include "AArch64GenSystemOperands.inc"