Diffstat (limited to 'contrib/llvm/lib/Target')
649 files changed, 74113 insertions, 24127 deletions
diff --git a/contrib/llvm/lib/Target/AArch64/AArch64.td b/contrib/llvm/lib/Target/AArch64/AArch64.td index 9a7d6c8..0bff9b5 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64.td @@ -32,6 +32,15 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", "Enable ARMv8 CRC-32 checksum instructions">; +def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", + "Enable ARMv8 PMUv3 Performance Monitors extension">; + +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Full FP16", [FeatureFPARMv8]>; + +def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", + "Enable Statistical Profiling extension">; + /// Cyclone has register move instructions which are "free". def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -40,6 +49,15 @@ def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", "Has zero-cycle zeroing instructions">; +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + +def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true", + "Reserve X18, making it unavailable " + "as a GPR">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -47,6 +65,9 @@ def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [FeatureCRC]>; +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + "Support ARM v8.2a instructions", [HasV8_1aOps]>; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// @@ -70,19 +91,29 @@ include "AArch64SchedA53.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", + [FeatureFPARMv8, + FeatureNEON, + FeatureCrypto, + FeatureCRC, + FeaturePerfMon]>; + def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [FeatureFPARMv8, FeatureNEON, FeatureCrypto, - FeatureCRC]>; + FeatureCRC, + FeaturePerfMon]>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", "Cortex-A57 ARM processors", [FeatureFPARMv8, FeatureNEON, FeatureCrypto, - FeatureCRC]>; + FeatureCRC, + FeaturePerfMon]>; def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", "Cyclone", @@ -90,12 +121,16 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", FeatureNEON, FeatureCrypto, FeatureCRC, + FeaturePerfMon, FeatureZCRegMove, FeatureZCZeroing]>; def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, FeatureNEON, - FeatureCRC]>; + FeatureCRC, + FeaturePerfMon]>; +// FIXME: Cortex-A35 is currently modelled as a Cortex-A53 +def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; // FIXME: Cortex-A72 is currently modelled as an Cortex-A57. 
@@ -109,11 +144,13 @@ def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; def GenericAsmParserVariant : AsmParserVariant { int Variant = 0; string Name = "generic"; + string BreakCharacters = "."; } def AppleAsmParserVariant : AsmParserVariant { int Variant = 1; string Name = "apple-neon"; + string BreakCharacters = "."; } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp index d7ef3f4..d215d9e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -122,7 +122,7 @@ AArch64A53Fix835769::runOnMachineFunction(MachineFunction &F) { static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB, const TargetInstrInfo *TII) { // Get the previous machine basic block in the function. - MachineFunction::iterator MBBI = *MBB; + MachineFunction::iterator MBBI(MBB); // Can't go off top of function. if (MBBI == MBB->getParent()->begin()) @@ -131,7 +131,7 @@ static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB, MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 2> Cond; - MachineBasicBlock *PrevBB = std::prev(MBBI); + MachineBasicBlock *PrevBB = &*std::prev(MBBI); for (MachineBasicBlock *S : MBB->predecessors()) if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) && !TBB && !FBB) @@ -151,10 +151,9 @@ static MachineInstr *getLastNonPseudo(MachineBasicBlock &MBB, // If there is no non-pseudo in the current block, loop back around and try // the previous block (if there is one). while ((FMBB = getBBFallenThrough(FMBB, TII))) { - for (auto I = FMBB->rbegin(), E = FMBB->rend(); I != E; ++I) { - if (!I->isPseudo()) - return &*I; - } + for (MachineInstr &I : make_range(FMBB->rbegin(), FMBB->rend())) + if (!I.isPseudo()) + return &I; } // There was no previous non-pseudo in the fallen through blocks @@ -217,8 +216,8 @@ AArch64A53Fix835769::runOnBasicBlock(MachineBasicBlock &MBB) { ++Idx; } - DEBUG(dbgs() << "Scan complete, "<< Sequences.size() - << " occurences of pattern found.\n"); + DEBUG(dbgs() << "Scan complete, " << Sequences.size() + << " occurrences of pattern found.\n"); // Then update the basic block, inserting nops between the detected sequences. 
for (auto &MI : Sequences) { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 9d6dbd6..79a84ad 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -593,7 +593,6 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, if (Change) { Substs[MO.getReg()] = Reg; MO.setReg(Reg); - MRI->setPhysRegUsed(Reg); Changed = true; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp index 716e1a3..3afcdfb 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -57,6 +57,8 @@ EnableMerge("aarch64-type-promotion-merge", cl::Hidden, " the other."), cl::init(true)); +#define AARCH64_TYPE_PROMO_NAME "AArch64 Address Type Promotion" + //===----------------------------------------------------------------------===// // AArch64AddressTypePromotion //===----------------------------------------------------------------------===// @@ -76,7 +78,7 @@ public: } const char *getPassName() const override { - return "AArch64 Address Type Promotion"; + return AARCH64_TYPE_PROMO_NAME; } /// Iterate over the functions and promote the computation of interesting @@ -143,10 +145,10 @@ private: char AArch64AddressTypePromotion::ID = 0; INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion", - "AArch64 Type Promotion Pass", false, false) + AARCH64_TYPE_PROMO_NAME, false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion", - "AArch64 Type Promotion Pass", false, false) + AARCH64_TYPE_PROMO_NAME, false, false) FunctionPass *llvm::createAArch64AddressTypePromotionPass() { return new AArch64AddressTypePromotion(); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index 18d21fd..1644d71 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -61,6 +61,12 @@ STATISTIC(NumScalarInsnsUsed, "Number of scalar instructions used"); STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted"); STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted"); +namespace llvm { +void initializeAArch64AdvSIMDScalarPass(PassRegistry &); +} + +#define AARCH64_ADVSIMD_NAME "AdvSIMD Scalar Operation Optimization" + namespace { class AArch64AdvSIMDScalar : public MachineFunctionPass { MachineRegisterInfo *MRI; @@ -82,12 +88,14 @@ private: public: static char ID; // Pass identification, replacement for typeid. 
- explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {} + explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) { + initializeAArch64AdvSIMDScalarPass(*PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &F) override; const char *getPassName() const override { - return "AdvSIMD Scalar Operation Optimization"; + return AARCH64_ADVSIMD_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -98,6 +106,9 @@ public: char AArch64AdvSIMDScalar::ID = 0; } // end anonymous namespace +INITIALIZE_PASS(AArch64AdvSIMDScalar, "aarch64-simd-scalar", + AARCH64_ADVSIMD_NAME, false, false) + static bool isGPR64(unsigned Reg, unsigned SubReg, const MachineRegisterInfo *MRI) { if (SubReg) @@ -381,7 +392,7 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { // Just check things on a one-block-at-a-time basis. for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) - if (processMachineBasicBlock(I)) + if (processMachineBasicBlock(&*I)) Changed = true; return Changed; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp index d973234..a614f55 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -45,6 +45,12 @@ BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19), STATISTIC(NumSplit, "Number of basic blocks split"); STATISTIC(NumRelaxed, "Number of conditional branches relaxed"); +namespace llvm { +void initializeAArch64BranchRelaxationPass(PassRegistry &); +} + +#define AARCH64_BR_RELAX_NAME "AArch64 branch relaxation pass" + namespace { class AArch64BranchRelaxation : public MachineFunctionPass { /// BasicBlockInfo - Information about the offset and size of a single @@ -93,17 +99,22 @@ class AArch64BranchRelaxation : public MachineFunctionPass { public: static char ID; - AArch64BranchRelaxation() : MachineFunctionPass(ID) {} + AArch64BranchRelaxation() : MachineFunctionPass(ID) { + initializeAArch64BranchRelaxationPass(*PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "AArch64 branch relaxation pass"; + return AARCH64_BR_RELAX_NAME; } }; char AArch64BranchRelaxation::ID = 0; } +INITIALIZE_PASS(AArch64BranchRelaxation, "aarch64-branch-relax", + AARCH64_BR_RELAX_NAME, false, false) + /// verify - check BBOffsets, BBSizes, alignment of islands void AArch64BranchRelaxation::verify() { #ifndef NDEBUG @@ -131,14 +142,14 @@ void AArch64BranchRelaxation::dumpBBs() { /// into the block immediately after it. static bool BBHasFallthrough(MachineBasicBlock *MBB) { // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; + MachineFunction::iterator MBBI(MBB); // Can't fall off end of function. - MachineBasicBlock *NextBB = std::next(MBBI); + auto NextBB = std::next(MBBI); if (NextBB == MBB->getParent()->end()) return false; for (MachineBasicBlock *S : MBB->successors()) - if (S == NextBB) + if (S == &*NextBB) return true; return false; @@ -216,9 +227,7 @@ AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) { // Create a new MBB for the code after the OrigBB. 
MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; - ++MBBI; - MF->insert(MBBI, NewBB); + MF->insert(++OrigBB->getIterator(), NewBB); // Splice the instructions starting with MI over to NewBB. NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end()); @@ -421,7 +430,7 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { MBB->replaceSuccessor(FBB, NewBB); NewBB->addSuccessor(FBB); } - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); + MachineBasicBlock *NextBB = &*std::next(MachineFunction::iterator(MBB)); DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() << ", invert condition and change dest. to BB#" diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h index 1e2d1c3..bc44bc5 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.h @@ -25,30 +25,28 @@ namespace { using namespace llvm; -static const uint16_t XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, - AArch64::X3, AArch64::X4, AArch64::X5, - AArch64::X6, AArch64::X7}; -static const uint16_t HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, - AArch64::H3, AArch64::H4, AArch64::H5, - AArch64::H6, AArch64::H7}; -static const uint16_t SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, - AArch64::S3, AArch64::S4, AArch64::S5, - AArch64::S6, AArch64::S7}; -static const uint16_t DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, - AArch64::D3, AArch64::D4, AArch64::D5, - AArch64::D6, AArch64::D7}; -static const uint16_t QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, - AArch64::Q3, AArch64::Q4, AArch64::Q5, - AArch64::Q6, AArch64::Q7}; +static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, + AArch64::X3, AArch64::X4, AArch64::X5, + AArch64::X6, AArch64::X7}; +static const MCPhysReg HRegList[] = {AArch64::H0, AArch64::H1, AArch64::H2, + AArch64::H3, AArch64::H4, AArch64::H5, + AArch64::H6, AArch64::H7}; +static const MCPhysReg SRegList[] = {AArch64::S0, AArch64::S1, AArch64::S2, + AArch64::S3, AArch64::S4, AArch64::S5, + AArch64::S6, AArch64::S7}; +static const MCPhysReg DRegList[] = {AArch64::D0, AArch64::D1, AArch64::D2, + AArch64::D3, AArch64::D4, AArch64::D5, + AArch64::D6, AArch64::D7}; +static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2, + AArch64::Q3, AArch64::Q4, AArch64::Q5, + AArch64::Q6, AArch64::Q7}; static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers, MVT LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State, unsigned SlotAlign) { unsigned Size = LocVT.getSizeInBits() / 8; - unsigned StackAlign = State.getMachineFunction() - .getTarget() - .getDataLayout() - ->getStackAlignment(); + unsigned StackAlign = + State.getMachineFunction().getDataLayout().getStackAlignment(); unsigned Align = std::min(ArgFlags.getOrigAlign(), StackAlign); for (auto &It : PendingMembers) { @@ -88,7 +86,7 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT, ISD::ArgFlagsTy &ArgFlags, CCState &State) { // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. 
- ArrayRef<uint16_t> RegList; + ArrayRef<MCPhysReg> RegList; if (LocVT.SimpleTy == MVT::i64) RegList = XRegList; else if (LocVT.SimpleTy == MVT::f16) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 815ebef..388d64e 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -16,7 +16,7 @@ class CCIfAlign<string Align, CCAction A> : CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>; /// CCIfBigEndian - Match only if we're in big endian mode. class CCIfBigEndian<CCAction A> : - CCIf<"State.getMachineFunction().getTarget().getDataLayout()->isBigEndian()", A>; + CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>; //===----------------------------------------------------------------------===// // ARM AAPCS64 Calling Convention @@ -279,6 +279,23 @@ def CSR_AArch64_TLS_Darwin FP, (sequence "Q%u", 0, 31))>; +// We can only handle a register pair with adjacent registers, the register pair +// should belong to the same class as well. Since the access function on the +// fast path calls a function that follows CSR_AArch64_TLS_Darwin, +// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin. +def CSR_AArch64_CXX_TLS_Darwin + : CalleeSavedRegs<(add CSR_AArch64_AAPCS, + (sub (sequence "X%u", 1, 28), X15, X16, X17, X18), + (sequence "D%u", 0, 31))>; + +// CSRs that are handled by prologue, epilogue. +def CSR_AArch64_CXX_TLS_Darwin_PE + : CalleeSavedRegs<(add LR, FP)>; + +// CSRs that are handled explicitly via copies. +def CSR_AArch64_CXX_TLS_Darwin_ViaCopy + : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>; + // The ELF stub used for TLS-descriptor access saves every feasible // register. Only X0 and LR are clobbered. def CSR_AArch64_TLS_ELF diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 06ff9af..9310ac4 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -117,10 +117,10 @@ struct LDTLSCleanup : public MachineFunctionPass { *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass); // Insert a copy from X0 to TLSBaseAddrReg for later. 
- MachineInstr *Next = I->getNextNode(); - MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - *TLSBaseAddrReg).addReg(AArch64::X0); + MachineInstr *Copy = + BuildMI(*I->getParent(), ++I->getIterator(), I->getDebugLoc(), + TII->get(TargetOpcode::COPY), *TLSBaseAddrReg) + .addReg(AArch64::X0); return Copy; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index efdb2e3..78c239b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -168,6 +168,8 @@ namespace llvm { void initializeAArch64CollectLOHPass(PassRegistry &); } +#define AARCH64_COLLECT_LOH_NAME "AArch64 Collect Linker Optimization Hint (LOH)" + namespace { struct AArch64CollectLOH : public MachineFunctionPass { static char ID; @@ -178,7 +180,7 @@ struct AArch64CollectLOH : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "AArch64 Collect Linker Optimization Hint (LOH)"; + return AARCH64_COLLECT_LOH_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -220,12 +222,10 @@ typedef SmallVector<unsigned, 32> MapIdToReg; char AArch64CollectLOH::ID = 0; INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh", - "AArch64 Collect Linker Optimization Hint (LOH)", false, - false) + AARCH64_COLLECT_LOH_NAME, false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh", - "AArch64 Collect Linker Optimization Hint (LOH)", false, - false) + AARCH64_COLLECT_LOH_NAME, false, false) /// Given a couple (MBB, reg) get the corresponding set of instruction from /// the given "sets". @@ -353,9 +353,17 @@ static void initReachingDef(const MachineFunction &MF, for (MCRegAliasIterator AI(CurReg, TRI, true); AI.isValid(); ++AI) { MapRegToId::const_iterator ItRegId = RegToId.find(*AI); - assert(ItRegId != RegToId.end() && - "Sub-register of an " - "involved register, not recorded as involved!"); + // If this alias has not been recorded, then it is not interesting + // for the current analysis. + // We can end up in this situation because of tuple registers. + // E.g., Let say we are interested in S1. When we register + // S1, we will also register its aliases and in particular + // the tuple Q1_Q2. + // Now, when we encounter Q1_Q2, we will look through its aliases + // and will find that S2 is not registered. + if (ItRegId == RegToId.end()) + continue; + BBKillSet.set(ItRegId->second); BBGen[ItRegId->second] = &MI; } @@ -523,6 +531,8 @@ static bool isCandidateStore(const MachineInstr *Instr) { switch (Instr->getOpcode()) { default: return false; + case AArch64::STRBBui: + case AArch64::STRHHui: case AArch64::STRBui: case AArch64::STRHui: case AArch64::STRWui: @@ -884,7 +894,8 @@ static void computeOthers(const InstrToInstrs &UseToDefs, bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri); // If the chain is three instructions long and ldr is the second element, // then this ldr must load form GOT, otherwise this is not a correct chain. 
- if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != AArch64II::MO_GOT) + if (L2 && !IsL2Add && + !(L2->getOperand(2).getTargetFlags() & AArch64II::MO_GOT)) continue; SmallVector<const MachineInstr *, 3> Args; MCLOHType Kind; @@ -1000,7 +1011,8 @@ static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId, DEBUG(dbgs() << "** Collect Involved Register\n"); for (const auto &MBB : MF) { for (const MachineInstr &MI : MBB) { - if (!canDefBePartOfLOH(&MI)) + if (!canDefBePartOfLOH(&MI) && + !isCandidateLoad(&MI) && !isCandidateStore(&MI)) continue; // Process defs diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index b9e41c6..fc27bfe 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -59,6 +59,7 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -153,13 +154,20 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( case AArch64::SUBSXri: // cmn is an alias for adds with a dead destination register. case AArch64::ADDSWri: - case AArch64::ADDSXri: - if (MRI->use_empty(I->getOperand(0).getReg())) - return I; - - DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); - return nullptr; - + case AArch64::ADDSXri: { + unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm()); + if (!I->getOperand(2).isImm()) { + DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n'); + return nullptr; + } else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) { + DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I << '\n'); + return nullptr; + } else if (!MRI->use_empty(I->getOperand(0).getReg())) { + DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); + return nullptr; + } + return I; + } // Prevent false positive case like: // cmp w19, #0 // cinc w0, w19, gt diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 2b0c92f..df1320f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -353,7 +353,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { MIOperands::PhysRegInfo PRI = MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI); - if (PRI.Reads) { + if (PRI.Read) { // The ccmp doesn't produce exactly the same flags as the original // compare, so reject the transform if there are uses of the flags // besides the terminators. @@ -362,7 +362,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { return nullptr; } - if (PRI.Clobbers) { + if (PRI.Defined || PRI.Clobbered) { DEBUG(dbgs() << "Not convertible compare: " << *I); ++NumUnknNZCVDefs; return nullptr; @@ -567,8 +567,8 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) { // All CmpBB instructions are moved into Head, and CmpBB is deleted. // Update the CFG first. 
updateTailPHIs(); - Head->removeSuccessor(CmpBB); - CmpBB->removeSuccessor(Tail); + Head->removeSuccessor(CmpBB, true); + CmpBB->removeSuccessor(Tail, true); Head->transferSuccessorsAndUpdatePHIs(CmpBB); DebugLoc TermDL = Head->getFirstTerminator()->getDebugLoc(); TII->RemoveBranch(*Head); @@ -786,13 +786,13 @@ void AArch64ConditionalCompares::updateDomTree( // convert() removes CmpBB which was previously dominated by Head. // CmpBB children should be transferred to Head. MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head); - for (unsigned i = 0, e = Removed.size(); i != e; ++i) { - MachineDomTreeNode *Node = DomTree->getNode(Removed[i]); + for (MachineBasicBlock *RemovedMBB : Removed) { + MachineDomTreeNode *Node = DomTree->getNode(RemovedMBB); assert(Node != HeadNode && "Cannot erase the head node"); assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head"); while (Node->getNumChildren()) DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); - DomTree->eraseNode(Removed[i]); + DomTree->eraseNode(RemovedMBB); } } @@ -801,8 +801,8 @@ void AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) { if (!Loops) return; - for (unsigned i = 0, e = Removed.size(); i != e; ++i) - Loops->removeBlock(Removed[i]); + for (MachineBasicBlock *RemovedMBB : Removed) + Loops->removeBlock(RemovedMBB); } /// Invalidate MachineTraceMetrics before if-conversion. @@ -899,7 +899,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { Loops = getAnalysisIfAvailable<MachineLoopInfo>(); Traces = &getAnalysis<MachineTraceMetrics>(); MinInstr = nullptr; - MinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); + MinSize = MF.getFunction()->optForMinSize(); bool Changed = false; CmpConv.runOnMachineFunction(MF); diff --git a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 74fc167..576cf4a 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -26,6 +26,12 @@ using namespace llvm; STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced"); +namespace llvm { +void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry &); +} + +#define AARCH64_DEAD_REG_DEF_NAME "AArch64 Dead register definitions" + namespace { class AArch64DeadRegisterDefinitions : public MachineFunctionPass { private: @@ -35,11 +41,14 @@ private: bool usesFrameIndex(const MachineInstr &MI); public: static char ID; // Pass identification, replacement for typeid. 
- explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {} + explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) { + initializeAArch64DeadRegisterDefinitionsPass( + *PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &F) override; - const char *getPassName() const override { return "Dead register definitions"; } + const char *getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -49,6 +58,9 @@ public: char AArch64DeadRegisterDefinitions::ID = 0; } // end anonymous namespace +INITIALIZE_PASS(AArch64DeadRegisterDefinitions, "aarch64-dead-defs", + AARCH64_DEAD_REG_DEF_NAME, false, false) + bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg( unsigned Reg, const MachineInstr &MI) { for (const MachineOperand &MO : MI.implicit_operands()) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index c2470f7..d24e42a 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -22,18 +22,26 @@ #include "llvm/Support/MathExtras.h" using namespace llvm; +namespace llvm { +void initializeAArch64ExpandPseudoPass(PassRegistry &); +} + +#define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass" + namespace { class AArch64ExpandPseudo : public MachineFunctionPass { public: static char ID; - AArch64ExpandPseudo() : MachineFunctionPass(ID) {} + AArch64ExpandPseudo() : MachineFunctionPass(ID) { + initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry()); + } const AArch64InstrInfo *TII; bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "AArch64 pseudo instruction expansion pass"; + return AARCH64_EXPAND_PSEUDO_NAME; } private: @@ -45,6 +53,9 @@ private: char AArch64ExpandPseudo::ID = 0; } +INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo", + AARCH64_EXPAND_PSEUDO_NAME, false, false) + /// \brief Transfer implicit operands on the pseudo instruction to the /// instructions created from the expansion. static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 0728198..0ac4b39 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -523,7 +523,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) U = C; } - if (const PointerType *Ty = dyn_cast<PointerType>(Obj->getType())) + if (auto *Ty = dyn_cast<PointerType>(Obj->getType())) if (Ty->getAddressSpace() > 255) // Fast instruction selection doesn't support the special // address spaces. @@ -969,7 +969,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { // Cannot encode an offset register and an immediate offset in the same // instruction. Fold the immediate offset into the load/store instruction and - // emit an additonal add to take care of the offset register. + // emit an additional add to take care of the offset register. 
if (!ImmediateOffsetNeedsLowering && Addr.getOffset() && Addr.getOffsetReg()) RegisterOffsetNeedsLowering = true; @@ -1058,8 +1058,8 @@ void AArch64FastISel::addLoadStoreOperands(Address &Addr, // FIXME: We shouldn't be using getObjectSize/getObjectAlignment. The size // and alignment should be based on the VT. MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI, Offset), Flags, - MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); + MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); // Now add the rest of the operands. MIB.addFrameIndex(FI).addImm(Offset); } else { @@ -1178,7 +1178,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, } // Check if the mul can be folded into the instruction. - if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (isMulPowOf2(RHS)) { const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); @@ -1193,12 +1193,16 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(MulLHS); - return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, RHSIsKill, - AArch64_AM::LSL, ShiftVal, SetFlags, WantResult); + ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, AArch64_AM::LSL, ShiftVal, SetFlags, + WantResult); + if (ResultReg) + return ResultReg; } + } // Check if the shift can be folded into the instruction. - if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (const auto *SI = dyn_cast<BinaryOperator>(RHS)) { if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { AArch64_AM::ShiftExtendType ShiftType = AArch64_AM::InvalidShiftExtend; @@ -1214,12 +1218,15 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); - return emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftType, ShiftVal, SetFlags, - WantResult); + ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftType, ShiftVal, SetFlags, + WantResult); + if (ResultReg) + return ResultReg; } } } + } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) @@ -1323,6 +1330,10 @@ unsigned AArch64FastISel::emitAddSub_rs(bool UseAdd, MVT RetVT, unsigned LHSReg, if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; + // Don't deal with undefined shifts. + if (ShiftImm >= RetVT.getSizeInBits()) + return 0; + static const unsigned OpcTable[2][2][2] = { { { AArch64::SUBWrs, AArch64::SUBXrs }, { AArch64::ADDWrs, AArch64::ADDXrs } }, @@ -1360,6 +1371,9 @@ unsigned AArch64FastISel::emitAddSub_rx(bool UseAdd, MVT RetVT, unsigned LHSReg, if (RetVT != MVT::i32 && RetVT != MVT::i64) return 0; + if (ShiftImm >= 4) + return 0; + static const unsigned OpcTable[2][2][2] = { { { AArch64::SUBWrx, AArch64::SUBXrx }, { AArch64::ADDWrx, AArch64::ADDXrx } }, @@ -1542,7 +1556,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, return ResultReg; // Check if the mul can be folded into the instruction. 
- if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (isMulPowOf2(RHS)) { const Value *MulLHS = cast<MulOperator>(RHS)->getOperand(0); const Value *MulRHS = cast<MulOperator>(RHS)->getOperand(1); @@ -1558,12 +1572,15 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(MulLHS); - return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftVal); + ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + if (ResultReg) + return ResultReg; } + } // Check if the shift can be folded into the instruction. - if (RHS->hasOneUse() && isValueAvailable(RHS)) + if (RHS->hasOneUse() && isValueAvailable(RHS)) { if (const auto *SI = dyn_cast<ShlOperator>(RHS)) if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) { uint64_t ShiftVal = C->getZExtValue(); @@ -1571,9 +1588,12 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, if (!RHSReg) return 0; bool RHSIsKill = hasTrivialKill(SI->getOperand(0)); - return emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, - RHSIsKill, ShiftVal); + ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, LHSIsKill, RHSReg, + RHSIsKill, ShiftVal); + if (ResultReg) + return ResultReg; } + } unsigned RHSReg = getRegForValue(RHS); if (!RHSReg) @@ -1646,6 +1666,11 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, { AArch64::ORRWrs, AArch64::ORRXrs }, { AArch64::EORWrs, AArch64::EORXrs } }; + + // Don't deal with undefined shifts. + if (ShiftImm >= RetVT.getSizeInBits()) + return 0; + const TargetRegisterClass *RC; unsigned Opc; switch (RetVT.SimpleTy) { @@ -2235,14 +2260,7 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) { MIB.addImm(TestBit); MIB.addMBB(TBB); - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - fastEmitBranch(FBB, DbgLoc); - + finishCondBranch(BI->getParent(), TBB, FBB); return true; } @@ -2257,7 +2275,6 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)]; MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)]; - AArch64CC::CondCode CC = AArch64CC::NE; if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) { if (CI->hasOneUse() && isValueAvailable(CI)) { // Try to optimize or fold the cmp. @@ -2289,7 +2306,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { // FCMP_UEQ and FCMP_ONE cannot be checked with a single branch // instruction. - CC = getCompareCC(Predicate); + AArch64CC::CondCode CC = getCompareCC(Predicate); AArch64CC::CondCode ExtraCC = AArch64CC::AL; switch (Predicate) { default: @@ -2317,52 +2334,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { .addImm(CC) .addMBB(TBB); - // Obtain the branch weight and add the TrueBB to the successor list. 
- uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - - fastEmitBranch(FBB, DbgLoc); - return true; - } - } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { - MVT SrcVT; - if (TI->hasOneUse() && isValueAvailable(TI) && - isTypeSupported(TI->getOperand(0)->getType(), SrcVT)) { - unsigned CondReg = getRegForValue(TI->getOperand(0)); - if (!CondReg) - return false; - bool CondIsKill = hasTrivialKill(TI->getOperand(0)); - - // Issue an extract_subreg to get the lower 32-bits. - if (SrcVT == MVT::i64) { - CondReg = fastEmitInst_extractsubreg(MVT::i32, CondReg, CondIsKill, - AArch64::sub_32); - CondIsKill = true; - } - - unsigned ANDReg = emitAnd_ri(MVT::i32, CondReg, CondIsKill, 1); - assert(ANDReg && "Unexpected AND instruction emission failure."); - emitICmp_ri(MVT::i32, ANDReg, /*IsKill=*/true, 0); - - if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { - std::swap(TBB, FBB); - CC = AArch64CC::EQ; - } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) - .addMBB(TBB); - - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - - fastEmitBranch(FBB, DbgLoc); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (const auto *CI = dyn_cast<ConstantInt>(BI->getCondition())) { @@ -2371,34 +2343,31 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B)) .addMBB(Target); - // Obtain the branch weight and add the target to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - Target->getBasicBlock()); - FuncInfo.MBB->addSuccessor(Target, BranchWeight); + // Obtain the branch probability and add the target to the successor list. + if (FuncInfo.BPI) { + auto BranchProbability = FuncInfo.BPI->getEdgeProbability( + BI->getParent(), Target->getBasicBlock()); + FuncInfo.MBB->addSuccessor(Target, BranchProbability); + } else + FuncInfo.MBB->addSuccessorWithoutProb(Target); return true; - } else if (foldXALUIntrinsic(CC, I, BI->getCondition())) { - // Fake request the condition, otherwise the intrinsic might be completely - // optimized away. - unsigned CondReg = getRegForValue(BI->getCondition()); - if (!CondReg) - return false; - - // Emit the branch. - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) - .addMBB(TBB); + } else { + AArch64CC::CondCode CC = AArch64CC::NE; + if (foldXALUIntrinsic(CC, I, BI->getCondition())) { + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned CondReg = getRegForValue(BI->getCondition()); + if (!CondReg) + return false; - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); + // Emit the branch. 
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) + .addImm(CC) + .addMBB(TBB); - fastEmitBranch(FBB, DbgLoc); - return true; + finishCondBranch(BI->getParent(), TBB, FBB); + return true; + } } unsigned CondReg = getRegForValue(BI->getCondition()); @@ -2406,32 +2375,22 @@ bool AArch64FastISel::selectBranch(const Instruction *I) { return false; bool CondRegIsKill = hasTrivialKill(BI->getCondition()); - // We've been divorced from our compare! Our block was split, and - // now our compare lives in a predecessor block. We musn't - // re-compare here, as the children of the compare aren't guaranteed - // live across the block boundary (we *could* check for this). - // Regardless, the compare has been done in the predecessor block, - // and it left a value for us in a virtual register. Ergo, we test - // the one-bit value left in the virtual register. - emitICmp_ri(MVT::i32, CondReg, CondRegIsKill, 0); - + // i1 conditions come as i32 values, test the lowest bit with tb(n)z. + unsigned Opcode = AArch64::TBNZW; if (FuncInfo.MBB->isLayoutSuccessor(TBB)) { std::swap(TBB, FBB); - CC = AArch64CC::EQ; + Opcode = AArch64::TBZW; } - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc)) - .addImm(CC) + const MCInstrDesc &II = TII.get(Opcode); + unsigned ConstrainedCondReg + = constrainOperandRegClass(II, CondReg, II.getNumDefs()); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) + .addReg(ConstrainedCondReg, getKillRegState(CondRegIsKill)) + .addImm(0) .addMBB(TBB); - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TBB, BranchWeight); - - fastEmitBranch(FBB, DbgLoc); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } @@ -2447,8 +2406,8 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(AddrReg); // Make sure the CFG is up-to-date. - for (unsigned i = 0, e = BI->getNumSuccessors(); i != e; ++i) - FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[BI->getSuccessor(i)]); + for (auto *Succ : BI->successors()) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[Succ]); return true; } @@ -2456,6 +2415,10 @@ bool AArch64FastISel::selectIndirectBr(const Instruction *I) { bool AArch64FastISel::selectCmp(const Instruction *I) { const CmpInst *CI = cast<CmpInst>(I); + // Vectors of i1 are weird: bail out. + if (CI->getType()->isVectorTy()) + return false; + // Try to optimize or fold the cmp. CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); unsigned ResultReg = 0; @@ -2954,8 +2917,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, .addImm(NumBytes); // Process the args. 
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { - CCValAssign &VA = ArgLocs[i]; + for (CCValAssign &VA : ArgLocs) { const Value *ArgVal = CLI.OutVals[VA.getValNo()]; MVT ArgVT = OutVTs[VA.getValNo()]; @@ -3018,8 +2980,8 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getStack(Addr.getOffset()), - MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); + MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()), + MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); if (!emitStore(ArgVT, ArgReg, Addr, MMO)) return false; @@ -3318,8 +3280,8 @@ bool AArch64FastISel::foldXALUIntrinsic(AArch64CC::CondCode &CC, return false; // Make sure nothing is in the way - BasicBlock::const_iterator Start = I; - BasicBlock::const_iterator End = II; + BasicBlock::const_iterator Start(I); + BasicBlock::const_iterator End(II); for (auto Itr = std::prev(Start); Itr != End; --Itr) { // We only expect extractvalue instructions between the intrinsic and the // instruction to be selected. @@ -3684,6 +3646,9 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (F.isVarArg()) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + // Build a list of return value registers. SmallVector<unsigned, 4> RetRegs; @@ -3763,8 +3728,8 @@ bool AArch64FastISel::selectRet(const Instruction *I) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::RET_ReallyLR)); - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); + for (unsigned RetReg : RetRegs) + MIB.addReg(RetReg, RegState::Implicit); return true; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index a76473f..11ae800 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -72,9 +72,9 @@ // // For most functions, some of the frame areas are empty. For those functions, // it may not be necessary to set up fp or bp: -// * A base pointer is definitly needed when there are both VLAs and local +// * A base pointer is definitely needed when there are both VLAs and local // variables with more-than-default alignment requirements. -// * A frame pointer is definitly needed when there are local variables with +// * A frame pointer is definitely needed when there are local variables with // more-than-default alignment requirements. // // In some cases when a base pointer is not strictly needed, it is generated @@ -216,11 +216,11 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( if (CSI.empty()) return; - const DataLayout *TD = MF.getTarget().getDataLayout(); + const DataLayout &TD = MF.getDataLayout(); bool HasFP = hasFP(MF); // Calculate amount of bytes used for return address storing. - int stackGrowth = -TD->getPointerSize(0); + int stackGrowth = -TD.getPointerSize(0); // Calculate offsets. int64_t saveAreaOffset = (HasFP ? 
2 : 1) * stackGrowth; @@ -280,14 +280,17 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.begin(); const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); - const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry(); bool HasFP = hasFP(MF); - DebugLoc DL = MBB.findDebugLoc(MBBI); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. @@ -354,7 +357,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (NumBytes && NeedsRealignment) { // Use the first callee-saved register as a scratch register. scratchSPReg = AArch64::X9; - MF.getRegInfo().setPhysRegUsed(scratchSPReg); } // If we're a leaf function, try using the red zone. @@ -400,8 +402,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } if (needsFrameMoves) { - const DataLayout *TD = MF.getTarget().getDataLayout(); - const int StackGrowth = -TD->getPointerSize(0); + const DataLayout &TD = MF.getDataLayout(); + const int StackGrowth = -TD.getPointerSize(0); unsigned FramePtr = RegInfo->getFrameRegister(MF); // An example of the prologue: // @@ -513,33 +515,33 @@ static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { return false; } -static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { +/// Checks whether the given instruction restores callee save registers +/// and if so returns how many. 
+static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) { unsigned RtIdx = 0; - if (MI->getOpcode() == AArch64::LDPXpost || - MI->getOpcode() == AArch64::LDPDpost) + switch (MI.getOpcode()) { + case AArch64::LDPXpost: + case AArch64::LDPDpost: RtIdx = 1; - - if (MI->getOpcode() == AArch64::LDPXpost || - MI->getOpcode() == AArch64::LDPDpost || - MI->getOpcode() == AArch64::LDPXi || MI->getOpcode() == AArch64::LDPDi) { - if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) || - !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) || - MI->getOperand(RtIdx + 2).getReg() != AArch64::SP) - return false; - return true; + // FALLTHROUGH + case AArch64::LDPXi: + case AArch64::LDPDi: + if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) || + !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) || + MI.getOperand(RtIdx + 2).getReg() != AArch64::SP) + return 0; + return 2; } - - return false; + return 0; } void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); MachineFrameInfo *MFI = MF.getFrameInfo(); - const AArch64InstrInfo *TII = - static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo()); - const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; bool IsTailCallReturn = false; if (MBB.end() != MBBI) { @@ -585,7 +587,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // ---------------------| --- | // | | | | // | CalleeSavedReg | | | - // | (NumRestores * 16) | | | + // | (NumRestores * 8) | | | // | | | | // ---------------------| | NumBytes // | | StackSize (StackAdjustUp) @@ -606,17 +608,17 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // Move past the restores of the callee-saved registers. MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); - if (LastPopI != MBB.begin()) { - do { - ++NumRestores; - --LastPopI; - } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs)); - if (!isCSRestore(LastPopI, CSRegs)) { + MachineBasicBlock::iterator Begin = MBB.begin(); + while (LastPopI != Begin) { + --LastPopI; + unsigned Restores = getNumCSRestores(*LastPopI, CSRegs); + NumRestores += Restores; + if (Restores == 0) { ++LastPopI; - --NumRestores; + break; } } - NumBytes -= NumRestores * 16; + NumBytes -= NumRestores * 8; assert(NumBytes >= 0 && "Negative stack allocation size!?"); if (!hasFP(MF)) { @@ -634,15 +636,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // be able to save any instructions. if (NumBytes || MFI->hasVarSizedObjects()) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags); -} - -/// getFrameIndexOffset - Returns the displacement from the frame register to -/// the stack frame of the specified index. 
-int AArch64FrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - unsigned FrameReg; - return getFrameIndexReference(MF, FI, FrameReg); + -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -739,9 +733,6 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( DebugLoc DL; assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - if (MI != MBB.end()) - DL = MI->getDebugLoc(); - for (unsigned i = 0; i < Count; i += 2) { unsigned idx = Count - i - 2; unsigned Reg1 = CSI[idx].getReg(); @@ -911,7 +902,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned NumFPRSpilled = 0; bool ExtraCSSpill = false; bool CanEliminateFrame = true; - DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:"); + DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); // Check pairs of consecutive callee-saved registers. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 731f031..427afdf 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -37,7 +37,6 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; int resolveFrameIndexReference(const MachineFunction &MF, int FI, @@ -61,6 +60,11 @@ public: void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; + + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override { + return true; + } }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 772e894..6c86888 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -34,7 +34,6 @@ using namespace llvm; namespace { class AArch64DAGToDAGISel : public SelectionDAGISel { - AArch64TargetMachine &TM; /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. 
@@ -45,7 +44,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { public: explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr), + : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), ForCodeSize(false) {} const char *getPassName() const override { @@ -53,9 +52,7 @@ public: } bool runOnMachineFunction(MachineFunction &MF) override { - ForCodeSize = - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || - MF.getFunction()->hasFnAttribute(Attribute::MinSize); + ForCodeSize = MF.getFunction()->optForSize(); Subtarget = &MF.getSubtarget<AArch64Subtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -79,6 +76,21 @@ public: bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) { return SelectShiftedRegister(N, true, Reg, Shift); } + bool SelectAddrModeIndexed7S8(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 1, Base, OffImm); + } + bool SelectAddrModeIndexed7S16(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 2, Base, OffImm); + } + bool SelectAddrModeIndexed7S32(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 4, Base, OffImm); + } + bool SelectAddrModeIndexed7S64(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 8, Base, OffImm); + } + bool SelectAddrModeIndexed7S128(SDValue N, SDValue &Base, SDValue &OffImm) { + return SelectAddrModeIndexed7S(N, 16, Base, OffImm); + } bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) { return SelectAddrModeIndexed(N, 1, Base, OffImm); } @@ -153,8 +165,7 @@ public: SDNode *SelectBitfieldExtractOp(SDNode *N); SDNode *SelectBitfieldInsertOp(SDNode *N); - - SDNode *SelectLIBM(SDNode *N); + SDNode *SelectBitfieldInsertInZeroOp(SDNode *N); SDNode *SelectReadRegister(SDNode *N); SDNode *SelectWriteRegister(SDNode *N); @@ -165,6 +176,8 @@ public: private: bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg, SDValue &Shift); + bool SelectAddrModeIndexed7S(SDValue N, unsigned Size, SDValue &Base, + SDValue &OffImm); bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base, SDValue &OffImm); bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base, @@ -422,7 +435,7 @@ static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) { return true; } -// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a +// Helper for SelectOpcV64LaneV128 - Recognize operations where one operand is a // high lane extract. static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, SDValue &LaneOp, int &LaneIdx) { @@ -572,7 +585,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, } // AArch64 mandates that the RHS of the operation must use the smallest - // register classs that could contain the size being extended from. Thus, + // register class that could contain the size being extended from. Thus, // if we're folding a (sext i8), we need the RHS to be a GPR32, even though // there might not be an actual 32-bit value in the program. We can // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here. @@ -587,7 +600,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, /// need to create a real ADD instruction from it anyway and there's no point in /// folding it into the mem op. 
Theoretically, it shouldn't matter, but there's /// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding -/// leads to duplaicated ADRP instructions. +/// leads to duplicated ADRP instructions. static bool isWorthFoldingADDlow(SDValue N) { for (auto Use : N->uses()) { if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE && @@ -604,6 +617,51 @@ static bool isWorthFoldingADDlow(SDValue N) { return true; } +/// SelectAddrModeIndexed7S - Select a "register plus scaled signed 7-bit +/// immediate" address. The "Size" argument is the size in bytes of the memory +/// reference, which determines the scale. +bool AArch64DAGToDAGISel::SelectAddrModeIndexed7S(SDValue N, unsigned Size, + SDValue &Base, + SDValue &OffImm) { + SDLoc dl(N); + const DataLayout &DL = CurDAG->getDataLayout(); + const TargetLowering *TLI = getTargetLowering(); + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; + } + + // As opposed to the (12-bit) Indexed addressing mode below, the 7-bit signed + // selected here doesn't support labels/immediates, only base+offset. + + if (CurDAG->isBaseWithConstantOffset(N)) { + if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + int64_t RHSC = RHS->getSExtValue(); + unsigned Scale = Log2_32(Size); + if ((RHSC & (Size - 1)) == 0 && RHSC >= -(0x40 << Scale) && + RHSC < (0x40 << Scale)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL)); + } + OffImm = CurDAG->getTargetConstant(RHSC >> Scale, dl, MVT::i64); + return true; + } + } + } + + // Base only. The address will be materialized into a register before + // the memory is accessed. + // add x0, Xbase, #offset + // stp x1, x2, [x0] + Base = N; + OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64); + return true; +} + /// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit /// immediate" address. The "Size" argument is the size in bytes of the memory /// reference, which determines the scale. @@ -867,7 +925,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, if (isa<ConstantSDNode>(RHS)) { int64_t ImmOff = (int64_t)cast<ConstantSDNode>(RHS)->getZExtValue(); unsigned Scale = Log2_32(Size); - // Skip the immediate can be seleced by load/store addressing mode. + // Skip the immediate can be selected by load/store addressing mode. // Also skip the immediate can be encoded by a single ADD (SUB is also // checked by using -ImmOff). if ((ImmOff % Size == 0 && ImmOff >= 0 && ImmOff < (0x1000 << Scale)) || @@ -1034,6 +1092,8 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { // it into an i64. DstVT = MVT::i32; } + } else if (VT == MVT::f16) { + Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost; } else if (VT == MVT::f32) { Opcode = IsPre ? 
AArch64::LDRSpre : AArch64::LDRSpost; } else if (VT == MVT::f64 || VT.is64BitVector()) { @@ -1222,8 +1282,8 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, SDValue SuperReg = SDValue(Ld, 0); EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, - AArch64::qsub3 }; + static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; for (unsigned i = 0; i < NumVecs; ++i) { SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); if (Narrow) @@ -1275,8 +1335,8 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg); } else { EVT WideVT = RegSeq.getOperand(1)->getValueType(0); - static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2, - AArch64::qsub3 }; + static const unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, + AArch64::qsub2, AArch64::qsub3 }; for (unsigned i = 0; i < NumVecs; ++i) { SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg); @@ -1420,7 +1480,7 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, // The resulting code will be at least as good as the original one // plus it may expose more opportunities for bitfield insert pattern. // FIXME: Currently we limit this to the bigger pattern, because - // some optimizations expect AND and not UBFM + // some optimizations expect AND and not UBFM. Opd0 = N->getOperand(0); } else return false; @@ -1852,6 +1912,7 @@ static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) { /// Does this tree qualify as an attempt to move a bitfield into position, /// essentially "(and (shl VAL, N), Mask)". static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, + bool BiggerPattern, SDValue &Src, int &ShiftAmount, int &MaskWidth) { EVT VT = Op.getValueType(); @@ -1874,6 +1935,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, Op = Op.getOperand(0); } + // Don't match if the SHL has more than one use, since then we'll end up + // generating SHL+UBFIZ instead of just keeping SHL+AND. + if (!BiggerPattern && !Op.hasOneUse()) + return false; + uint64_t ShlImm; if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm)) return false; @@ -1887,7 +1953,11 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, // BFI encompasses sufficiently many nodes that it's worth inserting an extra // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL - // amount. + // amount. BiggerPattern is true when this pattern is being matched for BFI, + // BiggerPattern is false when this pattern is being matched for UBFIZ, in + // which case it is not profitable to insert an extra shift. 
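To make the UBFIZ side of that trade-off concrete: when the shift amount equals the number of trailing zeros in the mask, "(and (shl VAL, N), Mask)" is exactly a bitfield insert of the low Width bits of VAL into an otherwise zero register at position N. A self-contained check of that equivalence (plain C++, not LLVM code):

#include <cassert>
#include <cstdint>

// UBFIZ semantics on a 64-bit register: place the low Width bits of Src at
// bit position Lsb and zero everything else.
static uint64_t ubfiz64(uint64_t Src, unsigned Lsb, unsigned Width) {
  uint64_t LowMask = (Width == 64) ? ~0ULL : ((1ULL << Width) - 1);
  return (Src & LowMask) << Lsb;
}

int main() {
  const unsigned N = 8, Width = 12;                 // shift and mask width
  const uint64_t Mask = ((1ULL << Width) - 1) << N; // contiguous, starts at N
  for (uint64_t Val : {0x0ULL, 0x1234ULL, 0xDEADBEEFULL, ~0ULL})
    assert(((Val << N) & Mask) == ubfiz64(Val, N, Width));
  return 0;
}

In the selection code this shows up as a UBFM whose ImmR is (RegSize - Lsb) % RegSize and whose ImmS is Width - 1, as the SelectBitfieldInsertInZeroOp hunk further below spells out.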
+ if (ShlImm - ShiftAmount != 0 && !BiggerPattern) + return false; Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount); return true; @@ -1904,7 +1974,8 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, // f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, SDValue &Src, unsigned &ImmR, - unsigned &ImmS, SelectionDAG *CurDAG) { + unsigned &ImmS, const APInt &UsefulBits, + SelectionDAG *CurDAG) { assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); // Set Opc @@ -1918,23 +1989,30 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // Because of simplify-demanded-bits in DAGCombine, involved masks may not // have the expected shape. Try to undo that. - APInt UsefulBits; - getUsefulBits(SDValue(N, 0), UsefulBits); unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); - // OR is commutative, check both possibilities (does llvm provide a - // way to do that directely, e.g., via code matcher?) - SDValue OrOpd1Val = N->getOperand(1); - SDNode *OrOpd0 = N->getOperand(0).getNode(); - SDNode *OrOpd1 = N->getOperand(1).getNode(); - for (int i = 0; i < 2; - ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) { + // OR is commutative, check all combinations of operand order and values of + // BiggerPattern, i.e. + // Opd0, Opd1, BiggerPattern=false + // Opd1, Opd0, BiggerPattern=false + // Opd0, Opd1, BiggerPattern=true + // Opd1, Opd0, BiggerPattern=true + // Several of these combinations may match, so check with BiggerPattern=false + // first since that will produce better results by matching more instructions + // and/or inserting fewer extra instructions. + for (int I = 0; I < 4; ++I) { + + bool BiggerPattern = I / 2; + SDNode *OrOpd0 = N->getOperand(I % 2).getNode(); + SDValue OrOpd1Val = N->getOperand((I + 1) % 2); + SDNode *OrOpd1 = OrOpd1Val.getNode(); + unsigned BFXOpc; int DstLSB, Width; if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS, - NumberOfIgnoredLowBits, true)) { + NumberOfIgnoredLowBits, BiggerPattern)) { // Check that the returned opcode is compatible with the pattern, // i.e., same type and zero extended (U and not S) if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) || @@ -1952,8 +2030,9 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // If the mask on the insertee is correct, we have a BFXIL operation. We // can share the ImmR and ImmS values from the already-computed UBFM. - } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src, - DstLSB, Width)) { + } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), + BiggerPattern, + Src, DstLSB, Width)) { ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); ImmS = Width - 1; } else @@ -2003,11 +2082,18 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { unsigned Opc; unsigned LSB, MSB; SDValue Opd0, Opd1; + EVT VT = N->getValueType(0); + APInt NUsefulBits; + getUsefulBits(SDValue(N, 0), NUsefulBits); + + // If all bits are not useful, just return UNDEF. 
+ if (!NUsefulBits) + return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT); - if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG)) + if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits, + CurDAG)) return nullptr; - EVT VT = N->getValueType(0); SDLoc dl(N); SDValue Ops[] = { Opd0, Opd1, @@ -2016,58 +2102,37 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { return CurDAG->SelectNodeTo(N, Opc, VT, Ops); } -SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) { +/// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the +/// equivalent of a left shift by a constant amount followed by an and masking +/// out a contiguous set of bits. +SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) { + if (N->getOpcode() != ISD::AND) + return nullptr; + EVT VT = N->getValueType(0); - unsigned Variant; unsigned Opc; - unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr }; - - if (VT == MVT::f32) { - Variant = 0; - } else if (VT == MVT::f64) { - Variant = 1; - } else - return nullptr; // Unrecognized argument type. Fall back on default codegen. - - // Pick the FRINTX variant needed to set the flags. - unsigned FRINTXOpc = FRINTXOpcs[Variant]; - - switch (N->getOpcode()) { - default: - return nullptr; // Unrecognized libm ISD node. Fall back on default codegen. - case ISD::FCEIL: { - unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr }; - Opc = FRINTPOpcs[Variant]; - break; - } - case ISD::FFLOOR: { - unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr }; - Opc = FRINTMOpcs[Variant]; - break; - } - case ISD::FTRUNC: { - unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr }; - Opc = FRINTZOpcs[Variant]; - break; - } - case ISD::FROUND: { - unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr }; - Opc = FRINTAOpcs[Variant]; - break; - } - } + if (VT == MVT::i32) + Opc = AArch64::UBFMWri; + else if (VT == MVT::i64) + Opc = AArch64::UBFMXri; + else + return nullptr; - SDLoc dl(N); - SDValue In = N->getOperand(0); - SmallVector<SDValue, 2> Ops; - Ops.push_back(In); + SDValue Op0; + int DstLSB, Width; + if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false, + Op0, DstLSB, Width)) + return nullptr; - if (!TM.Options.UnsafeFPMath) { - SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In); - Ops.push_back(SDValue(FRINTX, 1)); - } + // ImmR is the rotate right amount. + unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + // ImmS is the most significant bit of the source to be moved. + unsigned ImmS = Width - 1; - return CurDAG->getMachineNode(Opc, dl, VT, Ops); + SDLoc DL(N); + SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + return CurDAG->SelectNodeTo(N, Opc, VT, Ops); } bool @@ -2119,7 +2184,7 @@ AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, // into a single value to be used in the MRS/MSR instruction. 
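getIntOperandFromRegisterString, whose body follows, starts by splitting the register string on ':' and only attempts a numeric encoding when more than one field comes back; a single field is not an encoded register and the function returns -1. A rough standalone sketch of just that split step (the actual packing of the five fields into the MRS/MSR immediate is defined by the target and not reproduced here):

#include <sstream>
#include <string>
#include <vector>

// Illustrative only: split an "op0:op1:CRn:CRm:op2" style string into fields.
static std::vector<std::string> splitRegString(const std::string &RegString) {
  std::vector<std::string> Fields;
  std::stringstream SS(RegString);
  std::string Field;
  while (std::getline(SS, Field, ':'))
    Fields.push_back(Field);
  return Fields;
}
// splitRegString("3:3:9:13:0").size() == 5  -> encode the five fields
// splitRegString("pan").size() == 1         -> not an encoded register, return -1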
static int getIntOperandFromRegisterString(StringRef RegString) { SmallVector<StringRef, 5> Fields; - RegString.split(Fields, ":"); + RegString.split(Fields, ':'); if (Fields.size() == 1) return -1; @@ -2206,7 +2271,15 @@ SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) { assert (isa<ConstantSDNode>(N->getOperand(2)) && "Expected a constant integer expression."); uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); - return CurDAG->getMachineNode(AArch64::MSRpstate, DL, MVT::Other, + unsigned State; + if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) { + assert(Immed < 2 && "Bad imm"); + State = AArch64::MSRpstateImm1; + } else { + assert(Immed < 16 && "Bad imm"); + State = AArch64::MSRpstateImm4; + } + return CurDAG->getMachineNode(State, DL, MVT::Other, CurDAG->getTargetConstant(Reg, DL, MVT::i32), CurDAG->getTargetConstant(Immed, DL, MVT::i16), N->getOperand(0)); @@ -2279,6 +2352,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { case ISD::SRA: if (SDNode *I = SelectBitfieldExtractOp(Node)) return I; + if (SDNode *I = SelectBitfieldInsertInZeroOp(Node)) + return I; break; case ISD::OR: @@ -2802,6 +2877,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { break; } } + break; } case AArch64ISD::LD2post: { if (VT == MVT::v8i8) @@ -3214,14 +3290,6 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST); break; } - - case ISD::FCEIL: - case ISD::FFLOOR: - case ISD::FTRUNC: - case ISD::FROUND: - if (SDNode *I = SelectLIBM(Node)) - return I; - break; } // Select the default instruction diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3e8f46c..9f5beff 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -40,23 +40,6 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); -namespace { -enum AlignMode { - StrictAlign, - NoStrictAlign -}; -} - -static cl::opt<AlignMode> -Align(cl::desc("Load/store alignment support"), - cl::Hidden, cl::init(NoStrictAlign), - cl::values( - clEnumValN(StrictAlign, "aarch64-strict-align", - "Disallow all unaligned memory accesses"), - clEnumValN(NoStrictAlign, "aarch64-no-strict-align", - "Allow unaligned memory accesses"), - clEnumValEnd)); - // Place holder until extr generation is tested fully. static cl::opt<bool> EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, @@ -76,6 +59,9 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +/// Value type used for condition codes. +static const MVT MVT_CC = MVT::i32; + AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -210,11 +196,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); - // Exception handling. - // FIXME: These are guesses. Has this been defined yet? 
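The MSRpstateImm1/MSRpstateImm4 split in the SelectWriteRegister hunk above is only about how wide the PSTATE immediate may be: PAN and UAO are single-bit fields, everything else gets the usual 4-bit immediate. A minimal sketch of that choice (illustrative enum and helpers, not the LLVM definitions):

#include <cassert>
#include <cstdint>

enum class PStateField { PAN, UAO, Other };

// PAN and UAO carry a 1-bit payload in the MSR (immediate) form; the other
// accessible PSTATE fields take a 4-bit immediate.
static unsigned pstateImmWidth(PStateField F) {
  return (F == PStateField::PAN || F == PStateField::UAO) ? 1 : 4;
}

static void checkPStateImm(PStateField F, uint64_t Imm) {
  assert(Imm < (1ULL << pstateImmWidth(F)) && "PSTATE immediate out of range");
}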
- setExceptionPointerRegister(AArch64::X0); - setExceptionSelectorRegister(AArch64::X1); - // Constant pool entries setOperationAction(ISD::ConstantPool, MVT::i64, Custom); @@ -234,6 +215,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + } // AArch64 doesn't have {U|S}MUL_LOHI. setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); @@ -252,6 +237,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); @@ -315,6 +304,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::f16, Promote); setOperationAction(ISD::FMINNUM, MVT::f16, Promote); setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); + setOperationAction(ISD::FMINNAN, MVT::f16, Promote); + setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); // v4f16 is also a storage-only type, so promote it to v4f32 when that is // known to be safe. @@ -403,10 +394,19 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FRINT, Ty, Legal); setOperationAction(ISD::FTRUNC, Ty, Legal); setOperationAction(ISD::FROUND, Ty, Legal); + setOperationAction(ISD::FMINNUM, Ty, Legal); + setOperationAction(ISD::FMAXNUM, Ty, Legal); + setOperationAction(ISD::FMINNAN, Ty, Legal); + setOperationAction(ISD::FMAXNAN, Ty, Legal); } setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. + // This requires the Performance Monitors extension. + if (Subtarget->hasPerfMon()) + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); + if (Subtarget->isTargetMachO()) { // For iOS, we don't want to the normal expansion of a libcall to // sincos. We want to issue a libcall to __sincos_stret to avoid memory @@ -456,12 +456,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setIndexedLoadAction(im, MVT::i64, Legal); setIndexedLoadAction(im, MVT::f64, Legal); setIndexedLoadAction(im, MVT::f32, Legal); + setIndexedLoadAction(im, MVT::f16, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); setIndexedStoreAction(im, MVT::i64, Legal); setIndexedStoreAction(im, MVT::f64, Legal); setIndexedStoreAction(im, MVT::f32, Legal); + setIndexedStoreAction(im, MVT::f16, Legal); } // Trap. 
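The new FMINNAN/FMAXNAN handling above is purely about NaN behaviour: the *NUM nodes follow IEEE minNum/maxNum and return the numeric operand when the other one is a NaN, while the *NAN nodes propagate the NaN, matching what the plain AArch64 FMIN/FMAX instructions do. A small standalone illustration of the two semantics:

#include <cassert>
#include <cmath>
#include <limits>

// minNum semantics (ISD::FMINNUM): a quiet NaN acts like missing data, so the
// numeric operand wins. std::fmin already behaves this way.
static double minNum(double a, double b) { return std::fmin(a, b); }

// NaN-propagating semantics (ISD::FMINNAN): any NaN operand poisons the result.
static double minNaNProp(double a, double b) {
  if (std::isnan(a) || std::isnan(b))
    return std::numeric_limits<double>::quiet_NaN();
  return a < b ? a : b;
}

int main() {
  double nan = std::nan("");
  assert(minNum(nan, 1.0) == 1.0);          // NaN is ignored
  assert(std::isnan(minNaNProp(nan, 1.0))); // NaN is propagated
  return 0;
}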
@@ -479,6 +481,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); + setTargetDAGCombine(ISD::FP_TO_SINT); + setTargetDAGCombine(ISD::FP_TO_UINT); + setTargetDAGCombine(ISD::FDIV); + setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::ANY_EXTEND); @@ -487,16 +493,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); + if (Subtarget->supportsAddressTopByteIgnored()) + setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::VSELECT); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; @@ -512,10 +520,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setMinFunctionAlignment(2); - RequireStrictAlign = (Align == StrictAlign); - setHasExtractBitsInsn(true); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + if (Subtarget->hasNEON()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: @@ -646,6 +654,9 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + + // But we do support custom-lowering for FCOPYSIGN. + setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom); } setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); @@ -686,6 +697,12 @@ void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT.getSimpleVT(), Legal); + // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!). + if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16) + for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, + ISD::FMINNUM, ISD::FMAXNUM}) + setOperationAction(Opcode, VT.getSimpleVT(), Legal); + if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { @@ -730,7 +747,7 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( break; } case ISD::INTRINSIC_W_CHAIN: { - ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); + ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1)); Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); switch (IntID) { default: return; @@ -780,6 +797,34 @@ MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, return MVT::i64; } +bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, + unsigned AddrSpace, + unsigned Align, + bool *Fast) const { + if (Subtarget->requiresStrictAlign()) + return false; + + // FIXME: This is mostly true for Cyclone, but not necessarily others. + if (Fast) { + // FIXME: Define an attribute for slow unaligned accesses instead of + // relying on the CPU type as a proxy. + // On Cyclone, unaligned 128-bit stores are slow. 
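One user-visible effect of the READCYCLECOUNTER change above: with the PerfMon feature present, llvm.readcyclecounter can be selected as a single read of PMCCNTR_EL0 rather than falling back to the generic expansion. From C or C++ the intrinsic is reached through a clang builtin, assuming user-space access to the counter has been enabled by the kernel:

#include <cstdint>

// Clang-only: __builtin_readcyclecounter() lowers to llvm.readcyclecounter,
// which this patch teaches the AArch64 backend to select via PMCCNTR_EL0
// when built with a +perfmon subtarget.
static inline uint64_t cycleCount() {
  return __builtin_readcyclecounter();
}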
+ *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 || + // See comments in performSTORECombine() for more details about + // these conditions. + + // Code that uses clang vector extensions can mark that it + // wants unaligned accesses to be treated as fast by + // underspecifying alignment to be 1 or 2. + Align <= 2 || + + // Disregard v2i64. Memcpy lowering produces those and splitting + // them regresses performance on micro-benchmarks and olden/bh. + VT == MVT::v2i64; + } + return true; +} + FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -809,9 +854,10 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; + case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; + case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; + case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; - case AArch64ISD::FMIN: return "AArch64ISD::FMIN"; - case AArch64ISD::FMAX: return "AArch64ISD::FMAX"; case AArch64ISD::DUP: return "AArch64ISD::DUP"; case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; @@ -931,8 +977,7 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI->getDebugLoc(); - MachineFunction::iterator It = MBB; - ++It; + MachineFunction::iterator It = ++MBB->getIterator(); unsigned DestReg = MI->getOperand(0).getReg(); unsigned IfTrueReg = MI->getOperand(1).getReg(); @@ -1141,8 +1186,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // register to WZR/XZR if it ends up being unused. unsigned Opcode = AArch64ISD::SUBS; - if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) && - cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 && + if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags @@ -1156,8 +1200,7 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, // the absence of information about op2. Opcode = AArch64ISD::ADDS; RHS = RHS.getOperand(1); - } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->getZExtValue() == 0 && + } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { // Similarly, (CMP (and X, Y), 0) can be implemented with a TST // (a.k.a. ANDS) except that the flags are only guaranteed to work for one @@ -1167,14 +1210,230 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, LHS = LHS.getOperand(0); } - return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) + return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) .getValue(1); } +/// \defgroup AArch64CCMP CMP;CCMP matching +/// +/// These functions deal with the formation of CMP;CCMP;... sequences. +/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of +/// a comparison. They set the NZCV flags to a predefined value if their +/// predicate is false. 
This allows us to express arbitrary conjunctions, for +/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B))))" +/// expressed as: +/// cmp A +/// ccmp B, inv(CB), CA +/// check for CB flags +/// +/// In general we can create code for arbitrary "... (and (and A B) C)" +/// sequences. We can also implement some "or" expressions, because "(or A B)" +/// is equivalent to "not (and (not A) (not B))" and we can implement some +/// negation operations: +/// We can negate the results of a single comparison by inverting the flags +/// used when the predicate fails and inverting the flags tested in the next +/// instruction; we can also negate the results of the whole previous +/// conditional compare sequence by inverting the flags tested in the next +/// instruction. However, there is no way to negate the result of a partial +/// sequence. +/// +/// Therefore, on encountering an "or" expression we can negate the subtree on +/// one side and have to be able to push the negate to the leaves of the subtree +/// on the other side (see also the comments in code). As a complete example: +/// "or (or (setCA (cmp A)) (setCB (cmp B))) +/// (and (setCC (cmp C)) (setCD (cmp D)))" +/// is transformed to +/// "not (and (not (and (setCC (cmp C)) (setCC (cmp D)))) +/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))" +/// and implemented as: +/// cmp C +/// ccmp D, inv(CD), CC +/// ccmp A, CA, inv(CD) +/// ccmp B, CB, inv(CA) +/// check for CB flags +/// A counterexample is "or (and A B) (and C D)" which cannot be implemented +/// by conditional compare sequences. +/// @{ + +/// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate. +static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, + ISD::CondCode CC, SDValue CCOp, + SDValue Condition, unsigned NZCV, + SDLoc DL, SelectionDAG &DAG) { + unsigned Opcode = 0; + if (LHS.getValueType().isFloatingPoint()) + Opcode = AArch64ISD::FCCMP; + else if (RHS.getOpcode() == ISD::SUB) { + SDValue SubOp0 = RHS.getOperand(0); + if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { + // See emitComparison() on why we can only do this for SETEQ and SETNE. + Opcode = AArch64ISD::CCMN; + RHS = RHS.getOperand(1); + } + } + if (Opcode == 0) + Opcode = AArch64ISD::CCMP; + + SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); + return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); +} + +/// Returns true if @p Val is a tree of AND/OR/SETCC operations. +/// CanPushNegate is set to true if we can push a negate operation through +/// the tree in a way that we are left with AND operations and negate operations +/// at the leaves only, i.e. "not (or (or x y) z)" can be changed to +/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be +/// brought into such a form. +static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, + unsigned Depth = 0) { + if (!Val.hasOneUse()) + return false; + unsigned Opcode = Val->getOpcode(); + if (Opcode == ISD::SETCC) { + CanPushNegate = true; + return true; + } + // Protect against stack overflow. 
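The De Morgan rewrite that the block comment above leans on is easy to verify exhaustively; here is a tiny standalone check of the worked example (plain C++, not part of the patch):

#include <cassert>

int main() {
  // "or (or A B) (and C D)" versus the negated-AND form it is rewritten to:
  // "not (and (not (and C D)) (and (not A) (not B)))".
  for (int m = 0; m < 16; ++m) {
    bool A = m & 1, B = m & 2, C = m & 4, D = m & 8;
    bool direct    = (A || B) || (C && D);
    bool rewritten = !(!(C && D) && (!A && !B));
    assert(direct == rewritten);
  }
  return 0;
}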
+ if (Depth > 15) + return false; + if (Opcode == ISD::AND || Opcode == ISD::OR) { + SDValue O0 = Val->getOperand(0); + SDValue O1 = Val->getOperand(1); + bool CanPushNegateL; + if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1)) + return false; + bool CanPushNegateR; + if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1)) + return false; + // We cannot push a negate through an AND operation (it would become an OR), + // but we can change a (not (or x y)) to (and (not x) (not y)) if we can + // push the negate through the x/y subtrees. + CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR; + return true; + } + return false; +} + +/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain +/// of CCMP/FCCMP ops. See @ref AArch64CCMP. +/// Tries to transform the given i1 producing node @p Val to a series of compare +/// and conditional compare operations. @returns an NZCV flags producing node +/// and sets @p OutCC to the flags that should be tested or returns SDValue() if +/// the transformation was not possible. +/// On recursive invocations @p PushNegate may be set to true to have negation +/// effects pushed to the tree leaves; @p Predicate is an NZCV flag predicate +/// for the comparisons in the current subtree; @p Depth limits the search +/// depth to avoid stack overflow. +static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC, bool PushNegate = false, + SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL, + unsigned Depth = 0) { + // We're at a tree leaf; produce a conditional comparison operation. + unsigned Opcode = Val->getOpcode(); + if (Opcode == ISD::SETCC) { + SDValue LHS = Val->getOperand(0); + SDValue RHS = Val->getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); + bool isInteger = LHS.getValueType().isInteger(); + if (PushNegate) + CC = getSetCCInverse(CC, isInteger); + SDLoc DL(Val); + // Determine OutCC and handle FP special case. + if (isInteger) { + OutCC = changeIntCCToAArch64CC(CC); + } else { + assert(LHS.getValueType().isFloatingPoint()); + AArch64CC::CondCode ExtraCC; + changeFPCCToAArch64CC(CC, OutCC, ExtraCC); + // Surprisingly, some floating point conditions can't be tested with a + // single condition code. Construct an additional comparison in this case. + // See comment below on how we deal with OR conditions. + if (ExtraCC != AArch64CC::AL) { + SDValue ExtraCmp; + if (!CCOp.getNode()) + ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); + else { + SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); + // Note that we want the inverse of ExtraCC, so NZCV is not inverted. + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC); + ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, + NZCV, DL, DAG); + } + CCOp = ExtraCmp; + Predicate = AArch64CC::getInvertedCondCode(ExtraCC); + OutCC = AArch64CC::getInvertedCondCode(OutCC); + } + } + + // Produce a normal comparison if we are first in the chain. + if (!CCOp.getNode()) + return emitComparison(LHS, RHS, CC, DL, DAG); + // Otherwise produce a ccmp. 
+ SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); + AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); + return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, + DAG); + } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse()) + return SDValue(); + + assert((Opcode == ISD::OR || !PushNegate) + && "Can only push negate through OR operation"); + + // Check if both sides can be transformed. + SDValue LHS = Val->getOperand(0); + SDValue RHS = Val->getOperand(1); + bool CanPushNegateL; + if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1)) + return SDValue(); + bool CanPushNegateR; + if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1)) + return SDValue(); + + // Do we need to negate our operands? + bool NegateOperands = Opcode == ISD::OR; + // We can negate the results of all previous operations by inverting the + // predicate flags, giving us a free negation for one side. For the other side + // we need to be able to push the negation to the leaves of the tree. + if (NegateOperands) { + if (!CanPushNegateL && !CanPushNegateR) + return SDValue(); + // Order the side where we can push the negate through to LHS. + if (!CanPushNegateL && CanPushNegateR) + std::swap(LHS, RHS); + } else { + bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; + bool NeedsNegOutR = RHS->getOpcode() == ISD::OR; + if (NeedsNegOutL && NeedsNegOutR) + return SDValue(); + // Order the side where we need to negate the output flags to RHS so it + // gets emitted first. + if (NeedsNegOutL) + std::swap(LHS, RHS); + } + + // Emit RHS. If we want to negate the tree we only need to push a negate + // through if we are already in a PushNegate case; otherwise we can negate + // the "flags to test" afterwards. + AArch64CC::CondCode RHSCC; + SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate, + CCOp, Predicate, Depth+1); + if (NegateOperands && !PushNegate) + RHSCC = AArch64CC::getInvertedCondCode(RHSCC); + // Emit LHS. We must push the negate through if we need to negate it. + SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands, + CmpR, RHSCC, Depth+1); + // If we transformed an OR to an AND, then we have to negate the result + // (or absorb a PushNegate resulting in a double negation). + if (Opcode == ISD::OR && !PushNegate) + OutCC = AArch64CC::getInvertedCondCode(OutCC); + return CmpL; +} + +/// @} + static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { - SDValue Cmp; - AArch64CC::CondCode AArch64CC; if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); @@ -1229,47 +1488,56 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, } } } - // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. - // For the i8 operand, the largest immediate is 255, so this can be easily - // encoded in the compare instruction. For the i16 operand, however, the - // largest immediate cannot be encoded in the compare. - // Therefore, use a sign extending load and cmn to avoid materializing the -1 - // constant. For example, - // movz w1, #65535 - // ldrh w0, [x0, #0] - // cmp w0, w1 - // > - // ldrsh w0, [x0, #0] - // cmn w0, #1 - // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) - // if and only if (sext LHS) == (sext RHS). 
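The property stated just above, that (zext LHS) == (zext RHS) exactly when (sext LHS) == (sext RHS), is what justifies trading the materialized 65535 for a sign-extending load compared against -1: equality is preserved under either extension, and -1 is encodable through CMN while 65535 is not a legal arithmetic immediate. A brute-force standalone check of the i16 case:

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t RHS = 0xFFFF; // the constant from the example above
  for (uint32_t a = 0; a <= 0xFFFF; ++a) {
    bool ZExtEq = (uint32_t)(uint16_t)a == (uint32_t)RHS;
    bool SExtEq = (int32_t)(int16_t)a == (int32_t)(int16_t)RHS;
    assert(ZExtEq == SExtEq); // equality is preserved under either extension
  }
  return 0;
}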
The checks are in place to ensure - // both the LHS and RHS are truely zero extended and to make sure the - // transformation is profitable. + SDValue Cmp; + AArch64CC::CondCode AArch64CC; if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { - if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) && - isa<LoadSDNode>(LHS)) { - if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && - cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && - LHS.getNode()->hasNUsesOfValue(1, 0)) { - int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); - if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { - SDValue SExt = - DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, - DAG.getValueType(MVT::i16)); - Cmp = emitComparison(SExt, - DAG.getConstant(ValueofRHS, dl, - RHS.getValueType()), - CC, dl, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); - AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32); - return Cmp; - } + const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS); + + // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. + // For the i8 operand, the largest immediate is 255, so this can be easily + // encoded in the compare instruction. For the i16 operand, however, the + // largest immediate cannot be encoded in the compare. + // Therefore, use a sign extending load and cmn to avoid materializing the + // -1 constant. For example, + // movz w1, #65535 + // ldrh w0, [x0, #0] + // cmp w0, w1 + // > + // ldrsh w0, [x0, #0] + // cmn w0, #1 + // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS) + // if and only if (sext LHS) == (sext RHS). The checks are in place to + // ensure both the LHS and RHS are truly zero extended and to make sure the + // transformation is profitable. + if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) && + cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && + cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && + LHS.getNode()->hasNUsesOfValue(1, 0)) { + int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); + if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { + SDValue SExt = + DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, + DAG.getValueType(MVT::i16)); + Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, + RHS.getValueType()), + CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); + } } + + if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { + if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) { + if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) + AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); + } + } + } + + if (!Cmp) { + Cmp = emitComparison(LHS, RHS, CC, dl, DAG); + AArch64CC = changeIntCCToAArch64CC(CC); } - Cmp = emitComparison(LHS, RHS, CC, dl, DAG); - AArch64CC = changeIntCCToAArch64CC(CC); - AArch64cc = DAG.getConstant(AArch64CC, dl, MVT::i32); + AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); return Cmp; } @@ -1391,8 +1659,7 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, - SDLoc(Op)).first; + return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; } static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { @@ -1571,8 +1838,8 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, // precise. 
That doesn't take part in the LibCall so we can't directly use // LowerF128Call. SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; } static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { @@ -1581,6 +1848,16 @@ static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { // in the cost tables. EVT InVT = Op.getOperand(0).getValueType(); EVT VT = Op.getValueType(); + unsigned NumElts = InVT.getVectorNumElements(); + + // f16 vectors are promoted to f32 before a conversion. + if (InVT.getVectorElementType() == MVT::f16) { + MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); + SDLoc dl(Op); + return DAG.getNode( + Op.getOpcode(), dl, Op.getValueType(), + DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); + } if (VT.getSizeInBits() < InVT.getSizeInBits()) { SDLoc dl(Op); @@ -1628,8 +1905,7 @@ SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end()); - return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, - SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { @@ -1931,6 +2207,31 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } +SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + SDLoc dl(Op); + switch (IntNo) { + default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::aarch64_thread_pointer: { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); + } + case Intrinsic::aarch64_neon_smax: + return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_umax: + return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_smin: + return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_neon_umin: + return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } +} + SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -2032,14 +2333,11 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerFSINCOS(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::INTRINSIC_WO_CHAIN: + return LowerINTRINSIC_WO_CHAIN(Op, DAG); } } -/// getFunctionAlignment - Return the Log2 alignment of this function. 
-unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const { - return 2; -} - //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// @@ -2214,9 +2512,10 @@ SDValue AArch64TargetLowering::LowerFormalArguments( break; } - ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - MemVT, false, false, false, 0); + ArgValue = DAG.getExtLoad( + ExtType, DL, VA.getLocVT(), Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + MemVT, false, false, false, 0); InVals.push_back(ArgValue); } @@ -2289,9 +2588,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 8), false, false, 0); + SDValue Store = DAG.getStore( + Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, + false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2318,9 +2618,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); - SDValue Store = - DAG.getStore(Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(i * 16), false, false, 0); + SDValue Store = DAG.getStore( + Val.getValue(1), DL, Val, FIN, + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), + false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(16, DL, PtrVT)); @@ -2453,8 +2754,8 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); - for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) - if (!ArgLocs[i].isRegLoc()) + for (const CCValAssign &ArgLoc : ArgLocs) + if (!ArgLoc.isRegLoc()) return false; } @@ -2758,7 +3059,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); DstAddr = DAG.getFrameIndex(FI, PtrVT); - DstInfo = MachinePointerInfo::getFixedStack(FI); + DstInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Make sure any stack arguments overlapping with where we're storing // are loaded before this eventual operation. Otherwise they'll be @@ -2768,7 +3070,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); - DstInfo = MachinePointerInfo::getStack(LocMemOffset); + DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), + LocMemOffset); } if (Outs[i].Flags.isByVal()) { @@ -2802,9 +3105,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. 
SDValue InFlag; - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { - Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, - RegsToPass[i].second, InFlag); + for (auto &RegToPass : RegsToPass) { + Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, + RegToPass.second, InFlag); InFlag = Chain.getValue(1); } @@ -2860,9 +3163,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Add argument registers to the end of the list so that they are known live // into the call. - for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) - Ops.push_back(DAG.getRegister(RegsToPass[i].first, - RegsToPass[i].second.getValueType())); + for (auto &RegToPass : RegsToPass) + Ops.push_back(DAG.getRegister(RegToPass.first, + RegToPass.second.getValueType())); // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; @@ -2968,6 +3271,19 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (AArch64::GPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i64)); + else if (AArch64::FPR64RegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } RetOps[0] = Chain; // Update chain. @@ -3010,11 +3326,12 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr, - MachinePointerInfo::getConstantPool(), - /*isVolatile=*/ false, - /*isNonTemporal=*/ true, - /*isInvariant=*/ true, 8); + SDValue GlobalAddr = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), PoolAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + /*isVolatile=*/false, + /*isNonTemporal=*/true, + /*isInvariant=*/true, 8); if (GN->getOffset() != 0) return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, DAG.getConstant(GN->getOffset(), DL, PtrVT)); @@ -3087,8 +3404,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = - DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), - false, true, true, 8); + DAG.getLoad(MVT::i64, DL, Chain, DescAddr, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, + true, true, 8); Chain = FuncTLVGet.getValue(1); MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -3160,6 +3478,10 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); + + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + if (!EnableAArch64ELFLocalDynamicTLSGeneration) { if (Model == TLSModel::LocalDynamic) Model = TLSModel::GeneralDynamic; @@ -3277,8 +3599,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. 
unsigned Opc = LHS.getOpcode(); - if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->isOne() && + if (LHS.getResNo() == 1 && isOneConstant(RHS) && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && @@ -3392,17 +3713,11 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SDValue In1 = Op.getOperand(0); SDValue In2 = Op.getOperand(1); EVT SrcVT = In2.getValueType(); - if (SrcVT != VT) { - if (SrcVT == MVT::f32 && VT == MVT::f64) - In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); - else if (SrcVT == MVT::f64 && VT == MVT::f32) - In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, - DAG.getIntPtrConstant(0, DL)); - else - // FIXME: Src type is different, bail out for now. Can VT really be a - // vector type? - return SDValue(); - } + + if (SrcVT.bitsLT(VT)) + In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); + else if (SrcVT.bitsGT(VT)) + In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); EVT VecVT; EVT EltVT; @@ -3410,7 +3725,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SDValue VecVal1, VecVal2; if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { EltVT = MVT::i32; - VecVT = MVT::v4i32; + VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); EltMask = 0x80000000ULL; if (!VT.isVector()) { @@ -3571,32 +3886,6 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { } } -/// A SELECT_CC operation is really some kind of max or min if both values being -/// compared are, in some sense, equal to the results in either case. However, -/// it is permissible to compare f32 values and produce directly extended f64 -/// values. -/// -/// Extending the comparison operands would also be allowed, but is less likely -/// to happen in practice since their use is right here. Note that truncate -/// operations would *not* be semantically equivalent. -static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { - if (Cmp == Result) - return (Cmp.getValueType() == MVT::f32 || - Cmp.getValueType() == MVT::f64); - - ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp); - ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result); - if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && - Result.getValueType() == MVT::f64) { - bool Lossy; - APFloat CmpVal = CCmp->getValueAPF(); - CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); - return CResult->getValueAPF().bitwiseIsEqual(CmpVal); - } - - return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; -} - SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, SDLoc dl, @@ -3614,7 +3903,13 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } } - // Handle integers first. + // Also handle f16, for which we need to do a f32 comparison. + if (LHS.getValueType() == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); + } + + // Next, handle integers. 
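The TVal/FVal swaps in the integer path that follows rest on the semantics of the conditional instructions: CSINV computes cond ? n : ~m and CSNEG computes cond ? n : -m, so a select whose true arm is a bitwise NOT (or a negation from zero) becomes matchable once the arms are swapped and the condition inverted. A standalone check of the CSINV case (illustrative helper, not LLVM code):

#include <cassert>
#include <cstdint>

// CSINV semantics: cond ? n : ~m.
static uint64_t csinv(bool cond, uint64_t n, uint64_t m) {
  return cond ? n : ~m;
}

int main() {
  // select(cc, ~x, f) has the inverted operand in the "true" arm, which CSINV
  // cannot express directly; after swapping the arms and inverting the
  // condition it can:  select(cc, ~x, f) == csinv(!cc, f, x)
  const uint64_t x = 0x123456789abcdef0ULL, f = 42;
  for (bool cc : {false, true}) {
    uint64_t direct = cc ? ~x : f;
    assert(direct == csinv(!cc, f, x));
  }
  return 0;
}

The CSNEG case in the same hunk is the analogous identity with negation in place of bitwise NOT.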
if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); @@ -3637,9 +3932,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } else if (TVal.getOpcode() == ISD::XOR) { // If TVal is a NOT we want to swap TVal and FVal so that we can match // with a CSINV rather than a CSEL. - ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1)); - - if (CVal && CVal->isAllOnesValue()) { + if (isAllOnesConstant(TVal.getOperand(1))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); @@ -3647,9 +3940,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, } else if (TVal.getOpcode() == ISD::SUB) { // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so // that we can match with a CSNEG rather than a CSEL. - ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0)); - - if (CVal && CVal->isNullValue()) { + if (isNullConstant(TVal.getOperand(0))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); @@ -4109,46 +4400,57 @@ SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + HiBitsForLo = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + HiBitsForLo, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); + SDValue LoForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); - SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); - SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); // AArch64 shifts larger than the register width are wrapped rather than // clamped, so we can't just emit "hi >> x". - SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); - SDValue TrueValHi = Opc == ISD::SRA - ? 
DAG.getNode(Opc, dl, VT, ShOpHi, - DAG.getConstant(VTBits - 1, dl, - MVT::i64)) - : DAG.getConstant(0, dl, VT); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); + SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); + SDValue HiForBigShift = + Opc == ISD::SRA + ? DAG.getNode(Opc, dl, VT, ShOpHi, + DAG.getConstant(VTBits - 1, dl, MVT::i64)) + : DAG.getConstant(0, dl, VT); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } + /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); @@ -4156,31 +4458,41 @@ SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); - SDValue ARMcc; assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); - SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); + + // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which + // is "undef". We wanted 0, so CSEL it directly. + SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), + ISD::SETEQ, dl, DAG); + SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); + LoBitsForHi = + DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), + LoBitsForHi, CCVal, Cmp); + SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); - SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); - SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); + SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); + SDValue HiForNormalShift = + DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); - SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); + SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); - SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), - ISD::SETGE, dl, DAG); - SDValue CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); - SDValue Hi = - DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); + Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, + dl, DAG); + CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); + SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, + HiForNormalShift, CCVal, Cmp); // AArch64 shifts of larger than register sizes are wrapped rather than // clamped, so we can't just emit "lo << a" if a is too big. 
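Putting this SHL_PARTS lowering together: the CSEL chain computes the ordinary double-word shift, with two cases carved out explicitly, a shift amount of exactly zero (where the "SRL ShOpLo, 64" above would be undefined, hence the CSEL against zero) and shift amounts of 64 or more (where the low half of the result is zero). A standalone C++ model of the same computation, assuming 64-bit halves:

#include <cassert>
#include <cstdint>
#include <utility>

// Shift a 128-bit value held as {lo, hi} left by amt (0 <= amt < 128) without
// ever shifting a 64-bit quantity by 64 or more.
static std::pair<uint64_t, uint64_t> shl128(uint64_t lo, uint64_t hi,
                                            unsigned amt) {
  assert(amt < 128);
  if (amt >= 64)                // "big shift": the low half becomes zero
    return {0, lo << (amt - 64)};
  if (amt == 0)                 // avoid the undefined lo >> 64
    return {lo, hi};
  uint64_t loBitsForHi = lo >> (64 - amt);
  return {lo << amt, (hi << amt) | loBitsForHi};
}

int main() {
  // Spot-check against native 128-bit arithmetic (a clang/gcc extension).
  unsigned __int128 v = ((unsigned __int128)0x0123456789abcdefULL << 64) |
                        0xfedcba9876543210ULL;
  for (unsigned amt : {0u, 1u, 7u, 63u, 64u, 65u, 127u}) {
    unsigned __int128 ref = v << amt;
    auto r = shl128((uint64_t)v, (uint64_t)(v >> 64), amt);
    assert(r.first == (uint64_t)ref && r.second == (uint64_t)(ref >> 64));
  }
  return 0;
}

The SRL_PARTS and SRA_PARTS lowering earlier in this hunk is the mirror image, with the extra wrinkle that the arithmetic variant fills the high half with copies of the sign bit for large shift amounts.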
- SDValue TrueValLo = DAG.getConstant(0, dl, VT); - SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); - SDValue Lo = - DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); + SDValue LoForBigShift = DAG.getConstant(0, dl, VT); + SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); + SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, + LoForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); @@ -4362,8 +4674,7 @@ void AArch64TargetLowering::LowerAsmOperandForConstraint( // Validate and return a target constant for them if we can. case 'z': { // 'z' maps to xzr or wzr so it needs an input of 0. - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); - if (!C || C->getZExtValue() != 0) + if (!isNullConstant(Op)) return; if (Op.getValueType() == MVT::i64) @@ -5653,11 +5964,10 @@ static SDValue NormalizeBuildVector(SDValue Op, return Op; SmallVector<SDValue, 16> Ops; - for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) { - SDValue Lane = Op.getOperand(I); - if (Lane.getOpcode() == ISD::Constant) { + for (SDValue Lane : Op->ops()) { + if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) { APInt LowBits(EltTy.getSizeInBits(), - cast<ConstantSDNode>(Lane)->getZExtValue()); + CstLane->getZExtValue()); Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); } Ops.push_back(Lane); @@ -5997,8 +6307,7 @@ FailedModImm: // Empirical tests suggest this is rarely worth it for vectors of length <= 2. if (NumElts >= 4) { - SDValue shuffle = ReconstructShuffle(Op, DAG); - if (shuffle != SDValue()) + if (SDValue shuffle = ReconstructShuffle(Op, DAG)) return shuffle; } @@ -6017,7 +6326,10 @@ FailedModImm: // a) Avoid a RMW dependency on the full vector register, and // b) Allow the register coalescer to fold away the copy if the // value is already in an S or D register. - if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { + // Do not do this for UNDEF/LOAD nodes because we have better patterns + // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. + if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD && + (ElemSize == 32 || ElemSize == 64)) { unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; MachineSDNode *N = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, @@ -6123,24 +6435,11 @@ SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, unsigned Val = Cst->getZExtValue(); unsigned Size = Op.getValueType().getSizeInBits(); - if (Val == 0) { - switch (Size) { - case 8: - return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 16: - return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(), - Op.getOperand(0)); - case 32: - return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(), - Op.getOperand(0)); - case 64: - return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(), - Op.getOperand(0)); - default: - llvm_unreachable("Unexpected vector type in extract_subvector!"); - } - } + + // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. + if (Val == 0) + return Op; + // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) @@ -6213,26 +6512,20 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { /// 0 <= Value <= ElementBits for a long left shift. 
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); } /// isVShiftRImm - Check if this is a valid build_vector for the immediate -/// operand of a vector shift right operation. For a shift opcode, the value -/// is positive, but for an intrinsic the value count must be negative. The -/// absolute value must be in the range: -/// 1 <= |Value| <= ElementBits for a right shift; or -/// 1 <= |Value| <= ElementBits/2 for a narrow right shift. -static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, - int64_t &Cnt) { +/// operand of a vector shift right operation. The value must be in the range: +/// 1 <= Value <= ElementBits for a right shift; or +static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; - if (isIntrinsic) - Cnt = -Cnt; return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); } @@ -6261,8 +6554,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, case ISD::SRA: case ISD::SRL: // Right shift immediate - if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) && - Cnt < EltSize) { + if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { unsigned Opc = (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; return DAG.getNode(Opc, DL, VT, Op.getOperand(0), @@ -6451,7 +6743,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. - uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; @@ -6477,7 +6769,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += DL.getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); @@ -6720,10 +7012,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad( const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); - unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); + unsigned VecSize = DL.getTypeSizeInBits(VecTy); - // Skip illegal vector types. - if (VecSize != 64 && VecSize != 128) + // Skip if we do not have NEON and skip illegal vector types. + if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128)) return false; // A pointer vector can not be the return type of the ldN intrinsics. 
Need to @@ -6806,10 +7098,10 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); - // Skip illegal vector types. - if (SubVecSize != 64 && SubVecSize != 128) + // Skip if we do not have NEON and skip illegal vector types. + if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128)) return false; Value *Op0 = SVI->getOperand(0); @@ -7228,8 +7520,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. - SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); - if (Res != SDValue()) + if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; EVT VT = N->getValueType(0); @@ -7242,7 +7533,7 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, // If the result of an integer load is only used by an integer-to-float // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. - // This eliminates an "integer-to-vector-move UOP and improve throughput. + // This eliminates an "integer-to-vector-move" UOP and improves throughput. SDValue N0 = N->getOperand(0); if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. @@ -7265,6 +7556,134 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Fold a floating-point multiply by power of two into floating-point to +/// fixed-point conversion. +static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue Op = N->getOperand(0); + if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) + return SDValue(); + + SDValue ConstVec = Op->getOperand(1); + if (!isa<BuildVectorSDNode>(ConstVec)) + return SDValue(); + + MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); + if (FloatBits != 32 && FloatBits != 64) + return SDValue(); + + MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t IntBits = IntTy.getSizeInBits(); + if (IntBits != 16 && IntBits != 32 && IntBits != 64) + return SDValue(); + + // Avoid conversions where iN is larger than the float (e.g., float -> i64). + if (IntBits > FloatBits) + return SDValue(); + + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t Bits = IntBits == 64 ? 64 : 32; + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); + if (C == -1 || C == 0 || C > Bits) + return SDValue(); + + MVT ResTy; + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + switch (NumLanes) { + default: + return SDValue(); + case 2: + ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; + break; + case 4: + ResTy = MVT::v4i32; + break; + } + + SDLoc DL(N); + bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; + unsigned IntrinsicOpcode = IsSigned ? 
Intrinsic::aarch64_neon_vcvtfp2fxs + : Intrinsic::aarch64_neon_vcvtfp2fxu; + SDValue FixConv = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, + DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), + Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); + // We can handle smaller integers by generating an extra trunc. + if (IntBits < FloatBits) + FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); + + return FixConv; +} + +/// Fold a floating-point divide by power of two into fixed-point to +/// floating-point conversion. +static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue Op = N->getOperand(0); + unsigned Opc = Op->getOpcode(); + if (!Op.getValueType().isVector() || + (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) + return SDValue(); + + SDValue ConstVec = N->getOperand(1); + if (!isa<BuildVectorSDNode>(ConstVec)) + return SDValue(); + + MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); + int32_t IntBits = IntTy.getSizeInBits(); + if (IntBits != 16 && IntBits != 32 && IntBits != 64) + return SDValue(); + + MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); + int32_t FloatBits = FloatTy.getSizeInBits(); + if (FloatBits != 32 && FloatBits != 64) + return SDValue(); + + // Avoid conversions where iN is larger than the float (e.g., i64 -> float). + if (IntBits > FloatBits) + return SDValue(); + + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); + if (C == -1 || C == 0 || C > FloatBits) + return SDValue(); + + MVT ResTy; + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + switch (NumLanes) { + default: + return SDValue(); + case 2: + ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; + break; + case 4: + ResTy = MVT::v4i32; + break; + } + + SDLoc DL(N); + SDValue ConvInput = Op.getOperand(0); + bool IsSigned = Opc == ISD::SINT_TO_FP; + if (IntBits < FloatBits) + ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, + ResTy, ConvInput); + + unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp + : Intrinsic::aarch64_neon_vcvtfxu2fp; + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), + DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, + DAG.getConstant(C, DL, MVT::i32)); +} + /// An EXTR instruction is made up of two shifts, ORed together. This helper /// searches for and classifies those shifts. 
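The EXTR helper introduced below classifies exactly the two ORed shifts the comment above refers to. A small reference model of the instruction itself, added here purely for illustration (extr64 is our name, not an LLVM or ACLE function):

#include <cstdint>

// EXTR Xd, Xn, Xm, #lsb returns bits [lsb+63 : lsb] of the concatenation
// Xn:Xm, i.e. a right shift of Xm ORed with a left shift of Xn.
static uint64_t extr64(uint64_t Xn, uint64_t Xm, unsigned Lsb) {
  return Lsb == 0 ? Xm : (Xm >> Lsb) | (Xn << (64 - Lsb));
}
// With Xn == Xm the result is a rotate right by Lsb, which is how ROR is
// encoded on AArch64.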
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, @@ -7964,7 +8383,6 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); - break; case Intrinsic::aarch64_neon_saddv: return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); case Intrinsic::aarch64_neon_uaddv: @@ -7978,10 +8396,16 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_neon_umaxv: return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); case Intrinsic::aarch64_neon_fmax: - return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), + return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmin: - return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0), + return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fmaxnm: + return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), + N->getOperand(1), N->getOperand(2)); + case Intrinsic::aarch64_neon_fminnm: + return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: @@ -8141,7 +8565,7 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { unsigned Alignment = std::min(OrigAlignment, EltOffset); // Create scalar stores. This is at least as good as the code sequence for a - // split unaligned store wich is a dup.s, ext.b, and two stores. + // split unaligned store which is a dup.s, ext.b, and two stores. // Most of the time the three stores should be replaced by store pair // instructions (stp). SDLoc DL(St); @@ -8162,10 +8586,9 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { return NewST1; } -static SDValue performSTORECombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { +static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { if (!DCI.isBeforeLegalize()) return SDValue(); @@ -8173,15 +8596,17 @@ static SDValue performSTORECombine(SDNode *N, if (S->isVolatile()) return SDValue(); + // FIXME: The logic for deciding if an unaligned store should be split should + // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be + // a call to that function here. + // Cyclone has bad performance on unaligned 16B stores when crossing line and // page boundaries. We want to split such stores. if (!Subtarget->isCyclone()) return SDValue(); - // Don't split at Oz. - MachineFunction &MF = DAG.getMachineFunction(); - bool IsMinSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); - if (IsMinSize) + // Don't split at -Oz. + if (DAG.getMachineFunction().getFunction()->optForMinSize()) return SDValue(); SDValue StVal = S->getValue(); @@ -8204,8 +8629,7 @@ static SDValue performSTORECombine(SDNode *N, // If we get a splat of a scalar convert this vector store to a store of // scalars. They will be merged into store pairs thereby removing two // instructions. 
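A source-level picture of the splat-store rewrite described in the comment above, as a sketch of the intended effect rather than the DAG code (the function below and its pairing comments are ours):

#include <cstdint>

// A 16-byte store of a splatted 32-bit value becomes four element stores,
// which the existing store-pair formation then merges into two STPs, instead
// of the dup + ext + two-store sequence of a split unaligned 16B store.
static void storeSplat4(uint32_t *P, uint32_t V) {
  P[0] = V; // stp w1, w1, [x0]
  P[1] = V;
  P[2] = V; // stp w1, w1, [x0, #8]
  P[3] = V;
}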
- SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); - if (ReplacedSplat != SDValue()) + if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S)) return ReplacedSplat; SDLoc DL(S); @@ -8326,6 +8750,299 @@ static SDValue performPostLD1Combine(SDNode *N, return SDValue(); } +/// Simplify \Addr given that the top byte of it is ignored by HW during +/// address translation. +static bool performTBISimplification(SDValue Addr, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + APInt DemandedMask = APInt::getLowBitsSet(64, 56); + APInt KnownZero, KnownOne; + TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(), + DCI.isBeforeLegalizeOps()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) { + DCI.CommitTargetLoweringOpt(TLO); + return true; + } + return false; +} + +static SDValue performSTORECombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + SDValue Split = split16BStores(N, DCI, DAG, Subtarget); + if (Split.getNode()) + return Split; + + if (Subtarget->supportsAddressTopByteIgnored() && + performTBISimplification(N->getOperand(2), DCI, DAG)) + return SDValue(N, 0); + + return SDValue(); +} + + /// This function handles the log2-shuffle pattern produced by the +/// LoopVectorizer for the across vector reduction. It consists of +/// log2(NumVectorElements) steps and, in each step, 2^(s) elements +/// are reduced, where s is an induction variable from 0 to +/// log2(NumVectorElements). +static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, + unsigned Op, + SelectionDAG &DAG) { + EVT VTy = OpV->getOperand(0).getValueType(); + if (!VTy.isVector()) + return SDValue(); + + int NumVecElts = VTy.getVectorNumElements(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (NumVecElts != 4) + return SDValue(); + } else { + if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) + return SDValue(); + } + + int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); + SDValue PreOp = OpV; + // Iterate over each step of the across vector reduction. + for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { + SDValue CurOp = PreOp.getOperand(0); + SDValue Shuffle = PreOp.getOperand(1); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { + // Try to swap the 1st and 2nd operand as add and min/max instructions + // are commutative. + CurOp = PreOp.getOperand(1); + Shuffle = PreOp.getOperand(0); + if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) + return SDValue(); + } + + // Check if the input vector is fed by the operator we want to handle, + // except the last step; the very first input vector is not necessarily + // the same operator we are handling. + if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) + return SDValue(); + + // Check if it forms one step of the across vector reduction. + // E.g., + // %cur = add %1, %0 + // %shuffle = vector_shuffle %cur, <2, 3, u, u> + // %pre = add %cur, %shuffle + if (Shuffle.getOperand(0) != CurOp) + return SDValue(); + + int NumMaskElts = 1 << CurStep; + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask(); + // Check mask values in each step. + // We expect the shuffle mask in each step follows a specific pattern + // denoted here by the <M, U> form, where M is a sequence of integers + // starting from NumMaskElts, increasing by 1, and the number integers + // in M should be NumMaskElts. 
U is a sequence of UNDEFs and the number + // of undefs in U should be NumVecElts - NumMaskElts. + // E.g., for <8 x i16>, mask values in each step should be: + // step 0 : <1,u,u,u,u,u,u,u> + // step 1 : <2,3,u,u,u,u,u,u> + // step 2 : <4,5,6,7,u,u,u,u> + for (int i = 0; i < NumVecElts; ++i) + if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || + (i >= NumMaskElts && !(Mask[i] < 0))) + return SDValue(); + + PreOp = CurOp; + } + unsigned Opcode; + bool IsIntrinsic = false; + + switch (Op) { + default: + llvm_unreachable("Unexpected operator for across vector reduction"); + case ISD::ADD: + Opcode = AArch64ISD::UADDV; + break; + case ISD::SMAX: + Opcode = AArch64ISD::SMAXV; + break; + case ISD::UMAX: + Opcode = AArch64ISD::UMAXV; + break; + case ISD::SMIN: + Opcode = AArch64ISD::SMINV; + break; + case ISD::UMIN: + Opcode = AArch64ISD::UMINV; + break; + case ISD::FMAXNUM: + Opcode = Intrinsic::aarch64_neon_fmaxnmv; + IsIntrinsic = true; + break; + case ISD::FMINNUM: + Opcode = Intrinsic::aarch64_neon_fminnmv; + IsIntrinsic = true; + break; + } + SDLoc DL(N); + + return IsIntrinsic + ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), + DAG.getConstant(Opcode, DL, MVT::i32), PreOp) + : DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), + DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), + DAG.getConstant(0, DL, MVT::i64)); +} + +/// Target-specific DAG combine for the across vector min/max reductions. +/// This function specifically handles the final clean-up step of the vector +/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which narrows down and finds the final min/max value from all +/// elements of the vector. +/// For example, for a <16 x i8> vector: +/// %svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> +/// %smax0 = smax %arr, %svn0 +/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax1 = smax %smax0, %svn1 +/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %smax2 = smax %smax1, %svn2 +/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +/// %sc = setcc %smax2, %svn3, gt +/// %n0 = extract_vector_elt %sc, #0 +/// %n1 = extract_vector_elt %smax2, #0 +/// %n2 = extract_vector_elt %smax2, #1 +/// %result = select %n0, %n1, %n2 +/// becomes: +/// %1 = smaxv %0 +/// %result = extract_vector_elt %1, 0 +static SDValue +performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue IfTrue = N->getOperand(1); + SDValue IfFalse = N->getOperand(2); + + // Check if the SELECT merges up the final result of the min/max + // from a vector. + if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Expect N0 to be fed by SETCC. + SDValue SetCC = N0.getOperand(0); + EVT SetCCVT = SetCC.getValueType(); + if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || + SetCCVT.getVectorElementType() != MVT::i1) + return SDValue(); + + SDValue VectorOp = SetCC.getOperand(0); + unsigned Op = VectorOp->getOpcode(); + // Check if the input vector is fed by the operator we want to handle. 
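When the whole shuffle ladder matches, the select/extract clean-up performed below collapses to one across-lanes instruction. A minimal intrinsics-level illustration of the end result, assuming a <16 x i8> signed-max reduction (the function name is ours):

#include <arm_neon.h>

// The shuffle/smax ladder plus the final setcc/select is replaced by a single
// SMAXV reducing all sixteen lanes.
static int8_t maxAcrossLanes(int8x16_t V) {
  return vmaxvq_s8(V); // smaxv b0, v0.16b
}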
+ if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && + Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM) + return SDValue(); + + EVT VTy = VectorOp.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { + if (EltTy != MVT::f32) + return SDValue(); + } else { + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + } + + // Check if extracting from the same vector. + // For example, + // %sc = setcc %vector, %svn1, gt + // %n0 = extract_vector_elt %sc, #0 + // %n1 = extract_vector_elt %vector, #0 + // %n2 = extract_vector_elt %vector, #1 + if (!(VectorOp == IfTrue->getOperand(0) && + VectorOp == IfFalse->getOperand(0))) + return SDValue(); + + // Check if the condition code is matched with the operator type. + ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get(); + if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || + (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || + (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || + (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || + (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && + CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && + CC != ISD::SETGE) || + (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && + CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && + CC != ISD::SETLE)) + return SDValue(); + + // Expect to check only lane 0 from the vector SETCC. + if (!isNullConstant(N0.getOperand(1))) + return SDValue(); + + // Expect to extract the true value from lane 0. + if (!isNullConstant(IfTrue.getOperand(1))) + return SDValue(); + + // Expect to extract the false value from lane 1. + if (!isOneConstant(IfFalse.getOperand(1))) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); +} + +/// Target-specific DAG combine for the across vector add reduction. +/// This function specifically handles the final clean-up step of the vector +/// add reduction produced by the LoopVectorizer. It is the log2-shuffle +/// pattern, which adds all elements of a vector together. +/// For example, for a <4 x i32> vector: +/// %1 = vector_shuffle %0, <2,3,u,u> +/// %2 = add %0, %1 +/// %3 = vector_shuffle %2, <1,u,u,u> +/// %4 = add %2, %3 +/// %result = extract_vector_elt %4, 0 +/// becomes: +/// %5 = uaddv %0 +/// %result = extract_vector_elt %5, 0 +static SDValue +performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + if (!Subtarget->hasNEON()) + return SDValue(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // Check if the input vector is fed by the ADD. + if (N0->getOpcode() != ISD::ADD) + return SDValue(); + + // The vector extract idx must be constant zero because we only expect the + // final result of the reduction to be placed in lane 0. + if (!isNullConstant(N1)) + return SDValue(); + + EVT VTy = N0.getValueType(); + if (!VTy.isVector()) + return SDValue(); + + EVT EltTy = VTy.getVectorElementType(); + if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) + return SDValue(); + + if (VTy.getSizeInBits() < 64) + return SDValue(); + + return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); +} + /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. 
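The base-address-update combine named in the comment above targets code like the loop below; this is an illustrative sketch with names of our choosing, where each vld1q_f32 plus the pointer bump is expected to fold into a post-indexed LD1 (ld1 {v0.4s}, [x0], #16):

#include <arm_neon.h>

// Each iteration loads four floats and advances the base by 16 bytes; after
// the combine the increment is carried by the load's post-index form.
static float sum4(const float *P, int N) {
  float32x4_t Acc = vdupq_n_f32(0.0f);
  for (int I = 0; I + 4 <= N; I += 4, P += 4)
    Acc = vaddq_f32(Acc, vld1q_f32(P));
  return vaddvq_f32(Acc); // across-lanes add of the accumulator
}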
static SDValue performNEONPostLDSTCombine(SDNode *N, @@ -8751,10 +9468,10 @@ static SDValue performBRCONDCombine(SDNode *N, if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return SDValue(); - if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue()) + if (isNullConstant(LHS)) std::swap(LHS, RHS); - if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue()) + if (!isNullConstant(RHS)) return SDValue(); if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || @@ -8868,75 +9585,6 @@ static SDValue performSelectCombine(SDNode *N, return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } -/// performSelectCCCombine - Target-specific DAG combining for ISD::SELECT_CC -/// to match FMIN/FMAX patterns. -static SDValue performSelectCCCombine(SDNode *N, SelectionDAG &DAG) { - // Try to use FMIN/FMAX instructions for FP selects like "x < y ? x : y". - // Unless the NoNaNsFPMath option is set, be careful about NaNs: - // vmax/vmin return NaN if either operand is a NaN; - // only do the transformation when it matches that behavior. - - SDValue CondLHS = N->getOperand(0); - SDValue CondRHS = N->getOperand(1); - SDValue LHS = N->getOperand(2); - SDValue RHS = N->getOperand(3); - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); - - unsigned Opcode; - bool IsReversed; - if (selectCCOpsAreFMaxCompatible(CondLHS, LHS) && - selectCCOpsAreFMaxCompatible(CondRHS, RHS)) { - IsReversed = false; // x CC y ? x : y - } else if (selectCCOpsAreFMaxCompatible(CondRHS, LHS) && - selectCCOpsAreFMaxCompatible(CondLHS, RHS)) { - IsReversed = true ; // x CC y ? y : x - } else { - return SDValue(); - } - - bool IsUnordered = false, IsOrEqual; - switch (CC) { - default: - return SDValue(); - case ISD::SETULT: - case ISD::SETULE: - IsUnordered = true; - case ISD::SETOLT: - case ISD::SETOLE: - case ISD::SETLT: - case ISD::SETLE: - IsOrEqual = (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE); - Opcode = IsReversed ? AArch64ISD::FMAX : AArch64ISD::FMIN; - break; - - case ISD::SETUGT: - case ISD::SETUGE: - IsUnordered = true; - case ISD::SETOGT: - case ISD::SETOGE: - case ISD::SETGT: - case ISD::SETGE: - IsOrEqual = (CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE); - Opcode = IsReversed ? AArch64ISD::FMIN : AArch64ISD::FMAX; - break; - } - - // If LHS is NaN, an ordered comparison will be false and the result will be - // the RHS, but FMIN(NaN, RHS) = FMAX(NaN, RHS) = NaN. Avoid this by checking - // that LHS != NaN. Likewise, for unordered comparisons, check for RHS != NaN. - if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) - return SDValue(); - - // For xxx-or-equal comparisons, "+0 <= -0" and "-0 >= +0" will both be true, - // but FMIN will return -0, and FMAX will return +0. So FMIN/FMAX can only be - // used for unsafe math or if one of the operands is known to be nonzero. - if (IsOrEqual && !DAG.getTarget().Options.UnsafeFPMath && - !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) - return SDValue(); - - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS); -} - /// Get rid of unnecessary NVCASTs (that don't change the type). 
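The fmax/fmin intrinsic mapping and the removed select_cc-to-FMIN/FMAX combine above both turn on the two NaN behaviours of the A64 min/max instructions. A scalar model, written here only for illustration:

#include <cmath>

// FMAX/FMIN (ISD::FMAXNAN/FMINNAN) return NaN if either input is NaN.
static double fmaxPropagateNaN(double A, double B) {
  if (std::isnan(A) || std::isnan(B))
    return NAN;
  return A > B ? A : B;
}

// FMAXNM/FMINNM (ISD::FMAXNUM/FMINNUM) prefer the numeric operand when
// exactly one input is NaN.
static double fmaxnmPreferNumber(double A, double B) {
  if (std::isnan(A))
    return B;
  if (std::isnan(B))
    return A;
  return A > B ? A : B;
}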
static SDValue performNVCASTCombine(SDNode *N) { if (N->getValueType(0) == N->getOperand(0).getValueType()) @@ -8961,6 +9609,11 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performIntToFpCombine(N, DAG, Subtarget); + case ISD::FP_TO_SINT: + case ISD::FP_TO_UINT: + return performFpToIntCombine(N, DAG, Subtarget); + case ISD::FDIV: + return performFDivCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: @@ -8973,12 +9626,18 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); - case ISD::SELECT: - return performSelectCombine(N, DCI); + case ISD::SELECT: { + SDValue RV = performSelectCombine(N, DCI); + if (!RV.getNode()) + RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget); + return RV; + } case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); - case ISD::SELECT_CC: - return performSelectCCCombine(N, DCI.DAG); + case ISD::LOAD: + if (performTBISimplification(N->getOperand(1), DCI, DAG)) + return SDValue(N, 0); + break; case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); case AArch64ISD::BRCOND: @@ -8991,6 +9650,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performNVCASTCombine(N); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); + case ISD::EXTRACT_VECTOR_ELT: + return performAcrossLaneAddReductionCombine(N, DAG, Subtarget); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { @@ -9157,6 +9818,20 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results, Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); } +static void ReplaceReductionResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG, unsigned InterOp, + unsigned AcrossOp) { + EVT LoVT, HiVT; + SDValue Lo, Hi; + SDLoc dl(N); + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); + SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); + SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); + Results.push_back(SplitVal); +} + void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -9165,6 +9840,24 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; + case AArch64ISD::SADDV: + ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); + return; + case AArch64ISD::UADDV: + ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); + return; + case AArch64ISD::SMINV: + ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); + return; + case AArch64ISD::UMINV: + ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); + return; + case AArch64ISD::SMAXV: + ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); + return; + case AArch64ISD::UMAXV: + ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); + return; case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); @@ -9177,10 +9870,10 @@ bool AArch64TargetLowering::useLoadStackGuardNode() const { return true; } -bool 
AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { +unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are three or more FDIVs. - return NumUsers > 2; + return 3; } TargetLoweringBase::LegalizeTypeAction @@ -9206,20 +9899,21 @@ bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. -bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return Size == 128; + return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, -TargetLoweringBase::AtomicRMWExpansionKind +TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); - return Size <= 128 ? AtomicRMWExpansionKind::LLSC - : AtomicRMWExpansionKind::None; + return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } -bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const { +bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( + AtomicCmpXchgInst *AI) const { return true; } @@ -9258,6 +9952,13 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, cast<PointerType>(Addr->getType())->getElementType()); } +void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( + IRBuilder<> &Builder) const { + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Builder.CreateCall( + llvm::Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); +} + Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { @@ -9294,3 +9995,70 @@ bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { return Ty->isArrayTy(); } + +bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, EVT) const { + return false; +} + +Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getSafeStackPointerLocation(IRB); + + // Android provides a fixed TLS slot for the SafeStack pointer. See the + // definition of TLS_SLOT_SAFESTACK in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + const unsigned TlsOffset = 0x48; + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Function *ThreadPointerFunc = + Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer); + return IRB.CreatePointerCast( + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); +} + +void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + // Update IsSplitCSR in AArch64FunctionInfo. 
+ AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void AArch64TargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (AArch64::GPR64RegClass.contains(*I)) + RC = &AArch64::GPR64RegClass; + else if (AArch64::FPR64RegClass.contains(*I)) + RC = &AArch64::FPR64RegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. + assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h index c73ce1e..e99616c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H +#include "AArch64.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/CallingConv.h" @@ -58,13 +59,14 @@ enum NodeType : unsigned { SBCS, ANDS, + // Conditional compares. Operands: left,right,falsecc,cc,flags + CCMP, + CCMN, + FCCMP, + // Floating point comparison FCMP, - // Floating point max and min instructions. - FMAX, - FMIN, - // Scalar extract EXTR, @@ -217,8 +219,6 @@ class AArch64Subtarget; class AArch64TargetMachine; class AArch64TargetLowering : public TargetLowering { - bool RequireStrictAlign; - public: explicit AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI); @@ -226,46 +226,35 @@ public: /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; - /// computeKnownBitsForTargetNode - Determine which of the bits specified in - /// Mask are known to be either zero or one and return them in the - /// KnownZero/KnownOne bitsets. + /// Determine which of the bits specified in Mask are known to be either zero + /// or one and return them in the KnownZero/KnownOne bitsets. void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth = 0) const override; MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; - /// allowsMisalignedMemoryAccesses - Returns true if the target allows - /// unaligned memory accesses of the specified type. 
+ /// Returns true if the target allows unaligned memory accesses of the + /// specified type. bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, - bool *Fast = nullptr) const override { - if (RequireStrictAlign) - return false; - // FIXME: True for Cyclone, but not necessary others. - if (Fast) - *Fast = true; - return true; - } + bool *Fast = nullptr) const override; - /// LowerOperation - Provide custom lowering hooks for some operations. + /// Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; const char *getTargetNodeName(unsigned Opcode) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - /// getFunctionAlignment - Return the Log2 alignment of this function. - unsigned getFunctionAlignment(const Function *F) const; - /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { // Addrspacecasts are always noops. return true; } - /// createFastISel - This method returns a target specific FastISel object, - /// or null if the target does not support "fast" ISel. + /// This method returns a target specific FastISel object, or null if the + /// target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override; @@ -273,11 +262,11 @@ public: bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; - /// isShuffleMaskLegal - Return true if the given shuffle mask can be - /// codegen'd directly, or if it should be stack expanded. + /// Return true if the given shuffle mask can be codegen'd directly, or if it + /// should be stack expanded. bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override; - /// getSetCCResultType - Return the ISD::SETCC ValueType + /// Return the ISD::SETCC ValueType. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -322,8 +311,8 @@ public: bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const override; - /// isLegalAddressingMode - Return true if the addressing mode represented - /// by AM is legal for this target, for a load/store of the specified type. + /// Return true if the addressing mode represented by AM is legal for this + /// target, for a load/store of the specified type. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; @@ -335,10 +324,9 @@ public: int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; - /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster - /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be - /// expanded to FMAs when this method returns true, otherwise fmuladd is - /// expanded to fmul + fadd. + /// Return true if an FMA operation is faster than a pair of fmul and fadd + /// instructions. fmuladd intrinsics will be expanded to FMAs when this method + /// returns true, otherwise fmuladd is expanded to fmul + fadd. 
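The FMA hook documented above is what lets fmuladd collapse to a single fused instruction; a minimal illustration, with names of our choosing:

#include <cmath>

// When isFMAFasterThanFMulAndFAdd returns true for the type, the fmuladd
// intrinsic (and std::fma) lowers to one FMADD rather than FMUL + FADD.
static double mulAdd(double A, double B, double C) {
  return std::fma(A, B, C); // fmadd d0, d0, d1, d2
}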
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; @@ -351,25 +339,65 @@ public: bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; - bool hasLoadLinkedStoreConditional() const override; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; + + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - TargetLoweringBase::AtomicRMWExpansionKind + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; + bool useLoadStackGuardNode() const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; + /// If the target has a standard location for the unsafe stack pointer, + /// returns the address of that location. Otherwise, returns nullptr. + Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; + + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + // FIXME: This is a guess. Has this been defined yet? + return AArch64::X0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + // FIXME: This is a guess. Has this been defined yet? + return AArch64::X1; + } + + bool isCheapToSpeculateCttz() const override { + return true; + } + + bool isCheapToSpeculateCtlz() const override { + return true; + } + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + private: bool isExtFreeImpl(const Instruction *Ext) const override; - /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can + /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. 
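The isCheapToSpeculateCttz/Ctlz overrides declared above reflect that both counts are short branchless sequences on AArch64, so hoisting them past a zero test is cheap. An illustrative snippet (ours):

#include <cstdint>

// CTLZ is a single CLZ; CTTZ is RBIT followed by CLZ. Handling the zero case
// costs only a compare and CSEL, so speculating the count is fine.
static unsigned countTrailingZeros64(uint64_t X) {
  return X ? static_cast<unsigned>(__builtin_ctzll(X)) : 64u; // rbit; clz; csel
}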
const AArch64Subtarget *Subtarget; @@ -392,6 +420,8 @@ private: SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + bool isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, @@ -470,7 +500,7 @@ private: SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; - bool combineRepeatedFPDivisors(unsigned NumUsers) const override; + unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; unsigned getRegisterByName(const char* RegName, EVT VT, @@ -516,6 +546,8 @@ private: bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; + + bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override; }; namespace AArch64 { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 3f2e772..6ac2175 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -248,6 +248,12 @@ def simm7s16 : Operand<i32> { let PrintMethod = "printImmScale<16>"; } +def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>; +def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>; +def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>; +def am_indexed7s64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S64", []>; +def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>; + class AsmImmRange<int Low, int High> : AsmOperandClass { let Name = "Imm" # Low # "_" # High; let DiagnosticType = "InvalidImm" # Low # "_" # High; @@ -346,9 +352,11 @@ class fixedpoint_i64<ValueType FloatVT> let ParserMatchClass = Imm1_64Operand; } +def fixedpoint_f16_i32 : fixedpoint_i32<f16>; def fixedpoint_f32_i32 : fixedpoint_i32<f32>; def fixedpoint_f64_i32 : fixedpoint_i32<f64>; +def fixedpoint_f16_i64 : fixedpoint_i64<f16>; def fixedpoint_f32_i64 : fixedpoint_i64<f32>; def fixedpoint_f64_i64 : fixedpoint_i64<f64>; @@ -402,6 +410,7 @@ def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{ let ParserMatchClass = Imm1_32Operand; } +def Imm0_1Operand : AsmImmRange<0, 1>; def Imm0_7Operand : AsmImmRange<0, 7>; def Imm0_15Operand : AsmImmRange<0, 15>; def Imm0_31Operand : AsmImmRange<0, 31>; @@ -525,6 +534,20 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{ let ParserMatchClass = Imm0_31Operand; } +// True if the 32-bit immediate is in the range [0,31] +def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{ + return ((uint64_t)Imm) < 32; +}]> { + let ParserMatchClass = Imm0_31Operand; +} + +// imm0_1 predicate - True if the immediate is in the range [0,1] +def imm0_1 : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 2; +}]> { + let ParserMatchClass = Imm0_1Operand; +} + // imm0_15 predicate - True if the immediate is in the range [0,15] def imm0_15 : Operand<i64>, ImmLeaf<i64, [{ return ((uint64_t)Imm) < 16; @@ -542,7 +565,9 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{ // imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15] def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 16; -}]>; +}]> { + let ParserMatchClass = Imm0_15Operand; +} // An arithmetic shifter operand: // {7-6} - shift type: 00 
= lsl, 01 = lsr, 10 = asr @@ -690,6 +715,17 @@ class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>, } // Floating-point immediate. +def fpimm16 : Operand<f16>, + PatLeaf<(f16 fpimm), [{ + return AArch64_AM::getFP16Imm(N->getValueAPF()) != -1; + }], SDNodeXForm<fpimm, [{ + APFloat InVal = N->getValueAPF(); + uint32_t enc = AArch64_AM::getFP16Imm(InVal); + return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i32); + }]>> { + let ParserMatchClass = FPImmOperand; + let PrintMethod = "printFPImmOperand"; +} def fpimm32 : Operand<f32>, PatLeaf<(f32 fpimm), [{ return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1; @@ -822,7 +858,7 @@ class RtSystemI<bit L, dag oops, dag iops, string asm, string operands> // model patterns with sufficiently fine granularity let mayStore = 1, mayLoad = 1, hasSideEffects = 1 in class HintI<string mnemonic> - : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "", + : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#"\t$imm", "", [(int_aarch64_hint imm0_127:$imm)]>, Sched<[WriteHint]> { bits <7> imm; @@ -875,6 +911,25 @@ def msr_sysreg_op : Operand<i32> { let PrintMethod = "printMSRSystemRegister"; } +def PSBHintOperand : AsmOperandClass { + let Name = "PSBHint"; + let ParserMethod = "tryParsePSBHint"; +} +def psbhint_op : Operand<i32> { + let ParserMatchClass = PSBHintOperand; + let PrintMethod = "printPSBHintOp"; + let MCOperandPredicate = [{ + // Check, if operand is valid, to fix exhaustive aliasing in disassembly. + // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields. + if (!MCOp.isImm()) + return false; + bool ValidNamed; + (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(), + STI.getFeatureBits(), ValidNamed); + return ValidNamed; + }]; +} + class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg), "mrs", "\t$Rt, $systemreg"> { bits<16> systemreg; @@ -890,19 +945,19 @@ class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt), let Inst{20-5} = systemreg; } -def SystemPStateFieldOperand : AsmOperandClass { - let Name = "SystemPStateField"; +def SystemPStateFieldWithImm0_15Operand : AsmOperandClass { + let Name = "SystemPStateFieldWithImm0_15"; let ParserMethod = "tryParseSysReg"; } -def pstatefield_op : Operand<i32> { - let ParserMatchClass = SystemPStateFieldOperand; +def pstatefield4_op : Operand<i32> { + let ParserMatchClass = SystemPStateFieldWithImm0_15Operand; let PrintMethod = "printSystemPStateField"; } let Defs = [NZCV] in -class MSRpstateI - : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm), - "msr", "\t$pstate_field, $imm">, +class MSRpstateImm0_15 + : SimpleSystemI<0, (ins pstatefield4_op:$pstatefield, imm0_15:$imm), + "msr", "\t$pstatefield, $imm">, Sched<[WriteSys]> { bits<6> pstatefield; bits<4> imm; @@ -913,6 +968,37 @@ class MSRpstateI let Inst{7-5} = pstatefield{2-0}; let DecoderMethod = "DecodeSystemPStateInstruction"; + // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns + // Fail the decoder should attempt to decode the instruction as MSRI. 
+ let hasCompleteDecoder = 0; +} + +def SystemPStateFieldWithImm0_1Operand : AsmOperandClass { + let Name = "SystemPStateFieldWithImm0_1"; + let ParserMethod = "tryParseSysReg"; +} +def pstatefield1_op : Operand<i32> { + let ParserMatchClass = SystemPStateFieldWithImm0_1Operand; + let PrintMethod = "printSystemPStateField"; +} + +let Defs = [NZCV] in +class MSRpstateImm0_1 + : SimpleSystemI<0, (ins pstatefield1_op:$pstatefield, imm0_1:$imm), + "msr", "\t$pstatefield, $imm">, + Sched<[WriteSys]> { + bits<6> pstatefield; + bit imm; + let Inst{20-19} = 0b00; + let Inst{18-16} = pstatefield{5-3}; + let Inst{15-9} = 0b0100000; + let Inst{8} = imm; + let Inst{7-5} = pstatefield{2-0}; + + let DecoderMethod = "DecodeSystemPStateInstruction"; + // MSRpstateI aliases with MSRI. When the MSRpstateI decoder method returns + // Fail the decoder should attempt to decode the instruction as MSRI. + let hasCompleteDecoder = 0; } // SYS and SYSL generic system instructions. @@ -1341,7 +1427,7 @@ multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> { } class ShiftAlias<string asm, Instruction inst, RegisterClass regtype> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst regtype:$dst, regtype:$src1, regtype:$src2), 0>; class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype, @@ -1407,13 +1493,13 @@ class MulHi<bits<3> opc, string asm, SDNode OpNode> } class MulAccumWAlias<string asm, Instruction inst> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>; class MulAccumXAlias<string asm, Instruction inst> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>; class WideMulAccumAlias<string asm, Instruction inst> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>; class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg, @@ -1643,7 +1729,7 @@ class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype, class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype, RegisterClass src1Regtype, RegisterClass src2Regtype, int shiftExt> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2, shiftExt)>; @@ -1701,10 +1787,10 @@ multiclass AddSub<bit isSub, string mnemonic, string alias, } // add Rd, Rb, -imm -> sub Rd, Rn, imm - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; @@ -1776,43 +1862,43 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp, } // Defs = [NZCV] // Support negative immediates, e.g. 
adds Rd, Rn, -imm -> subs Rd, Rn, imm - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<alias#" $Rd, $Rn, $imm", + def : InstAlias<alias#"\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn, addsub_shifted_imm64_neg:$imm), 0>; // Compare aliases - def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Wri") + def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>; - def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Xri") + def : InstAlias<cmp#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx") WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx") XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64") XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs") WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>; - def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs") + def : InstAlias<cmp#"\t$src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs") XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>; // Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm - def : InstAlias<cmpAlias#" $src, $imm", (!cast<Instruction>(NAME#"Wri") + def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri") WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>; - def : InstAlias<cmpAlias#" $src, $imm", (!cast<Instruction>(NAME#"Xri") + def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri") XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>; // Compare shorthands - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrs") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrs") WZR, GPR32:$src1, GPR32:$src2, 0), 5>; - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrs") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrs") XZR, GPR64:$src1, GPR64:$src2, 0), 5>; - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrx") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Wrx") WZR, GPR32sponly:$src1, GPR32:$src2, 16), 5>; - def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrx64") + def : InstAlias<cmp#"\t$src1, $src2", (!cast<Instruction>(NAME#"Xrx64") XZR, GPR64sponly:$src1, GPR64:$src2, 24), 5>; // Register/register aliases with no shift when SP is not used. @@ -1998,7 +2084,7 @@ class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype, // Aliases for register+register logical instructions. 
class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype> - : InstAlias<asm#" $dst, $src1, $src2", + : InstAlias<asm#"\t$dst, $src1, $src2", (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>; multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, @@ -2017,10 +2103,10 @@ multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode, let Inst{31} = 1; } - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } @@ -2039,10 +2125,10 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode, } } // end Defs = [NZCV] - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn, logical_imm32_not:$imm), 0>; - def : InstAlias<Alias # " $Rd, $Rn, $imm", + def : InstAlias<Alias # "\t$Rd, $Rn, $imm", (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn, logical_imm64_not:$imm), 0>; } @@ -2105,9 +2191,12 @@ multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic, //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm> - : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $imm, $nzcv, $cond", "", []>, +class BaseCondComparisonImm<bit op, RegisterClass regtype, ImmLeaf immtype, + string mnemonic, SDNode OpNode> + : I<(outs), (ins regtype:$Rn, immtype:$imm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $imm, $nzcv, $cond", "", + [(set NZCV, (OpNode regtype:$Rn, immtype:$imm, (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]>, Sched<[WriteI, ReadI]> { let Uses = [NZCV]; let Defs = [NZCV]; @@ -2127,19 +2216,13 @@ class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm> let Inst{3-0} = nzcv; } -multiclass CondSetFlagsImm<bit op, string asm> { - def Wi : BaseCondSetFlagsImm<op, GPR32, asm> { - let Inst{31} = 0; - } - def Xi : BaseCondSetFlagsImm<op, GPR64, asm> { - let Inst{31} = 1; - } -} - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm> - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, +class BaseCondComparisonReg<bit op, RegisterClass regtype, string mnemonic, + SDNode OpNode> + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", + [(set NZCV, (OpNode regtype:$Rn, regtype:$Rm, (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]>, Sched<[WriteI, ReadI, ReadI]> { let Uses = [NZCV]; let Defs = [NZCV]; @@ -2159,11 +2242,19 @@ class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm> let Inst{3-0} = nzcv; } -multiclass CondSetFlagsReg<bit op, string asm> { - def Wr : BaseCondSetFlagsReg<op, GPR32, asm> { +multiclass CondComparison<bit op, string mnemonic, SDNode OpNode> { + // immediate operand variants + def Wi : BaseCondComparisonImm<op, GPR32, imm32_0_31, mnemonic, OpNode> { let Inst{31} = 0; } - def Xr : BaseCondSetFlagsReg<op, GPR64, asm> { + def Xi : BaseCondComparisonImm<op, GPR64, imm0_31, mnemonic, OpNode> { + let Inst{31} = 1; + } + // register operand variants + def Wr : BaseCondComparisonReg<op, GPR32, mnemonic, OpNode> { + let Inst{31} = 
0; + } + def Xr : BaseCondComparisonReg<op, GPR64, mnemonic, OpNode> { let Inst{31} = 1; } } @@ -2328,7 +2419,7 @@ multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, asm, pattern>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2340,7 +2431,7 @@ multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, asm, pattern>, Sched<[WriteST]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2508,7 +2599,7 @@ class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, } class ROInstAlias<string asm, RegisterClass regtype, Instruction INST> - : InstAlias<asm # " $Rt, [$Rn, $Rm]", + : InstAlias<asm # "\t$Rt, [$Rn, $Rm]", (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>; multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, @@ -2934,7 +3025,7 @@ multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, (ins GPR64sp:$Rn, simm9:$offset), asm, pattern>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2946,7 +3037,7 @@ multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype, asm, pattern>, Sched<[WriteST]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -2958,7 +3049,7 @@ multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm, asm, pat>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>; } @@ -2993,7 +3084,7 @@ multiclass LoadUnprivileged<bits<2> sz, bit V, bits<2> opc, (ins GPR64sp:$Rn, simm9:$offset), asm>, Sched<[WriteLD]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -3005,7 +3096,7 @@ multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc, asm>, Sched<[WriteST]>; - def : InstAlias<asm # " $Rt, [$Rn]", + def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>; } @@ -3136,7 +3227,7 @@ multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype, (ins GPR64sp:$Rn, indextype:$offset), asm>, Sched<[WriteLD, WriteLDHi]>; - def : InstAlias<asm # " $Rt, $Rt2, [$Rn]", + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, 0)>; } @@ -3151,7 +3242,7 @@ multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype, asm>, Sched<[WriteSTP]>; - def : InstAlias<asm # " $Rt, $Rt2, [$Rn]", + def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]", (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, 0)>; } @@ -3230,8 +3321,8 @@ class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype, let mayStore = 1, mayLoad = 0 in class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype, Operand idxtype, string asm> - : BaseLoadStorePairPostIdx<opc, V, 0, (outs), - (ins GPR64sp:$wback, regtype:$Rt, regtype:$Rt2, + : BaseLoadStorePairPostIdx<opc, V, 0, (outs GPR64sp:$wback), + (ins regtype:$Rt, regtype:$Rt2, GPR64sp:$Rn, idxtype:$offset), asm>, Sched<[WriteAdr, WriteSTP]>; @@ -3477,6 +3568,20 @@ class 
BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode, multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm, SDPatternOperator OpN> { + // Unscaled half-precision to 32-bit + def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm, + [(set GPR32:$Rd, (OpN FPR16:$Rn))]> { + let Inst{31} = 0; // 32-bit GPR flag + let Predicates = [HasFullFP16]; + } + + // Unscaled half-precision to 64-bit + def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm, + [(set GPR64:$Rd, (OpN FPR16:$Rn))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Predicates = [HasFullFP16]; + } + // Unscaled single-precision to 32-bit def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm, [(set GPR32:$Rd, (OpN FPR32:$Rn))]> { @@ -3504,6 +3609,25 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm, multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm, SDPatternOperator OpN> { + // Scaled half-precision to 32-bit + def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32, + fixedpoint_f16_i32, asm, + [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn, + fixedpoint_f16_i32:$scale)))]> { + let Inst{31} = 0; // 32-bit GPR flag + let scale{5} = 1; + let Predicates = [HasFullFP16]; + } + + // Scaled half-precision to 64-bit + def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64, + fixedpoint_f16_i64, asm, + [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn, + fixedpoint_f16_i64:$scale)))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Predicates = [HasFullFP16]; + } + // Scaled single-precision to 32-bit def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32, fixedpoint_f32_i32, asm, @@ -3553,7 +3677,7 @@ class BaseIntegerToFP<bit isUnsigned, bits<5> Rd; bits<5> Rn; bits<6> scale; - let Inst{30-23} = 0b00111100; + let Inst{30-24} = 0b0011110; let Inst{21-17} = 0b00001; let Inst{16} = isUnsigned; let Inst{15-10} = scale; @@ -3570,7 +3694,7 @@ class BaseIntegerToFPUnscaled<bit isUnsigned, bits<5> Rd; bits<5> Rn; bits<6> scale; - let Inst{30-23} = 0b00111100; + let Inst{30-24} = 0b0011110; let Inst{21-17} = 0b10001; let Inst{16} = isUnsigned; let Inst{15-10} = 0b000000; @@ -3580,33 +3704,55 @@ class BaseIntegerToFPUnscaled<bit isUnsigned, multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { // Unscaled + def UWHri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR16, f16, asm, node> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + } + + def UXHri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR16, f16, asm, node> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; } def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag } // 
Scaled + def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm, + [(set FPR16:$Rd, + (fdiv (node GPR32:$Rn), + fixedpoint_f16_i32:$scale))]> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let scale{5} = 1; + let Predicates = [HasFullFP16]; + } + def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm, [(set FPR32:$Rd, (fdiv (node GPR32:$Rn), fixedpoint_f32_i32:$scale))]> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag let scale{5} = 1; } @@ -3615,16 +3761,25 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { (fdiv (node GPR32:$Rn), fixedpoint_f64_i32:$scale))]> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag let scale{5} = 1; } + def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm, + [(set FPR16:$Rd, + (fdiv (node GPR64:$Rn), + fixedpoint_f16_i64:$scale))]> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm, [(set FPR32:$Rd, (fdiv (node GPR64:$Rn), fixedpoint_f32_i64:$scale))]> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm, @@ -3632,7 +3787,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> { (fdiv (node GPR64:$Rn), fixedpoint_f64_i64:$scale))]> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag } } @@ -3654,7 +3809,7 @@ class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode, Sched<[WriteFCopy]> { bits<5> Rd; bits<5> Rn; - let Inst{30-23} = 0b00111100; + let Inst{30-24} = 0b0011110; let Inst{21} = 1; let Inst{20-19} = rmode; let Inst{18-16} = opcode; @@ -3704,26 +3859,49 @@ class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode, } - multiclass UnscaledConversion<string asm> { + def WHr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR16, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def XHr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR16, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag + } + + def HWr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR32, asm> { + let Inst{31} = 0; // 32-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; + } + + def HXr : BaseUnscaledConversion<0b00, 0b110, FPR16, GPR64, asm> { + let Inst{31} = 1; // 64-bit GPR flag + let Inst{23-22} = 0b11; // 16-bit FPR flag + let Predicates = [HasFullFP16]; } def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> { let Inst{31} = 0; // 32-bit GPR flag - let Inst{22} = 0; // 32-bit FPR flag + let Inst{23-22} = 0b00; // 32-bit FPR flag } def DXr : 
BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> { let Inst{31} = 1; // 64-bit GPR flag - let Inst{22} = 1; // 64-bit FPR flag + let Inst{23-22} = 0b01; // 64-bit FPR flag } def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128, @@ -3796,7 +3974,7 @@ class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype, Sched<[WriteF]> { bits<5> Rd; bits<5> Rn; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21-19} = 0b100; let Inst{18-15} = opcode; let Inst{14-10} = 0b10000; @@ -3806,12 +3984,17 @@ class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype, multiclass SingleOperandFPData<bits<4> opcode, string asm, SDPatternOperator node = null_frag> { + def Hr : BaseSingleOperandFPData<opcode, FPR16, f16, asm, node> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } @@ -3828,7 +4011,7 @@ class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype, bits<5> Rd; bits<5> Rn; bits<5> Rm; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = opcode; @@ -3839,28 +4022,41 @@ class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype, multiclass TwoOperandFPData<bits<4> opcode, string asm, SDPatternOperator node = null_frag> { + def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm, + [(set (f16 FPR16:$Rd), + (node (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + def Srr : BaseTwoOperandFPData<opcode, FPR32, asm, [(set (f32 FPR32:$Rd), (node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Drr : BaseTwoOperandFPData<opcode, FPR64, asm, [(set (f64 FPR64:$Rd), (node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> { + def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm, + [(set FPR16:$Rd, (fneg (node FPR16:$Rn, (f16 FPR16:$Rm))))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let Predicates = [HasFullFP16]; + } + def Srr : BaseTwoOperandFPData<opcode, FPR32, asm, [(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Drr : BaseTwoOperandFPData<opcode, FPR64, asm, [(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } @@ -3878,7 +4074,7 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub, bits<5> Rn; bits<5> Rm; bits<5> Ra; - let Inst{31-23} = 0b000111110; + let Inst{31-24} = 0b00011111; let Inst{21} = isNegated; let Inst{20-16} = Rm; let Inst{15} = isSub; @@ -3889,16 +4085,23 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub, multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm, SDPatternOperator node> { + def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm, + [(set FPR16:$Rd, + (node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> { + let Inst{23-22} = 0b11; // 16-bit size flag + let 
Predicates = [HasFullFP16]; + } + def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm, [(set FPR32:$Rd, (node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> { - let Inst{22} = 0; // 32-bit size flag + let Inst{23-22} = 0b00; // 32-bit size flag } def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm, [(set FPR64:$Rd, (node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> { - let Inst{22} = 1; // 64-bit size flag + let Inst{23-22} = 0b01; // 64-bit size flag } } @@ -3913,7 +4116,7 @@ class BaseOneOperandFPComparison<bit signalAllNans, : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>, Sched<[WriteFCmp]> { bits<5> Rn; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{15-10} = 0b001000; @@ -3932,7 +4135,7 @@ class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype, Sched<[WriteFCmp]> { bits<5> Rm; bits<5> Rn; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-10} = 0b001000; @@ -3944,24 +4147,36 @@ class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype, multiclass FPComparison<bit signalAllNans, string asm, SDPatternOperator OpNode = null_frag> { let Defs = [NZCV] in { + def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm, + [(OpNode FPR16:$Rn, (f16 FPR16:$Rm)), (implicit NZCV)]> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + + def Hri : BaseOneOperandFPComparison<signalAllNans, FPR16, asm, + [(OpNode (f16 FPR16:$Rn), fpimm0), (implicit NZCV)]> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm, [(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm, [(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm, [(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm, [(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } } // Defs = [NZCV] } @@ -3971,17 +4186,20 @@ multiclass FPComparison<bit signalAllNans, string asm, //--- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseFPCondComparison<bit signalAllNans, - RegisterClass regtype, string asm> - : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond), - asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>, +class BaseFPCondComparison<bit signalAllNans, RegisterClass regtype, + string mnemonic, list<dag> pat> + : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm32_0_15:$nzcv, ccode:$cond), + mnemonic, "\t$Rn, $Rm, $nzcv, $cond", "", pat>, Sched<[WriteFCmp]> { + let Uses = [NZCV]; + let Defs = [NZCV]; + bits<5> Rn; bits<5> Rm; bits<4> nzcv; bits<4> cond; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = cond; @@ -3991,16 +4209,24 @@ class BaseFPCondComparison<bit signalAllNans, let Inst{3-0} = nzcv; } -multiclass FPCondComparison<bit signalAllNans, string asm> { - let Defs = [NZCV], Uses = [NZCV] in { - def Srr : BaseFPCondComparison<signalAllNans, FPR32, asm> { - let Inst{22} = 0; +multiclass FPCondComparison<bit signalAllNans, string mnemonic, + SDPatternOperator OpNode = null_frag> { + def 
Hrr : BaseFPCondComparison<signalAllNans, FPR16, mnemonic, []> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; } - def Drr : BaseFPCondComparison<signalAllNans, FPR64, asm> { - let Inst{22} = 1; + def Srr : BaseFPCondComparison<signalAllNans, FPR32, mnemonic, + [(set NZCV, (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm), (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]> { + let Inst{23-22} = 0b00; + } + + def Drr : BaseFPCondComparison<signalAllNans, FPR64, mnemonic, + [(set NZCV, (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm), (i32 imm:$nzcv), + (i32 imm:$cond), NZCV))]> { + let Inst{23-22} = 0b01; } - } // Defs = [NZCV], Uses = [NZCV] } //--- @@ -4019,7 +4245,7 @@ class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm> bits<5> Rm; bits<4> cond; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-16} = Rm; let Inst{15-12} = cond; @@ -4030,12 +4256,17 @@ class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm> multiclass FPCondSelect<string asm> { let Uses = [NZCV] in { + def Hrrr : BaseFPCondSelect<FPR16, f16, asm> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + def Srrr : BaseFPCondSelect<FPR32, f32, asm> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Drrr : BaseFPCondSelect<FPR64, f64, asm> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } } // Uses = [NZCV] } @@ -4050,7 +4281,7 @@ class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm> Sched<[WriteFImm]> { bits<5> Rd; bits<8> imm; - let Inst{31-23} = 0b000111100; + let Inst{31-24} = 0b00011110; let Inst{21} = 1; let Inst{20-13} = imm; let Inst{12-5} = 0b10000000; @@ -4058,12 +4289,17 @@ class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm> } multiclass FPMoveImmediate<string asm> { + def Hi : BaseFPMoveImmediate<FPR16, fpimm16, asm> { + let Inst{23-22} = 0b11; + let Predicates = [HasFullFP16]; + } + def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> { - let Inst{22} = 0; + let Inst{23-22} = 0b00; } def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> { - let Inst{22} = 1; + let Inst{23-22} = 0b01; } } } // end of 'let Predicates = [HasFPARMv8]' @@ -4079,7 +4315,7 @@ let Predicates = [HasNEON] in { //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, +class BaseSIMDThreeSameVector<bit Q, bit U, bits<3> size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list<dag> pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, @@ -4093,8 +4329,7 @@ class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -4103,7 +4338,7 @@ class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, } let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, +class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list<dag> pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm, @@ -4117,8 +4352,7 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 
0b01110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -4129,25 +4363,25 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, // All operand sizes distinguished in the encoding. multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; - def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDThreeSameVector<1, U, 0b111, opc, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; } @@ -4155,49 +4389,49 @@ multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm, // As above, but D sized elements unsupported. 
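// Illustrative sketch, not taken from the patch: the hunks above widen the
// 'size' parameter of the three-same-vector bases from bits<2> to bits<3>,
// folding the formerly hard-wired Inst{21} into the field so that the v8.2-A
// half-precision forms, which need that bit clear, can be encoded.  A minimal
// toy version of the idea (all names here are invented for the sketch):
class ToyEncA {
  field bits<32> Inst;
}
class ToyThreeSameVector<bits<3> size> : ToyEncA {
  let Inst{23-21} = size;   // previously: Inst{23-22} = size and Inst{21} = 1
}
def ToyHWordOp : ToyThreeSameVector<0b011>;  // old 0b01 encoding, bit 21 still set
def ToyFP16Op  : ToyThreeSameVector<0b010>;  // half-precision {S,0b10} with S = 0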
multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>; - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>; } multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; - def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; @@ -4206,54 +4440,80 @@ multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm, // As above, but only B sized elements supported. multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, 0b001, opc, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, 0b001, opc, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>; } -// As above, but only S and D sized floating point elements supported. 
-multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<5> opc, +// As above, but only floating point elements supported. +multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } -multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<5> opc, +multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVector<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVector<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVector<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; } -multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc, +multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDThreeSameVectorTied<0, U, {S,0b10}, {0b00,opc}, V64, + asm, ".4h", + [(set (v4f16 V64:$dst), + (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn), (v4f16 V64:$Rm)))]>; + def v8f16 : BaseSIMDThreeSameVectorTied<1, U, {S,0b10}, {0b00,opc}, V128, + asm, ".8h", + [(set (v8f16 V128:$dst), + (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn), (v8f16 V128:$Rm)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0b01}, {0b11,opc}, V64, asm, ".2s", [(set (v2f32 V64:$dst), (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>; - def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0b01}, {0b11,opc}, V128, asm, ".4s", [(set (v4f32 V128:$dst), (OpNode 
(v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>; - def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,0b11}, {0b11,opc}, V128, asm, ".2d", [(set (v2f64 V128:$dst), (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>; @@ -4262,16 +4522,16 @@ multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc, // As above, but D and B sized elements unsupported. multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDThreeSameVector<0, U, 0b011, opc, V64, asm, ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>; - def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDThreeSameVector<1, U, 0b011, opc, V128, asm, ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>; - def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDThreeSameVector<0, U, 0b101, opc, V64, asm, ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>; - def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDThreeSameVector<1, U, 0b101, opc, V128, asm, ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>; } @@ -4279,10 +4539,10 @@ multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm, // Logical three vector ops share opcode bits, and only use B sized elements. multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm, SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64, + def v8i8 : BaseSIMDThreeSameVector<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>; - def v16i8 : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128, + def v16i8 : BaseSIMDThreeSameVector<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>; @@ -4303,11 +4563,11 @@ multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm, multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64, + def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64, asm, ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>; - def v16i8 : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128, + def v16i8 : BaseSIMDThreeSameVectorTied<1, U, {size,1}, 0b00011, V128, asm, ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), @@ -4347,8 +4607,8 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list<dag> pattern> + bits<2> size2, RegisterOperand regtype, string asm, + string dstkind, string srckind, list<dag> pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "", pattern>, @@ -4360,7 +4620,9 @@ class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ 
-4369,8 +4631,9 @@ class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, - RegisterOperand regtype, string asm, string dstkind, - string srckind, list<dag> pattern> + bits<2> size2, RegisterOperand regtype, + string asm, string dstkind, string srckind, + list<dag> pattern> : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm, "{\t$Rd" # dstkind # ", $Rn" # srckind # "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>, @@ -4382,7 +4645,9 @@ class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -4392,22 +4657,22 @@ class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode, // Supports B, H, and S element sizes. multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } @@ -4450,49 +4715,49 @@ multiclass SIMDVectorLShiftLongBySizeBHS { // Supports all element sizes. 
multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8_v4i16 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".4h", ".8b", [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".8h", ".16b", [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".2s", ".4h", [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".4s", ".8h", [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".1d", ".2s", [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".2d", ".4s", [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + def v8i8_v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, asm, ".4h", ".8b", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, asm, ".8h", ".16b", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, asm, ".2s", ".4h", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, asm, ".4s", ".8h", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, asm, ".1d", ".2s", [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, asm, ".2d", ".4s", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v4i32 V128:$Rn)))]>; @@ -4501,50 +4766,50 @@ multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm, // Supports all element sizes, except 1xD. 
multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>; } multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; - def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128, + def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } @@ -4553,10 +4818,10 @@ multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm, // Supports only B element sizes. 
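// Illustrative sketch, not taken from the patch: the two-register "same
// vector" bases above split the old constant Inst{21-17} = 0b10000 into a
// fixed bit 21, a new size2 field at Inst{20-19}, and fixed bits 18-17.
// Existing users simply pass size2 = 0b00, which reproduces the original
// encoding, while the new v8.2-A half-precision forms pass size2 = 0b11.
// A toy version of the layout (all names here are invented for the sketch):
class ToyEncB {
  field bits<32> Inst;
}
class ToyTwoSameVector<bits<2> size, bits<2> size2> : ToyEncB {
  let Inst{23-22} = size;
  let Inst{21}    = 0b1;
  let Inst{20-19} = size2;   // 0b00 keeps the pre-existing encodings intact
  let Inst{18-17} = 0b00;
}
def ToyIntOp  : ToyTwoSameVector<0b01, 0b00>;  // unchanged integer form
def ToyFP16Op : ToyTwoSameVector<0b11, 0b11>;  // new half-precision form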
multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, size, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>; @@ -4565,16 +4830,16 @@ multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm, // Supports only B and H element sizes. multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64, + def v8i8 : BaseSIMDTwoSameVector<0, U, 0b00, opc, 0b00, V64, asm, ".8b", ".8b", [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>; - def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128, + def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, 0b00, V128, asm, ".16b", ".16b", [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>; - def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64, + def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, 0b00, V64, asm, ".4h", ".4h", [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>; - def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128, + def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, 0b00, V128, asm, ".8h", ".8h", [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>; } @@ -4583,13 +4848,21 @@ multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm, // as an extra opcode bit. multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } @@ -4597,10 +4870,10 @@ multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm, // Supports only S element size. 
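// Illustrative sketch, not taken from the patch: each new half-precision def
// above sits under 'let Predicates = [HasNEON, HasFullFP16]' (or just
// [HasFullFP16] for the scalar FP forms), so it is only selected and
// assembled when the subtarget reports the v8.2-A FP16 extension.  Such a
// predicate is conventionally declared along these lines; the exact strings
// are assumed here, and Predicate/AssemblerPredicate come from
// llvm/Target/Target.td:
def HasFullFP16_sketch : Predicate<"Subtarget->hasFullFP16()">,
                         AssemblerPredicate<"FeatureFullFP16", "fullfp16">;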
multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; } @@ -4608,26 +4881,42 @@ multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm, multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>; } multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4f16 : BaseSIMDTwoSameVector<0, U, {S,1}, opc, 0b11, V64, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>; + def v8f16 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b11, V128, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] + def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, 0b00, V64, asm, ".2s", ".2s", [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>; - def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128, + def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, 0b00, V128, asm, ".4s", ".4s", [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>; - def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128, + def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, 0b00, V128, asm, ".2d", ".2d", [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>; } @@ -4706,10 +4995,10 @@ multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; } -class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, - RegisterOperand regtype, - string asm, string kind, string zero, - ValueType dty, ValueType sty, SDNode OpNode> +class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2, + bits<5> opcode, RegisterOperand regtype, string asm, + string kind, string zero, ValueType dty, + ValueType sty, SDNode OpNode> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero # "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "", @@ -4722,7 +5011,9 @@ class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; - 
let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -4732,54 +5023,74 @@ class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode, // Comparisons support all element sizes, except 1xD. multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm, SDNode OpNode> { - def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64, + def v8i8rz : BaseSIMDCmpTwoVector<0, U, 0b00, 0b00, opc, V64, asm, ".8b", "0", v8i8, v8i8, OpNode>; - def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128, + def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, 0b00, opc, V128, asm, ".16b", "0", v16i8, v16i8, OpNode>; - def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64, + def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, 0b00, opc, V64, asm, ".4h", "0", v4i16, v4i16, OpNode>; - def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128, + def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, 0b00, opc, V128, asm, ".8h", "0", v8i16, v8i16, OpNode>; - def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64, + def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, 0b00, opc, V64, asm, ".2s", "0", v2i32, v2i32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128, + def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, 0b00, opc, V128, asm, ".4s", "0", v4i32, v4i32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128, + def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, 0b00, opc, V128, asm, ".2d", "0", v2i64, v2i64, OpNode>; } -// FP Comparisons support only S and D element sizes. +// FP Comparisons support only S and D element sizes (and H for v8.2a). multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc, string asm, SDNode OpNode> { - def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64, + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16rz : BaseSIMDCmpTwoVector<0, U, {S,1}, 0b11, opc, V64, + asm, ".4h", "0.0", + v4i16, v4f16, OpNode>; + def v8i16rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b11, opc, V128, + asm, ".8h", "0.0", + v8i16, v8f16, OpNode>; + } // Predicates = [HasNEON, HasFullFP16] + def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, 0b00, opc, V64, asm, ".2s", "0.0", v2i32, v2f32, OpNode>; - def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128, + def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, 0b00, opc, V128, asm, ".4s", "0.0", v4i32, v4f32, OpNode>; - def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128, + def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, 0b00, opc, V128, asm, ".2d", "0.0", v2i64, v2f64, OpNode>; - def : InstAlias<asm # " $Vd.2s, $Vn.2s, #0", + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # "\t$Vd.4h, $Vn.4h, #0", + (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias<asm # "\t$Vd.8h, $Vn.8h, #0", + (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } + def : InstAlias<asm # "\t$Vd.2s, $Vn.2s, #0", (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias<asm # " $Vd.4s, $Vn.4s, #0", + def : InstAlias<asm # "\t$Vd.4s, $Vn.4s, #0", (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias<asm # " $Vd.2d, $Vn.2d, #0", + def : InstAlias<asm # "\t$Vd.2d, $Vn.2d, #0", (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias<asm # ".2s $Vd, $Vn, #0", + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # ".4h\t$Vd, $Vn, #0", + (!cast<Instruction>(NAME # v4i16rz) V64:$Vd, V64:$Vn), 0>; + def : InstAlias<asm # 
".8h\t$Vd, $Vn, #0", + (!cast<Instruction>(NAME # v8i16rz) V128:$Vd, V128:$Vn), 0>; + } + def : InstAlias<asm # ".2s\t$Vd, $Vn, #0", (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>; - def : InstAlias<asm # ".4s $Vd, $Vn, #0", + def : InstAlias<asm # ".4s\t$Vd, $Vn, #0", (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>; - def : InstAlias<asm # ".2d $Vd, $Vn, #0", + def : InstAlias<asm # ".2d\t$Vd, $Vn, #0", (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>; } @@ -5325,7 +5636,7 @@ multiclass SIMDZipVector<bits<3>opc, string asm, //---------------------------------------------------------------------------- let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in -class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode, +class BaseSIMDThreeScalar<bit U, bits<3> size, bits<5> opcode, RegisterClass regtype, string asm, list<dag> pattern> : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm, @@ -5337,8 +5648,7 @@ class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode, let Inst{31-30} = 0b01; let Inst{29} = U; let Inst{28-24} = 0b11110; - let Inst{23-22} = size; - let Inst{21} = 1; + let Inst{23-21} = size; let Inst{20-16} = Rm; let Inst{15-11} = opcode; let Inst{10} = 1; @@ -5369,17 +5679,17 @@ class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode, multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm, + def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm, [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>; } multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm, + def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm, [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>; - def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, []>; - def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>; - def v1i8 : BaseSIMDThreeScalar<U, 0b00, opc, FPR8 , asm, []>; + def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, []>; + def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>; + def v1i8 : BaseSIMDThreeScalar<U, 0b001, opc, FPR8 , asm, []>; def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>; @@ -5389,9 +5699,9 @@ multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm, multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i32 : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, + def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>; - def v1i16 : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>; + def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>; } multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm, @@ -5404,26 +5714,34 @@ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm, asm, []>; } -multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm, + def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; - def #NAME#32 : BaseSIMDThreeScalar<U, 
{S,0}, opc, FPR32, asm, + def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, + [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>; + } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>; } -multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm, + def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm, [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>; - def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm, + def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm, [(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm, + []>; + } // Predicates = [HasNEON, HasFullFP16] } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), @@ -5482,7 +5800,7 @@ multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm, //---------------------------------------------------------------------------- let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode, +class BaseSIMDTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode, RegisterClass regtype, RegisterClass regtype2, string asm, list<dag> pat> : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm, @@ -5494,7 +5812,9 @@ class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -5523,7 +5843,7 @@ class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in -class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode, +class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<2> size2, bits<5> opcode, RegisterClass regtype, string asm, string zero> : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn, #" # zero, "", []>, @@ -5534,7 +5854,9 @@ class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode, let Inst{29} = U; let Inst{28-24} = 0b11110; let Inst{23-22} = size; - let Inst{21-17} = 0b10000; + let Inst{21} = 0b1; + let Inst{20-19} = size2; + let Inst{18-17} = 0b00; let Inst{16-12} = opcode; let Inst{11-10} = 0b10; let Inst{9-5} = Rn; @@ -5556,21 +5878,28 @@ class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm> multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, opc, FPR64, asm, "0">; + def v1i64rz : BaseSIMDCmpTwoScalar<U, 0b11, 0b00, opc, FPR64, asm, "0">; def : Pat<(v1i64 (OpNode FPR64:$Rn)), (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>; } -multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDFPCmpTwoScalar<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64rz : BaseSIMDCmpTwoScalar<U, 
{S,1}, opc, FPR64, asm, "0.0">; - def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, opc, FPR32, asm, "0.0">; + def v1i64rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b00, opc, FPR64, asm, "0.0">; + def v1i32rz : BaseSIMDCmpTwoScalar<U, {S,0}, 0b00, opc, FPR32, asm, "0.0">; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16rz : BaseSIMDCmpTwoScalar<U, {S,1}, 0b11, opc, FPR16, asm, "0.0">; + } - def : InstAlias<asm # " $Rd, $Rn, #0", + def : InstAlias<asm # "\t$Rd, $Rn, #0", (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>; - def : InstAlias<asm # " $Rd, $Rn, #0", + def : InstAlias<asm # "\t$Rd, $Rn, #0", (!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>; + let Predicates = [HasNEON, HasFullFP16] in { + def : InstAlias<asm # "\t$Rd, $Rn, #0", + (!cast<Instruction>(NAME # v1i16rz) FPR16:$Rd, FPR16:$Rn), 0>; + } def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))), (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>; @@ -5578,35 +5907,42 @@ multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm, multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { - def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm, + def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm, [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>; def : Pat<(i64 (OpNode (i64 FPR64:$Rn))), (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>; } -multiclass SIMDTwoScalarSD<bit U, bit S, bits<5> opc, string asm> { - def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,[]>; - def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,[]>; +multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> { + def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,[]>; + def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,[]>; + let Predicates = [HasNEON, HasFullFP16] in { + def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,[]>; + } } -multiclass SIMDTwoScalarCVTSD<bit U, bit S, bits<5> opc, string asm, +multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm, SDPatternOperator OpNode> { - def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm, + def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm, [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>; - def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm, + def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm, [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>; + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm, + [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>; + } } multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { - def v1i64 : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm, + def v1i64 : BaseSIMDTwoScalar<U, 0b11, 0b00, opc, FPR64, FPR64, asm, [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>; - def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR32, asm, + def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR32, asm, [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>; - def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR16, asm, []>; - def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR8 , asm, []>; + def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR16, asm, []>; + def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR8 , asm, []>; } def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))), @@ 
-5633,10 +5969,10 @@ multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm, let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode = null_frag> { - def v1i32 : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR64, asm, + def v1i32 : BaseSIMDTwoScalar<U, 0b10, 0b00, opc, FPR32, FPR64, asm, [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>; - def v1i16 : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR32, asm, []>; - def v1i8 : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR16, asm, []>; + def v1i16 : BaseSIMDTwoScalar<U, 0b01, 0b00, opc, FPR16, FPR32, asm, []>; + def v1i8 : BaseSIMDTwoScalar<U, 0b00, 0b00, opc, FPR8 , FPR16, asm, []>; } //---------------------------------------------------------------------------- @@ -5668,10 +6004,14 @@ multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> { asm, ".2d">; } -multiclass SIMDPairwiseScalarSD<bit U, bit S, bits<5> opc, string asm> { - def v2i32p : BaseSIMDPairwiseScalar<U, {S,0}, opc, FPR32Op, V64, +multiclass SIMDFPPairwiseScalar<bit S, bits<5> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v2i16p : BaseSIMDPairwiseScalar<0, {S,0}, opc, FPR16Op, V64, + asm, ".2h">; + } + def v2i32p : BaseSIMDPairwiseScalar<1, {S,0}, opc, FPR32Op, V64, asm, ".2s">; - def v2i64p : BaseSIMDPairwiseScalar<U, {S,1}, opc, FPR64Op, V128, + def v2i64p : BaseSIMDPairwiseScalar<1, {S,1}, opc, FPR64Op, V128, asm, ".2d">; } @@ -5727,8 +6067,16 @@ multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> { asm, ".4s", []>; } -multiclass SIMDAcrossLanesS<bits<5> opcode, bit sz1, string asm, +multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm, Intrinsic intOp> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64, + asm, ".4h", + [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>; + def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128, + asm, ".8h", + [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>; + } // Predicates = [HasNEON, HasFullFP16] def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128, asm, ".4s", [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>; @@ -5925,7 +6273,7 @@ class SIMDInsMainMovAlias<string size, Instruction inst, class SIMDInsElementMovAlias<string size, Instruction inst, Operand idxtype> : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" # - # "|" # size #" $dst$idx, $src$idx2}", + # "|" # size #"\t$dst$idx, $src$idx2}", (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>; @@ -6215,7 +6563,7 @@ multiclass SIMDScalarCPY<string asm> { // AdvSIMD modified immediate instructions //---------------------------------------------------------------------------- -class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops, +class BaseSIMDModifiedImm<bit Q, bit op, bit op2, dag oops, dag iops, string asm, string op_string, string cstr, list<dag> pattern> : I<oops, iops, asm, op_string, cstr, pattern>, @@ -6227,16 +6575,17 @@ class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops, let Inst{29} = op; let Inst{28-19} = 0b0111100000; let Inst{18-16} = imm8{7-5}; - let Inst{11-10} = 0b01; + let Inst{11} = op2; + let Inst{10} = 1; let Inst{9-5} = imm8{4-0}; let Inst{4-0} = Rd; } -class BaseSIMDModifiedImmVector<bit Q, bit op, RegisterOperand vectype, +class BaseSIMDModifiedImmVector<bit Q, bit op, bit op2, RegisterOperand vectype, Operand immtype, dag opt_shift_iop, string opt_shift, string asm, string kind, list<dag> pattern> 
- : BaseSIMDModifiedImm<Q, op, (outs vectype:$Rd), + : BaseSIMDModifiedImm<Q, op, op2, (outs vectype:$Rd), !con((ins immtype:$imm8), opt_shift_iop), asm, "{\t$Rd" # kind # ", $imm8" # opt_shift # "|" # kind # "\t$Rd, $imm8" # opt_shift # "}", @@ -6248,7 +6597,7 @@ class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype, Operand immtype, dag opt_shift_iop, string opt_shift, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImm<Q, op, (outs vectype:$dst), + : BaseSIMDModifiedImm<Q, op, 0, (outs vectype:$dst), !con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop), asm, "{\t$Rd" # kind # ", $imm8" # opt_shift # "|" # kind # "\t$Rd, $imm8" # opt_shift # "}", @@ -6259,7 +6608,7 @@ class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype, class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12, RegisterOperand vectype, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255, + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, (ins logical_vec_shift:$shift), "$shift", asm, kind, pattern> { bits<2> shift; @@ -6284,7 +6633,7 @@ class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12, class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12, RegisterOperand vectype, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255, + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, (ins logical_vec_hw_shift:$shift), "$shift", asm, kind, pattern> { bits<2> shift; @@ -6349,7 +6698,7 @@ multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode, class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode, RegisterOperand vectype, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255, + : BaseSIMDModifiedImmVector<Q, op, 0, vectype, imm0_255, (ins move_vec_shift:$shift), "$shift", asm, kind, pattern> { bits<1> shift; @@ -6357,18 +6706,18 @@ class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode, let Inst{12} = shift; } -class SIMDModifiedImmVectorNoShift<bit Q, bit op, bits<4> cmode, +class SIMDModifiedImmVectorNoShift<bit Q, bit op, bit op2, bits<4> cmode, RegisterOperand vectype, Operand imm_type, string asm, string kind, list<dag> pattern> - : BaseSIMDModifiedImmVector<Q, op, vectype, imm_type, (ins), "", + : BaseSIMDModifiedImmVector<Q, op, op2, vectype, imm_type, (ins), "", asm, kind, pattern> { let Inst{15-12} = cmode; } class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm, list<dag> pattern> - : BaseSIMDModifiedImm<Q, op, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm, + : BaseSIMDModifiedImm<Q, op, 0, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm, "\t$Rd, $imm8", "", pattern> { let Inst{15-12} = cmode; let DecoderMethod = "DecodeModImmInstruction"; @@ -6438,8 +6787,36 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc, let Inst{4-0} = Rd; } -multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm, - SDPatternOperator OpNode> { +multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm, + SDPatternOperator OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b00, opc, + V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4f16 V64:$Rd), + (OpNode (v4f16 V64:$Rn), + (v4f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + 
+ def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8f16 V128:$Rd), + (OpNode (v8f16 V128:$Rn), + (v8f16 (AArch64duplane16 (v8f16 V128_lo:$Rm), VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, @@ -6476,6 +6853,21 @@ multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm, let Inst{21} = 0; } + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", + [(set (f16 FPR16Op:$Rd), + (OpNode (f16 FPR16Op:$Rn), + (f16 (vector_extract (v8f16 V128_lo:$Rm), + VectorIndexH:$idx))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, asm, ".s", "", "", ".s", @@ -6501,7 +6893,7 @@ multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm, } } -multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> { +multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> { // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar. def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (AArch64duplane32 (v4f32 V128:$Rm), @@ -6553,7 +6945,28 @@ multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> { V128:$Rm, VectorIndexD:$idx)>; } -multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> { +multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b00, opc, V64, V64, + V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b00, opc, + V128, V128, + V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64, V128, VectorIndexS, asm, ".2s", ".2s", ".2s", ".s", []> { @@ -6580,6 +6993,16 @@ multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> { let Inst{21} = 0; } + let Predicates = [HasNEON, HasFullFP16] in { + def v1i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b00, opc, + FPR16Op, FPR16Op, V128_lo, VectorIndexH, + asm, ".h", "", "", ".h", []> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + } // Predicates = [HasNEON, HasFullFP16] def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, FPR32Op, FPR32Op, V128, VectorIndexS, @@ -7117,7 +7540,13 @@ class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm, } -multiclass SIMDScalarRShiftSD<bit U, bits<5> opc, string asm> { +multiclass SIMDFPScalarRShift<bit U, bits<5> opc, string asm> { + let Predicates = [HasNEON, HasFullFP16] in { + def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?}, + FPR16, FPR16, vecshiftR16, asm, []> { + let Inst{19-16} = imm{3-0}; + } + } // Predicates = [HasNEON, HasFullFP16] def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?}, FPR32, FPR32, 
vecshiftR32, asm, []> { let Inst{20-16} = imm{4-0}; @@ -7297,6 +7726,23 @@ class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm, multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm, Intrinsic OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4i16 V64:$Rd), (OpNode (v4f16 V64:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8i16 V128:$Rd), (OpNode (v8f16 V128:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + } // Predicates = [HasNEON, HasFullFP16] def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", @@ -7322,8 +7768,26 @@ multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm, } } -multiclass SIMDVectorRShiftSDToFP<bit U, bits<5> opc, string asm, +multiclass SIMDVectorRShiftToFP<bit U, bits<5> opc, string asm, Intrinsic OpNode> { + let Predicates = [HasNEON, HasFullFP16] in { + def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?}, + V64, V64, vecshiftR16, + asm, ".4h", ".4h", + [(set (v4f16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + + def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?}, + V128, V128, vecshiftR16, + asm, ".8h", ".8h", + [(set (v8f16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (i32 imm:$imm)))]> { + bits<4> imm; + let Inst{19-16} = imm; + } + } // Predicates = [HasNEON, HasFullFP16] + def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?}, V64, V64, vecshiftR32, asm, ".2s", ".2s", @@ -8604,9 +9068,8 @@ let Predicates = [HasNEON, HasV8_1a] in { class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode, RegisterOperand regtype, string asm, string kind, list<dag> pattern> - : BaseSIMDThreeSameVectorTied<Q, U, size, opcode, regtype, asm, kind, + : BaseSIMDThreeSameVectorTied<Q, U, {size,0}, opcode, regtype, asm, kind, pattern> { - let Inst{21}=0; } multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm, SDPatternOperator Accum> { @@ -9041,6 +9504,7 @@ def : TokenAlias<".8H", ".8h">; def : TokenAlias<".4S", ".4s">; def : TokenAlias<".2D", ".2d">; def : TokenAlias<".1Q", ".1q">; +def : TokenAlias<".2H", ".2h">; def : TokenAlias<".B", ".b">; def : TokenAlias<".H", ".h">; def : TokenAlias<".S", ".s">; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index c0b3f2c..3ef3c8b 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "AArch64InstrInfo.h" -#include "AArch64MachineCombinerPattern.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -533,6 +532,14 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, CC); } +/// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. 
+static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { + uint64_t Imm = MI->getOperand(1).getImm(); + uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); +} + // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { @@ -573,6 +580,12 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ORRWrr: case AArch64::ORRXrr: return true; + // If MOVi32imm or MOVi64imm can be expanded into ORRWri or + // ORRXri, it is as cheap as MOV + case AArch64::MOVi32imm: + return canBeExpandedToORR(MI, 32); + case AArch64::MOVi64imm: + return canBeExpandedToORR(MI, 64); } llvm_unreachable("Unknown opcode to check as cheap as a move!"); @@ -1379,42 +1392,34 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( Width = 1; Scale = 1; break; + case AArch64::LDRQui: + case AArch64::STRQui: + Scale = Width = 16; + break; case AArch64::LDRXui: + case AArch64::LDRDui: case AArch64::STRXui: + case AArch64::STRDui: Scale = Width = 8; break; case AArch64::LDRWui: + case AArch64::LDRSui: case AArch64::STRWui: + case AArch64::STRSui: Scale = Width = 4; break; - case AArch64::LDRBui: - case AArch64::STRBui: - Scale = Width = 1; - break; case AArch64::LDRHui: + case AArch64::LDRHHui: case AArch64::STRHui: + case AArch64::STRHHui: Scale = Width = 2; break; - case AArch64::LDRSui: - case AArch64::STRSui: - Scale = Width = 4; - break; - case AArch64::LDRDui: - case AArch64::STRDui: - Scale = Width = 8; - break; - case AArch64::LDRQui: - case AArch64::STRQui: - Scale = Width = 16; - break; + case AArch64::LDRBui: case AArch64::LDRBBui: + case AArch64::STRBui: case AArch64::STRBBui: Scale = Width = 1; break; - case AArch64::LDRHHui: - case AArch64::STRHHui: - Scale = Width = 2; - break; }; BaseReg = LdSt->getOperand(1).getReg(); @@ -1445,23 +1450,43 @@ bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, MachineInstr *Second) const { - // Cyclone can fuse CMN, CMP followed by Bcc. - - // FIXME: B0 can also fuse: - // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ. - if (Second->getOpcode() != AArch64::Bcc) - return false; - switch (First->getOpcode()) { - default: - return false; - case AArch64::SUBSWri: - case AArch64::ADDSWri: - case AArch64::ANDSWri: - case AArch64::SUBSXri: - case AArch64::ADDSXri: - case AArch64::ANDSXri: - return true; + if (Subtarget.isCyclone()) { + // Cyclone can fuse CMN, CMP, TST followed by Bcc. + unsigned SecondOpcode = Second->getOpcode(); + if (SecondOpcode == AArch64::Bcc) { + switch (First->getOpcode()) { + default: + return false; + case AArch64::SUBSWri: + case AArch64::ADDSWri: + case AArch64::ANDSWri: + case AArch64::SUBSXri: + case AArch64::ADDSXri: + case AArch64::ANDSXri: + return true; + } + } + // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ. 
+ if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || + SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { + switch (First->getOpcode()) { + default: + return false; + case AArch64::ADDWri: + case AArch64::ADDXri: + case AArch64::ANDWri: + case AArch64::ANDXri: + case AArch64::EORWri: + case AArch64::EORXri: + case AArch64::ORRWri: + case AArch64::ORRXri: + case AArch64::SUBWri: + case AArch64::SUBXri: + return true; + } + } } + return false; } MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( @@ -1814,7 +1839,7 @@ void AArch64InstrInfo::storeRegToStackSlot( MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); unsigned Opc = 0; @@ -1911,7 +1936,7 @@ void AArch64InstrInfo::loadRegFromStackSlot( MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI)); + MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI); MachineMemOperand *MMO = MF.getMachineMemOperand( PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); @@ -2226,11 +2251,19 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, case AArch64::LDPDi: case AArch64::STPXi: case AArch64::STPDi: + case AArch64::LDNPXi: + case AArch64::LDNPDi: + case AArch64::STNPXi: + case AArch64::STNPDi: + ImmIdx = 3; IsSigned = true; Scale = 8; break; case AArch64::LDPQi: case AArch64::STPQi: + case AArch64::LDNPQi: + case AArch64::STNPQi: + ImmIdx = 3; IsSigned = true; Scale = 16; break; @@ -2238,6 +2271,11 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset, case AArch64::LDPSi: case AArch64::STPWi: case AArch64::STPSi: + case AArch64::LDNPWi: + case AArch64::LDNPSi: + case AArch64::STNPWi: + case AArch64::STNPSi: + ImmIdx = 3; IsSigned = true; Scale = 4; break; @@ -2457,7 +2495,7 @@ static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, bool AArch64InstrInfo::getMachineCombinerPatterns( MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const { + SmallVectorImpl<MachineCombinerPattern> &Patterns) const { unsigned Opc = Root.getOpcode(); MachineBasicBlock &MBB = *Root.getParent(); bool Found = false; @@ -2485,76 +2523,76 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( "ADDWrr does not have register operands"); if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDW_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDW_OP2); + Patterns.push_back(MachineCombinerPattern::MULADDW_OP2); Found = true; } break; case AArch64::ADDXrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDX_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDX_OP2); + 
Patterns.push_back(MachineCombinerPattern::MULADDX_OP2); Found = true; } break; case AArch64::SUBWrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBW_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBW_OP2); + Patterns.push_back(MachineCombinerPattern::MULSUBW_OP2); Found = true; } break; case AArch64::SUBXrr: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBX_OP1); Found = true; } if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBX_OP2); + Patterns.push_back(MachineCombinerPattern::MULSUBX_OP2); Found = true; } break; case AArch64::ADDWri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDWI_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDWI_OP1); Found = true; } break; case AArch64::ADDXri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULADDXI_OP1); + Patterns.push_back(MachineCombinerPattern::MULADDXI_OP1); Found = true; } break; case AArch64::SUBWri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, AArch64::WZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBWI_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBWI_OP1); Found = true; } break; case AArch64::SUBXri: if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, AArch64::XZR)) { - Patterns.push_back(MachineCombinerPattern::MC_MULSUBXI_OP1); + Patterns.push_back(MachineCombinerPattern::MULSUBXI_OP1); Found = true; } break; @@ -2661,7 +2699,7 @@ static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, /// this function generates the instructions that could replace the /// original code sequence void AArch64InstrInfo::genAlternativeCodeSequence( - MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern, + MachineInstr &Root, MachineCombinerPattern Pattern, SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const { @@ -2677,13 +2715,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence( default: // signal error. 
break; - case MachineCombinerPattern::MC_MULADDW_OP1: - case MachineCombinerPattern::MC_MULADDX_OP1: + case MachineCombinerPattern::MULADDW_OP1: + case MachineCombinerPattern::MULADDX_OP1: // MUL I=A,B,0 // ADD R,I,C // ==> MADD R,A,B,C // --- Create(MADD); - if (Pattern == MachineCombinerPattern::MC_MULADDW_OP1) { + if (Pattern == MachineCombinerPattern::MULADDW_OP1) { Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { @@ -2692,13 +2730,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; - case MachineCombinerPattern::MC_MULADDW_OP2: - case MachineCombinerPattern::MC_MULADDX_OP2: + case MachineCombinerPattern::MULADDW_OP2: + case MachineCombinerPattern::MULADDX_OP2: // MUL I=A,B,0 // ADD R,C,I // ==> MADD R,A,B,C // --- Create(MADD); - if (Pattern == MachineCombinerPattern::MC_MULADDW_OP2) { + if (Pattern == MachineCombinerPattern::MULADDW_OP2) { Opc = AArch64::MADDWrrr; RC = &AArch64::GPR32RegClass; } else { @@ -2707,8 +2745,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; - case MachineCombinerPattern::MC_MULADDWI_OP1: - case MachineCombinerPattern::MC_MULADDXI_OP1: { + case MachineCombinerPattern::MULADDWI_OP1: + case MachineCombinerPattern::MULADDXI_OP1: { // MUL I=A,B,0 // ADD R,I,Imm // ==> ORR V, ZR, Imm @@ -2716,7 +2754,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // --- Create(MADD); const TargetRegisterClass *OrrRC; unsigned BitSize, OrrOpc, ZeroReg; - if (Pattern == MachineCombinerPattern::MC_MULADDWI_OP1) { + if (Pattern == MachineCombinerPattern::MULADDWI_OP1) { OrrOpc = AArch64::ORRWri; OrrRC = &AArch64::GPR32spRegClass; BitSize = 32; @@ -2751,8 +2789,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; } - case MachineCombinerPattern::MC_MULSUBW_OP1: - case MachineCombinerPattern::MC_MULSUBX_OP1: { + case MachineCombinerPattern::MULSUBW_OP1: + case MachineCombinerPattern::MULSUBX_OP1: { // MUL I=A,B,0 // SUB R,I, C // ==> SUB V, 0, C @@ -2760,7 +2798,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // --- Create(MADD); const TargetRegisterClass *SubRC; unsigned SubOpc, ZeroReg; - if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP1) { + if (Pattern == MachineCombinerPattern::MULSUBW_OP1) { SubOpc = AArch64::SUBWrr; SubRC = &AArch64::GPR32spRegClass; ZeroReg = AArch64::WZR; @@ -2784,13 +2822,13 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC); break; } - case MachineCombinerPattern::MC_MULSUBW_OP2: - case MachineCombinerPattern::MC_MULSUBX_OP2: + case MachineCombinerPattern::MULSUBW_OP2: + case MachineCombinerPattern::MULSUBX_OP2: // MUL I=A,B,0 // SUB R,C,I // ==> MSUB R,A,B,C (computes C - A*B) // --- Create(MSUB); - if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP2) { + if (Pattern == MachineCombinerPattern::MULSUBW_OP2) { Opc = AArch64::MSUBWrrr; RC = &AArch64::GPR32RegClass; } else { @@ -2799,8 +2837,8 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; - case MachineCombinerPattern::MC_MULSUBWI_OP1: - case MachineCombinerPattern::MC_MULSUBXI_OP1: { + case MachineCombinerPattern::MULSUBWI_OP1: + case MachineCombinerPattern::MULSUBXI_OP1: { // MUL I=A,B,0 // SUB R,I, Imm // ==> ORR V, ZR, -Imm @@ -2808,7 +2846,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( // --- Create(MADD); const TargetRegisterClass *OrrRC; unsigned BitSize, OrrOpc, 
ZeroReg; - if (Pattern == MachineCombinerPattern::MC_MULSUBWI_OP1) { + if (Pattern == MachineCombinerPattern::MULSUBWI_OP1) { OrrOpc = AArch64::ORRWri; OrrRC = &AArch64::GPR32spRegClass; BitSize = 32; @@ -2944,3 +2982,34 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { MI->eraseFromParent(); return true; } + +std::pair<unsigned, unsigned> +AArch64InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + const unsigned Mask = AArch64II::MO_FRAGMENT; + return std::make_pair(TF & Mask, TF & ~Mask); +} + +ArrayRef<std::pair<unsigned, const char *>> +AArch64InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace AArch64II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_PAGE, "aarch64-page"}, + {MO_PAGEOFF, "aarch64-pageoff"}, + {MO_G3, "aarch64-g3"}, + {MO_G2, "aarch64-g2"}, + {MO_G1, "aarch64-g1"}, + {MO_G0, "aarch64-g0"}, + {MO_HI12, "aarch64-hi12"}}; + return makeArrayRef(TargetFlags); +} + +ArrayRef<std::pair<unsigned, const char *>> +AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { + using namespace AArch64II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_GOT, "aarch64-got"}, + {MO_NC, "aarch64-nc"}, + {MO_TLS, "aarch64-tls"}, + {MO_CONSTPOOL, "aarch64-constant-pool"}}; + return makeArrayRef(TargetFlags); +} diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 68c2a28..ae02822 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -167,13 +167,13 @@ public: /// for an instruction chain ending in <Root>. All potential patterns are /// listed in the <Patterns> array. bool getMachineCombinerPatterns(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) + SmallVectorImpl<MachineCombinerPattern> &Patterns) const override; /// When getMachineCombinerPatterns() finds patterns, this function generates /// the instructions that could replace the original code sequence void genAlternativeCodeSequence( - MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern, + MachineInstr &Root, MachineCombinerPattern Pattern, SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; @@ -181,6 +181,14 @@ public: bool useMachineCombiner() const override; bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableBitmaskMachineOperandTargetFlags() const override; + private: void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, diff --git a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td index fa1a46a..d02bc9f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -16,6 +16,8 @@ // def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; +def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, + AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, 
AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, @@ -24,6 +26,12 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto", "crypto">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; +def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; +def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, + AssemblerPredicate<"FeatureFullFP16", "fullfp16">; +def HasSPE : Predicate<"Subtarget->hasSPE()">, + AssemblerPredicate<"FeatureSPE", "spe">; + def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsCyclone : Predicate<"Subtarget->isCyclone()">; @@ -66,6 +74,20 @@ def SDT_AArch64CSel : SDTypeProfile<1, 4, SDTCisSameAs<0, 2>, SDTCisInt<3>, SDTCisVT<4, i32>]>; +def SDT_AArch64CCMP : SDTypeProfile<1, 5, + [SDTCisVT<0, i32>, + SDTCisInt<1>, + SDTCisSameAs<1, 2>, + SDTCisInt<3>, + SDTCisInt<4>, + SDTCisVT<5, i32>]>; +def SDT_AArch64FCCMP : SDTypeProfile<1, 5, + [SDTCisVT<0, i32>, + SDTCisFP<1>, + SDTCisSameAs<1, 2>, + SDTCisInt<3>, + SDTCisInt<4>, + SDTCisVT<5, i32>]>; def SDT_AArch64FCmp : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>; @@ -160,13 +182,14 @@ def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut, def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; +def AArch64ccmp : SDNode<"AArch64ISD::CCMP", SDT_AArch64CCMP>; +def AArch64ccmn : SDNode<"AArch64ISD::CCMN", SDT_AArch64CCMP>; +def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>; + def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; -def AArch64fmax : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>; -def AArch64fmin : SDNode<"AArch64ISD::FMIN", SDTFPBinOp>; - def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>; def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; @@ -361,6 +384,9 @@ def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; +// v8.2a Statistical Profiling extension +def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>; + // As far as LLVM is concerned this writes to the system's exclusive monitors. let mayLoad = 1, mayStore = 1 in def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">; @@ -383,12 +409,17 @@ def : InstAlias<"isb", (ISB 0xf)>; def MRS : MRSI; def MSR : MSRI; -def MSRpstate: MSRpstateI; +def MSRpstateImm1 : MSRpstateImm0_1; +def MSRpstateImm4 : MSRpstateImm0_15; // The thread pointer (on Linux, at least, where this has been implemented) is // TPIDR_EL0. def : Pat<(AArch64threadpointer), (MRS 0xde82)>; +// The cycle counter PMC register is PMCCNTR_EL0. 
+let Predicates = [HasPerfMon] in +def : Pat<(readcyclecounter), (MRS 0xdce8)>; + // Generic system instructions def SYSxt : SystemXtI<0, "sys">; def SYSLxt : SystemLXtI<1, "sysl">; @@ -595,10 +626,12 @@ def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; +let AddedComplexity = 1 in { def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3), (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>; def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3), (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>; +} // Because of the immediate format for add/sub-imm instructions, the // expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). @@ -823,7 +856,7 @@ defm AND : LogicalReg<0b00, 0, "and", and>; defm BIC : LogicalReg<0b00, 1, "bic", BinOpFrag<(and node:$LHS, (not node:$RHS))>>; defm EON : LogicalReg<0b10, 1, "eon", - BinOpFrag<(xor node:$LHS, (not node:$RHS))>>; + BinOpFrag<(not (xor node:$LHS, node:$RHS))>>; defm EOR : LogicalReg<0b10, 0, "eor", xor>; defm ORN : LogicalReg<0b01, 1, "orn", BinOpFrag<(or node:$LHS, (not node:$RHS))>>; @@ -1020,13 +1053,10 @@ def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; //===----------------------------------------------------------------------===// -// Conditionally set flags instructions. +// Conditional comparison instructions. //===----------------------------------------------------------------------===// -defm CCMN : CondSetFlagsImm<0, "ccmn">; -defm CCMP : CondSetFlagsImm<1, "ccmp">; - -defm CCMN : CondSetFlagsReg<0, "ccmn">; -defm CCMP : CondSetFlagsReg<1, "ccmp">; +defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>; +defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>; //===----------------------------------------------------------------------===// // Conditional select instructions. @@ -2421,6 +2451,26 @@ defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvt defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; } +multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> { + def : Pat<(i32 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # UWSr) f32:$Rn)>; + def : Pat<(i64 (to_int (round f32:$Rn))), + (!cast<Instruction>(INST # UXSr) f32:$Rn)>; + def : Pat<(i32 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # UWDr) f64:$Rn)>; + def : Pat<(i64 (to_int (round f64:$Rn))), + (!cast<Instruction>(INST # UXDr) f64:$Rn)>; +} + +defm : FPToIntegerPats<fp_to_sint, fceil, "FCVTPS">; +defm : FPToIntegerPats<fp_to_uint, fceil, "FCVTPU">; +defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">; +defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">; +defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">; +defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">; +defm : FPToIntegerPats<fp_to_sint, frnd, "FCVTAS">; +defm : FPToIntegerPats<fp_to_uint, frnd, "FCVTAU">; + //===----------------------------------------------------------------------===// // Scaled integer to floating point conversion instructions. 
//===----------------------------------------------------------------------===// @@ -2466,14 +2516,7 @@ defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))), (FRINTNDr FPR64:$Rn)>; -// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior -// in the C spec. Setting hasSideEffects ensures it is not DCE'd. -// <rdar://problem/13715968> -// TODO: We should really model the FPSR flags correctly. This is really ugly. -let hasSideEffects = 1 in { defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; -} - defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; let SchedRW = [WriteFDiv] in { @@ -2488,23 +2531,23 @@ defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; let SchedRW = [WriteFDiv] in { defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; } -defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>; -defm FMAX : TwoOperandFPData<0b0100, "fmax", AArch64fmax>; -defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>; -defm FMIN : TwoOperandFPData<0b0101, "fmin", AArch64fmin>; +defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>; +defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaxnan>; +defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>; +defm FMIN : TwoOperandFPData<0b0101, "fmin", fminnan>; let SchedRW = [WriteFMul] in { defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; } defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; -def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fmaxnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fminnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMINDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>; -def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), +def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>; //===----------------------------------------------------------------------===// @@ -2556,7 +2599,7 @@ defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; //===----------------------------------------------------------------------===// defm FCCMPE : FPCondComparison<1, "fccmpe">; -defm FCCMP : FPCondComparison<0, "fccmp">; +defm FCCMP : FPCondComparison<0, "fccmp", AArch64fccmp>; //===----------------------------------------------------------------------===// // Floating point conditional select instruction. @@ -2589,6 +2632,40 @@ defm FMOV : FPMoveImmediate<"fmov">; // Advanced SIMD two vector instructions. //===----------------------------------------------------------------------===// +defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", + int_aarch64_neon_uabd>; +// Match UABDL in log2-shuffle patterns. 
+def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), + (v8i16 (add (sub (zext (v8i8 V64:$opA)), + (zext (v8i8 V64:$opB))), + (AArch64vashr v8i16:$src, (i32 15))))), + (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; +def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))), + (v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)), + (zext (extract_high_v16i8 V128:$opB))), + (AArch64vashr v8i16:$src, (i32 15))))), + (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; +def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), + (v4i32 (add (sub (zext (v4i16 V64:$opA)), + (zext (v4i16 V64:$opB))), + (AArch64vashr v4i32:$src, (i32 31))))), + (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; +def : Pat<(xor (v4i32 (AArch64vashr v4i32:$src, (i32 31))), + (v4i32 (add (sub (zext (extract_high_v8i16 V128:$opA)), + (zext (extract_high_v8i16 V128:$opB))), + (AArch64vashr v4i32:$src, (i32 31))))), + (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>; +def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), + (v2i64 (add (sub (zext (v2i32 V64:$opA)), + (zext (v2i32 V64:$opB))), + (AArch64vashr v2i64:$src, (i32 63))))), + (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>; +def : Pat<(xor (v2i64 (AArch64vashr v2i64:$src, (i32 63))), + (v2i64 (add (sub (zext (extract_high_v4i32 V128:$opA)), + (zext (extract_high_v4i32 V128:$opB))), + (AArch64vashr v2i64:$src, (i32 63))))), + (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>; + defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>; def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))), (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))), @@ -2780,29 +2857,29 @@ defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>; -defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>; -defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>; -defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>; -defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>; -defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>; -defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; -defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; -defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; -defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>; -defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; -defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>; -defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>; -defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>; -defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>; -defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>; -defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>; -defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>; +defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>; +defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>; +defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>; +defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>; +defm FADD : 
SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>; +defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>; +defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>; +defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; +defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>; +defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>; +defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>; +defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>; +defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>; +defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>; +defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>; // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. -defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla", +defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla", TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; -defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls", +defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls", TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; // The following def pats catch the case where the LHS of an FMA is negated. @@ -2816,11 +2893,11 @@ def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; -defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; -defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>; -defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>; -defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; +defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>; +defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>; +defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>; defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", @@ -2833,9 +2910,9 @@ defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>; defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>; defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; -defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>; +defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>; defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>; -defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>; +defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>; defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>; defm SQRDMULH : 
SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>; @@ -2852,9 +2929,9 @@ defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>; defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>; defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; -defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>; +defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>; defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>; -defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>; +defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>; defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; @@ -2879,54 +2956,6 @@ defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; -def : Pat<(v8i8 (smin V64:$Rn, V64:$Rm)), - (SMINv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (smin V64:$Rn, V64:$Rm)), - (SMINv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (smin V64:$Rn, V64:$Rm)), - (SMINv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (smin V128:$Rn, V128:$Rm)), - (SMINv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (smin V128:$Rn, V128:$Rm)), - (SMINv8i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i32 (smin V128:$Rn, V128:$Rm)), - (SMINv4i32 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i8 (smax V64:$Rn, V64:$Rm)), - (SMAXv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (smax V64:$Rn, V64:$Rm)), - (SMAXv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (smax V64:$Rn, V64:$Rm)), - (SMAXv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (smax V128:$Rn, V128:$Rm)), - (SMAXv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (smax V128:$Rn, V128:$Rm)), - (SMAXv8i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i32 (smax V128:$Rn, V128:$Rm)), - (SMAXv4i32 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i8 (umin V64:$Rn, V64:$Rm)), - (UMINv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (umin V64:$Rn, V64:$Rm)), - (UMINv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (umin V64:$Rn, V64:$Rm)), - (UMINv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (umin V128:$Rn, V128:$Rm)), - (UMINv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (umin V128:$Rn, V128:$Rm)), - (UMINv8i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i32 (umin V128:$Rn, V128:$Rm)), - (UMINv4i32 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i8 (umax V64:$Rn, V64:$Rm)), - (UMAXv8i8 V64:$Rn, V64:$Rm)>; -def : Pat<(v4i16 (umax V64:$Rn, V64:$Rm)), - (UMAXv4i16 V64:$Rn, V64:$Rm)>; -def : Pat<(v2i32 (umax V64:$Rn, V64:$Rm)), - (UMAXv2i32 V64:$Rn, V64:$Rm)>; -def : Pat<(v16i8 (umax V128:$Rn, V128:$Rm)), - (UMAXv16i8 V128:$Rn, V128:$Rm)>; -def : Pat<(v8i16 (umax V128:$Rn, V128:$Rm)), - (UMAXv8i16 V128:$Rn, V128:$Rm)>; -def : Pat<(v4i32 (umax V128:$Rn, V128:$Rm)), - (UMAXv4i32 V128:$Rn, V128:$Rm)>; def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; @@ -3052,6 +3081,14 @@ def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # "|cmlt.2d\t$dst, $src1, $src2}", (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmle.4h\t$dst, $src1, $src2}", + (FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmle\t$dst.8h, 
$src1.8h, $src2.8h" # + "|fcmle.8h\t$dst, $src1, $src2}", + (FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # "|fcmle.2s\t$dst, $src1, $src2}", (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3062,6 +3099,14 @@ def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # "|fcmle.2d\t$dst, $src1, $src2}", (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" # + "|fcmlt.4h\t$dst, $src1, $src2}", + (FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" # + "|fcmlt.8h\t$dst, $src1, $src2}", + (FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # "|fcmlt.2s\t$dst, $src1, $src2}", (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3072,6 +3117,14 @@ def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # "|fcmlt.2d\t$dst, $src1, $src2}", (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" # + "|facle.4h\t$dst, $src1, $src2}", + (FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" # + "|facle.8h\t$dst, $src1, $src2}", + (FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # "|facle.2s\t$dst, $src1, $src2}", (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3082,6 +3135,14 @@ def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # "|facle.2d\t$dst, $src1, $src2}", (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; +let Predicates = [HasNEON, HasFullFP16] in { +def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" # + "|faclt.4h\t$dst, $src1, $src2}", + (FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>; +def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" # + "|faclt.8h\t$dst, $src1, $src2}", + (FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>; +} def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # "|faclt.2s\t$dst, $src1, $src2}", (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; @@ -3103,19 +3164,19 @@ defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; -defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>; +defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>; def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FABD64 FPR64:$Rn, FPR64:$Rm)>; -defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", +defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge", int_aarch64_neon_facge>; -defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", +defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt", int_aarch64_neon_facgt>; -defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; -defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; -defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; -defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>; -defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>; -defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>; +defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", 
AArch64fcmeq>; +defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>; +defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>; +defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>; +defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>; +defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>; defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; @@ -3198,35 +3259,35 @@ defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>; defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>; defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>; defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>; -defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; -defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>; -defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; -defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>; -defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; -defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">; -defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">; -defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">; -defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">; -defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">; -defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">; -defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">; -defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">; +defm FCMEQ : SIMDFPCmpTwoScalar<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; +defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>; +defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; +defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>; +defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; +defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">; +defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">; +defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">; +defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">; +defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">; +defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">; +defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">; +defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">; def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; -defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">; -defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">; -defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">; -defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">; -defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">; +defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">; +defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">; +defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">; +defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">; +defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">; defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; -defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>; +defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>; defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", 
int_aarch64_neon_sqabs>; defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>; defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>; defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", int_aarch64_neon_suqadd>; -defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>; +defm UCVTF : SIMDFPTwoScalarCVT< 1, 0, 0b11101, "ucvtf", AArch64uitof>; defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>; defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", int_aarch64_neon_usqadd>; @@ -3390,8 +3451,6 @@ defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", int_aarch64_neon_uabd>; -defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", - int_aarch64_neon_uabd>; defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>; defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", @@ -3449,8 +3508,8 @@ defm : Neon_mulacc_widen_patterns< // Patterns for 64-bit pmull def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm), (PMULLv1i64 V64:$Rn, V64:$Rm)>; -def : Pat<(int_aarch64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)), - (vector_extract (v2i64 V128:$Rm), (i64 1))), +def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)), + (extractelt (v2i64 V128:$Rm), (i64 1))), (PMULLv2i64 V128:$Rn, V128:$Rm)>; // CodeGen patterns for addhn and subhn instructions, which can actually be @@ -3593,11 +3652,11 @@ defm CPY : SIMDScalarCPY<"cpy">; //---------------------------------------------------------------------------- defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; -defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">; -defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">; -defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">; -defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">; -defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">; +defm FADDP : SIMDFPPairwiseScalar<0, 0b01101, "faddp">; +defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">; +defm FMAXP : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">; +defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">; +defm FMINP : SIMDFPPairwiseScalar<1, 0b01111, "fminp">; def : Pat<(v2i64 (AArch64saddv V128:$Rn)), (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (ADDPv2i64p V128:$Rn), dsub)>; def : Pat<(v2i64 (AArch64uaddv V128:$Rn)), @@ -3713,12 +3772,12 @@ defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>; multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP, SDNodeXForm IdxXFORM> { - def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn), + def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v2i64 V128:$Rn), imm:$idx))))), (DUP V128:$Rn, (IdxXFORM imm:$idx))>; - def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn), - imm:$idx))))), + def : Pat<(ResVT (AArch64dup (i32 (trunc (extractelt (v1i64 V64:$Rn), + imm:$idx))))), (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>; } @@ -3747,6 +3806,13 @@ def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16), def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))), (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>; +def : Pat<(sext_inreg (i64 (anyext (i32 
(vector_extract (v16i8 V128:$Rn), + VectorIndexB:$idx)))), i8), + (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>; +def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn), + VectorIndexH:$idx)))), i16), + (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>; + // Extracting i8 or i16 elements will have the zero-extend transformed to // an 'and' mask by type legalization since neither i8 nor i16 are legal types // for AArch64. Match these patterns here since UMOV already zeroes out the high @@ -3784,6 +3850,11 @@ def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (i64 FPR64:$Rn), dsub))>; +def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))), + (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; +def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))), + (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>; + def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), @@ -3949,10 +4020,10 @@ defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">; defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">; defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">; defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">; -defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; -defm FMAXV : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; -defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; -defm FMINV : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>; +defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>; +defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>; +defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>; +defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>; // Patterns for across-vector intrinsics, that have a node equivalent, that // returns a vector (with only the low lane defined) instead of a scalar. 
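As a minimal, purely illustrative sketch (function name and lane index are assumptions, not taken from the patch): extracting a narrow NEON lane and widening it to 64 bits is the kind of source that can produce the sext_inreg(anyext(vector_extract)) DAG shape matched by the new patterns above, which can then select a single SMOVvi8to64.

#include <arm_neon.h>
#include <cstdint>

int64_t lane3_to_i64(int8x16_t v) {
  // vgetq_lane_s8 yields an i8 lane value; widening it to int64_t is the
  // sign-extension the new SMOVvi8to64 pattern is intended to cover.
  return static_cast<int64_t>(vgetq_lane_s8(v, 3));
}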
@@ -4199,15 +4270,23 @@ def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; // AdvSIMD FMOV -def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, +def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8, "fmov", ".2d", [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; -def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, +def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8, "fmov", ".2s", [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; -def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, +def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8, "fmov", ".4s", [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +let Predicates = [HasNEON, HasFullFP16] in { +def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8, + "fmov", ".4h", + [(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; +def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8, + "fmov", ".8h", + [(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; +} // Predicates = [HasNEON, HasFullFP16] // AdvSIMD MOVI @@ -4235,7 +4314,7 @@ def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; // The movi_edit node has the immediate value already encoded, so we use // a plain imm0_255 in the pattern let isReMaterializable = 1, isAsCheapAsAMove = 1 in -def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, +def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, simdimmtype10, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; @@ -4296,10 +4375,10 @@ def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; // Per byte: 8b & 16b -def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, +def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255, "movi", ".8b", [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>; -def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, +def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255, "movi", ".16b", [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>; @@ -4340,8 +4419,8 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s", //---------------------------------------------------------------------------- let hasSideEffects = 0 in { - defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">; - defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">; + defm FMLA : SIMDFPIndexedTied<0, 0b0001, "fmla">; + defm FMLS : SIMDFPIndexedTied<0, 0b0101, "fmls">; } // NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the @@ -4349,18 +4428,18 @@ let hasSideEffects = 0 in { // On the other hand, there are quite a few valid combinatorial options due to // the commutativity of multiplication and the fact that (-x) * y = x * (-y). 
-defm : SIMDFPIndexedSDTiedPatterns<"FMLA", +defm : SIMDFPIndexedTiedPatterns<"FMLA", TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>; -defm : SIMDFPIndexedSDTiedPatterns<"FMLA", +defm : SIMDFPIndexedTiedPatterns<"FMLA", TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >; -defm : SIMDFPIndexedSDTiedPatterns<"FMLS", +defm : SIMDFPIndexedTiedPatterns<"FMLS", TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >; multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> { @@ -4424,7 +4503,9 @@ multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> { (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, V128:$Rm, VectorIndexS:$idx)>; def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), - (vector_extract (v2f32 (fneg V64:$Rm)), + (vector_extract (v4f32 (insert_subvector undef, + (v2f32 (fneg V64:$Rm)), + (i32 0))), VectorIndexS:$idx))), (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; @@ -4442,8 +4523,8 @@ defm : FMLSIndexedAfterNegPatterns< defm : FMLSIndexedAfterNegPatterns< TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; -defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; -defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>; +defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; +defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>; def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), (FMULv2i32_indexed V64:$Rn, @@ -4497,10 +4578,10 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), //---------------------------------------------------------------------------- // AdvSIMD scalar shift instructions //---------------------------------------------------------------------------- -defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; -defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; -defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; -defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; +defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">; +defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">; +defm SCVTF : SIMDFPScalarRShift<0, 0b11100, "scvtf">; +defm UCVTF : SIMDFPScalarRShift<1, 0b11100, "ucvtf">; // Codegen patterns for the above. We don't put these directly on the // instructions because TableGen's type inference can't handle the truth. // Having the same base pattern for fp <--> int totally freaks it out. 
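The operand reordering noted above for the FMLA/FMLS PatFrags (the fma node takes the addend last, while the instruction is accumulator-first) is easy to see in scalar form. A small sketch, illustrative only, with assumed function names:

#include <cmath>

float fmla_like(float acc, float a, float b) {
  // std::fma(a, b, acc) computes a * b + acc: the addend is the last operand
  // of the generic fma node but the first (tied) operand of FMLA, hence the
  // swapped TriOpFrag operands in the patterns above.
  return std::fma(a, b, acc);
}

float fmls_like(float acc, float a, float b) {
  // (-a) * b + acc == a * (-b) + acc, which is why several equivalent
  // negated orderings are listed for the FMLS patterns.
  return std::fma(-a, b, acc);
}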
@@ -4573,7 +4654,7 @@ defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", //---------------------------------------------------------------------------- defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>; defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>; -defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", +defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf", int_aarch64_neon_vcvtfxs2fp>; defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", int_aarch64_neon_rshrn>; @@ -4608,7 +4689,7 @@ defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; -defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", +defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf", int_aarch64_neon_vcvtfxu2fp>; defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", int_aarch64_neon_uqrshrn>; @@ -5133,10 +5214,10 @@ def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>; def : Pat<(i64 (anyext GPR32:$src)), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>; -// When we need to explicitly zero-extend, we use an unsigned bitfield move -// instruction (UBFM) on the enclosing super-reg. +// When we need to explicitly zero-extend, we use a 32-bit MOV instruction and +// then assert the extension has happened. def : Pat<(i64 (zext GPR32:$src)), - (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; + (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>; // To sign extend, we use a signed bitfield move instruction (SBFM) on the // containing super-reg. @@ -5801,6 +5882,21 @@ def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 (REV16v16i8 FPR128:$src))>; } +def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; +def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 0))), + (EXTRACT_SUBREG V128:$Rn, dsub)>; + def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), @@ -5852,6 +5948,45 @@ def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; +// Patterns for nontemporal/no-allocate stores. +// We have to resort to tricks to turn a single-input store into a store pair, +// because there is no single-input nontemporal store, only STNP. 
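A rough illustration of the i64 zero-extension pattern changed above (the function name is an assumption; whether this exact selection happens depends on the surrounding code):

#include <cstdint>

uint64_t zext32(uint32_t x) {
  // A plain 32-to-64-bit zero extension; with the pattern above this can be
  // selected as a 32-bit register move (ORRWrs WZR, x, 0) wrapped in
  // SUBREG_TO_REG, rather than a UBFMXri on the 64-bit super-register.
  return x;
}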
+let Predicates = [IsLE] in { +let AddedComplexity = 15 in { +class NTStore128Pat<ValueType VT> : + Pat<(nontemporalstore (VT FPR128:$Rt), + (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), + (STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub), + (CPYi64 FPR128:$Rt, (i64 1)), + GPR64sp:$Rn, simm7s8:$offset)>; + +def : NTStore128Pat<v2i64>; +def : NTStore128Pat<v4i32>; +def : NTStore128Pat<v8i16>; +def : NTStore128Pat<v16i8>; + +class NTStore64Pat<ValueType VT> : + Pat<(nontemporalstore (VT FPR64:$Rt), + (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), + (STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub), + (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)), + GPR64sp:$Rn, simm7s4:$offset)>; + +// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64? +def : NTStore64Pat<v1f64>; +def : NTStore64Pat<v1i64>; +def : NTStore64Pat<v2i32>; +def : NTStore64Pat<v4i16>; +def : NTStore64Pat<v8i8>; + +def : Pat<(nontemporalstore GPR64:$Rt, + (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), + (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), + (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32), + GPR64sp:$Rn, simm7s4:$offset)>; +} // AddedComplexity=10 +} // Predicates = [IsLE] + // Tail call return handling. These are all compiler pseudo-instructions, // so no encoding information or anything like that. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 82f77a7..566aa2c 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -41,54 +41,85 @@ STATISTIC(NumPostFolded, "Number of post-index updates folded"); STATISTIC(NumPreFolded, "Number of pre-index updates folded"); STATISTIC(NumUnscaledPairCreated, "Number of load/store from unscaled generated"); +STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted"); +STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); +STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", cl::init(20), cl::Hidden); -// Place holder while testing unscaled load/store combining -static cl::opt<bool> EnableAArch64UnscaledMemOp( - "aarch64-unscaled-mem-op", cl::Hidden, - cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true)); +namespace llvm { +void initializeAArch64LoadStoreOptPass(PassRegistry &); +} + +#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass" namespace { + +typedef struct LdStPairFlags { + // If a matching instruction is found, MergeForward is set to true if the + // merge is to remove the first instruction and replace the second with + // a pair-wise insn, and false if the reverse is true. + bool MergeForward; + + // SExtIdx gives the index of the result of the load pair that must be + // extended. The value of SExtIdx assumes that the paired load produces the + // value in this order: (I, returned iterator), i.e., -1 means no value has + // to be extended, 0 means I, and 1 means the returned iterator. 
+ int SExtIdx; + + LdStPairFlags() : MergeForward(false), SExtIdx(-1) {} + + void setMergeForward(bool V = true) { MergeForward = V; } + bool getMergeForward() const { return MergeForward; } + + void setSExtIdx(int V) { SExtIdx = V; } + int getSExtIdx() const { return SExtIdx; } + +} LdStPairFlags; + struct AArch64LoadStoreOpt : public MachineFunctionPass { static char ID; - AArch64LoadStoreOpt() : MachineFunctionPass(ID) {} + AArch64LoadStoreOpt() : MachineFunctionPass(ID) { + initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry()); + } const AArch64InstrInfo *TII; const TargetRegisterInfo *TRI; + const AArch64Subtarget *Subtarget; // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. // Return the matching instruction if one is found, else MBB->end(). - // If a matching instruction is found, MergeForward is set to true if the - // merge is to remove the first instruction and replace the second with - // a pair-wise insn, and false if the reverse is true. - // \p SExtIdx[out] gives the index of the result of the load pair that - // must be extended. The value of SExtIdx assumes that the paired load - // produces the value in this order: (I, returned iterator), i.e., - // -1 means no value has to be extended, 0 means I, and 1 means the - // returned iterator. MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, - bool &MergeForward, int &SExtIdx, + LdStPairFlags &Flags, unsigned Limit); + + // Scan the instructions looking for a store that writes to the address from + // which the current load instruction reads. Return true if one is found. + bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, + MachineBasicBlock::iterator &StoreI); + // Merge the two instructions indicated into a single pair-wise instruction. // If MergeForward is true, erase the first instruction and fold its // operation into the second. If false, the reverse. Return the instruction // following the first instruction (which may change during processing). - // \p SExtIdx index of the result that must be extended for a paired load. - // -1 means none, 0 means I, and 1 means Paired. MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, bool MergeForward, - int SExtIdx); + MachineBasicBlock::iterator Paired, + const LdStPairFlags &Flags); + + // Promote the load that reads directly from the address stored to. + MachineBasicBlock::iterator + promoteLoadFromStore(MachineBasicBlock::iterator LoadI, + MachineBasicBlock::iterator StoreI); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan forwards. MachineBasicBlock::iterator findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, - int Value); + int UnscaledOffset); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using @@ -96,97 +127,177 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { MachineBasicBlock::iterator findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit); - // Merge a pre-index base register update into a ld/st instruction. 
- MachineBasicBlock::iterator - mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); + // Find an instruction that updates the base register of the ld/st + // instruction. + bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI, + unsigned BaseReg, int Offset); - // Merge a post-index base register update into a ld/st instruction. + // Merge a pre- or post-index base register update into a ld/st instruction. MachineBasicBlock::iterator - mergePostIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update); + mergeUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, bool IsPreIdx); + + // Find and merge foldable ldr/str instructions. + bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI); - bool optimizeBlock(MachineBasicBlock &MBB); + // Find and promote load instructions which read directly from store. + bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); + + // Check if converting two narrow loads into a single wider load with + // bitfield extracts could be enabled. + bool enableNarrowLdMerge(MachineFunction &Fn); + + bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt); bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "AArch64 load / store optimization pass"; + return AARCH64_LOAD_STORE_OPT_NAME; } - -private: - int getMemSize(MachineInstr *MemMI); }; char AArch64LoadStoreOpt::ID = 0; } // namespace -static bool isUnscaledLdst(unsigned Opc) { +INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", + AARCH64_LOAD_STORE_OPT_NAME, false, false) + +static bool isUnscaledLdSt(unsigned Opc) { switch (Opc) { default: return false; case AArch64::STURSi: - return true; case AArch64::STURDi: - return true; case AArch64::STURQi: - return true; + case AArch64::STURBBi: + case AArch64::STURHHi: case AArch64::STURWi: - return true; case AArch64::STURXi: - return true; case AArch64::LDURSi: - return true; case AArch64::LDURDi: - return true; case AArch64::LDURQi: - return true; case AArch64::LDURWi: - return true; case AArch64::LDURXi: - return true; case AArch64::LDURSWi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: return true; } } -// Size in bytes of the data moved by an unscaled load or store -int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { - switch (MemMI->getOpcode()) { +static bool isUnscaledLdSt(MachineInstr *MI) { + return isUnscaledLdSt(MI->getOpcode()); +} + +static unsigned getBitExtrOpcode(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode."); + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRHHui: + case AArch64::LDURHHi: + return AArch64::UBFMWri; + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + return AArch64::SBFMWri; + } +} + +static bool isNarrowStore(unsigned Opc) { + switch (Opc) { default: - llvm_unreachable("Opcode has unknown size!"); + return false; + case AArch64::STRBBui: + case AArch64::STURBBi: + case AArch64::STRHHui: + case AArch64::STURHHi: + return true; + } +} + +static bool isNarrowStore(MachineInstr *MI) { + return isNarrowStore(MI->getOpcode()); +} + +static bool isNarrowLoad(unsigned Opc) { + switch (Opc) { + default: + return false; + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: 
+ case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + return true; + } +} + +static bool isNarrowLoad(MachineInstr *MI) { + return isNarrowLoad(MI->getOpcode()); +} + +// Scaling factor for unscaled load or store. +static int getMemScale(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + llvm_unreachable("Opcode has unknown scale!"); + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + case AArch64::STRBBui: + case AArch64::STURBBi: + return 1; + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + case AArch64::STRHHui: + case AArch64::STURHHi: + return 2; + case AArch64::LDRSui: + case AArch64::LDURSi: + case AArch64::LDRSWui: + case AArch64::LDURSWi: + case AArch64::LDRWui: + case AArch64::LDURWi: case AArch64::STRSui: case AArch64::STURSi: - return 4; - case AArch64::STRDui: - case AArch64::STURDi: - return 8; - case AArch64::STRQui: - case AArch64::STURQi: - return 16; case AArch64::STRWui: case AArch64::STURWi: - return 4; - case AArch64::STRXui: - case AArch64::STURXi: - return 8; - case AArch64::LDRSui: - case AArch64::LDURSi: + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPWi: + case AArch64::STPSi: + case AArch64::STPWi: return 4; case AArch64::LDRDui: case AArch64::LDURDi: + case AArch64::LDRXui: + case AArch64::LDURXi: + case AArch64::STRDui: + case AArch64::STURDi: + case AArch64::STRXui: + case AArch64::STURXi: + case AArch64::LDPDi: + case AArch64::LDPXi: + case AArch64::STPDi: + case AArch64::STPXi: return 8; case AArch64::LDRQui: case AArch64::LDURQi: + case AArch64::STRQui: + case AArch64::STURQi: + case AArch64::LDPQi: + case AArch64::STPQi: return 16; - case AArch64::LDRWui: - case AArch64::LDURWi: - return 4; - case AArch64::LDRXui: - case AArch64::LDURXi: - return 8; - case AArch64::LDRSWui: - case AArch64::LDURSWi: - return 4; } } @@ -203,6 +314,10 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, case AArch64::STURDi: case AArch64::STRQui: case AArch64::STURQi: + case AArch64::STRBBui: + case AArch64::STURBBi: + case AArch64::STRHHui: + case AArch64::STURHHi: case AArch64::STRWui: case AArch64::STURWi: case AArch64::STRXui: @@ -219,11 +334,23 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, case AArch64::STURSi: case AArch64::LDRSui: case AArch64::LDURSi: + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRBBui: + case AArch64::LDURBBi: return Opc; case AArch64::LDRSWui: return AArch64::LDRWui; case AArch64::LDURSWi: return AArch64::LDURWi; + case AArch64::LDRSBWui: + return AArch64::LDRBBui; + case AArch64::LDRSHWui: + return AArch64::LDRHHui; + case AArch64::LDURSBWi: + return AArch64::LDURBBi; + case AArch64::LDURSHWi: + return AArch64::LDURHHi; } } @@ -240,6 +367,14 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::STRQui: case AArch64::STURQi: return AArch64::STPQi; + case AArch64::STRBBui: + return AArch64::STRHHui; + case AArch64::STRHHui: + return AArch64::STRWui; + case AArch64::STURBBi: + return AArch64::STURHHi; + case AArch64::STURHHi: + return AArch64::STURWi; case AArch64::STRWui: case AArch64::STURWi: return AArch64::STPWi; @@ -264,6 +399,48 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::LDRSWui: case AArch64::LDURSWi: return AArch64::LDPSWi; + case AArch64::LDRHHui: + case AArch64::LDRSHWui: + return AArch64::LDRWui; + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + return AArch64::LDURWi; + case AArch64::LDRBBui: + case AArch64::LDRSBWui: + 
return AArch64::LDRHHui; + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + return AArch64::LDURHHi; + } +} + +static unsigned isMatchingStore(MachineInstr *LoadInst, + MachineInstr *StoreInst) { + unsigned LdOpc = LoadInst->getOpcode(); + unsigned StOpc = StoreInst->getOpcode(); + switch (LdOpc) { + default: + llvm_unreachable("Unsupported load instruction!"); + case AArch64::LDRBBui: + return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui || + StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; + case AArch64::LDURBBi: + return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi || + StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; + case AArch64::LDRHHui: + return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui || + StOpc == AArch64::STRXui; + case AArch64::LDURHHi: + return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi || + StOpc == AArch64::STURXi; + case AArch64::LDRWui: + return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui; + case AArch64::LDURWi: + return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi; + case AArch64::LDRXui: + return StOpc == AArch64::STRXui; + case AArch64::LDURXi: + return StOpc == AArch64::STURXi; } } @@ -277,6 +454,10 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::STRDpre; case AArch64::STRQui: return AArch64::STRQpre; + case AArch64::STRBBui: + return AArch64::STRBBpre; + case AArch64::STRHHui: + return AArch64::STRHHpre; case AArch64::STRWui: return AArch64::STRWpre; case AArch64::STRXui: @@ -287,12 +468,38 @@ static unsigned getPreIndexedOpcode(unsigned Opc) { return AArch64::LDRDpre; case AArch64::LDRQui: return AArch64::LDRQpre; + case AArch64::LDRBBui: + return AArch64::LDRBBpre; + case AArch64::LDRHHui: + return AArch64::LDRHHpre; case AArch64::LDRWui: return AArch64::LDRWpre; case AArch64::LDRXui: return AArch64::LDRXpre; case AArch64::LDRSWui: return AArch64::LDRSWpre; + case AArch64::LDPSi: + return AArch64::LDPSpre; + case AArch64::LDPSWi: + return AArch64::LDPSWpre; + case AArch64::LDPDi: + return AArch64::LDPDpre; + case AArch64::LDPQi: + return AArch64::LDPQpre; + case AArch64::LDPWi: + return AArch64::LDPWpre; + case AArch64::LDPXi: + return AArch64::LDPXpre; + case AArch64::STPSi: + return AArch64::STPSpre; + case AArch64::STPDi: + return AArch64::STPDpre; + case AArch64::STPQi: + return AArch64::STPQpre; + case AArch64::STPWi: + return AArch64::STPWpre; + case AArch64::STPXi: + return AArch64::STPXpre; } } @@ -306,6 +513,10 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::STRDpost; case AArch64::STRQui: return AArch64::STRQpost; + case AArch64::STRBBui: + return AArch64::STRBBpost; + case AArch64::STRHHui: + return AArch64::STRHHpost; case AArch64::STRWui: return AArch64::STRWpost; case AArch64::STRXui: @@ -316,19 +527,111 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { return AArch64::LDRDpost; case AArch64::LDRQui: return AArch64::LDRQpost; + case AArch64::LDRBBui: + return AArch64::LDRBBpost; + case AArch64::LDRHHui: + return AArch64::LDRHHpost; case AArch64::LDRWui: return AArch64::LDRWpost; case AArch64::LDRXui: return AArch64::LDRXpost; case AArch64::LDRSWui: return AArch64::LDRSWpost; + case AArch64::LDPSi: + return AArch64::LDPSpost; + case AArch64::LDPSWi: + return AArch64::LDPSWpost; + case AArch64::LDPDi: + return AArch64::LDPDpost; + case AArch64::LDPQi: + return AArch64::LDPQpost; + case AArch64::LDPWi: + return AArch64::LDPWpost; + case AArch64::LDPXi: + return AArch64::LDPXpost; + case AArch64::STPSi: + return 
AArch64::STPSpost; + case AArch64::STPDi: + return AArch64::STPDpost; + case AArch64::STPQi: + return AArch64::STPQpost; + case AArch64::STPWi: + return AArch64::STPWpost; + case AArch64::STPXi: + return AArch64::STPXpost; } } +static bool isPairedLdSt(const MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + return false; + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: + return true; + } +} + +static const MachineOperand &getLdStRegOp(const MachineInstr *MI, + unsigned PairedRegOp = 0) { + assert(PairedRegOp < 2 && "Unexpected register operand idx."); + unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; + return MI->getOperand(Idx); +} + +static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) { + unsigned Idx = isPairedLdSt(MI) ? 2 : 1; + return MI->getOperand(Idx); +} + +static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) { + unsigned Idx = isPairedLdSt(MI) ? 3 : 2; + return MI->getOperand(Idx); +} + +static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst, + MachineInstr *StoreInst) { + assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); + int LoadSize = getMemScale(LoadInst); + int StoreSize = getMemScale(StoreInst); + int UnscaledStOffset = isUnscaledLdSt(StoreInst) + ? getLdStOffsetOp(StoreInst).getImm() + : getLdStOffsetOp(StoreInst).getImm() * StoreSize; + int UnscaledLdOffset = isUnscaledLdSt(LoadInst) + ? getLdStOffsetOp(LoadInst).getImm() + : getLdStOffsetOp(LoadInst).getImm() * LoadSize; + return (UnscaledStOffset <= UnscaledLdOffset) && + (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); +} + +// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI. +static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0, + MachineInstr *Op1) { + assert(MI->memoperands_empty() && "expected a new machineinstr"); + size_t numMemRefs = (Op0->memoperands_end() - Op0->memoperands_begin()) + + (Op1->memoperands_end() - Op1->memoperands_begin()); + + MachineFunction *MF = MI->getParent()->getParent(); + MachineSDNode::mmo_iterator MemBegin = MF->allocateMemRefsArray(numMemRefs); + MachineSDNode::mmo_iterator MemEnd = + std::copy(Op0->memoperands_begin(), Op0->memoperands_end(), MemBegin); + MemEnd = std::copy(Op1->memoperands_begin(), Op1->memoperands_end(), MemEnd); + MI->setMemRefs(MemBegin, MemEnd); +} + MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, - bool MergeForward, int SExtIdx) { + const LdStPairFlags &Flags) { MachineBasicBlock::iterator NextI = I; ++NextI; // If NextI is the second of the two instructions to be merged, we need @@ -338,25 +641,26 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (NextI == Paired) ++NextI; + int SExtIdx = Flags.getSExtIdx(); unsigned Opc = SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); - bool IsUnscaled = isUnscaledLdst(Opc); - int OffsetStride = - IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1; + bool IsUnscaled = isUnscaledLdSt(Opc); + int OffsetStride = IsUnscaled ? getMemScale(I) : 1; + bool MergeForward = Flags.getMergeForward(); unsigned NewOpc = getMatchingPairOpcode(Opc); // Insert our new paired instruction after whichever of the paired // instructions MergeForward indicates. 
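A standalone sketch that mirrors isLdOffsetInRangeOfSt above (parameter names are illustrative, not a drop-in for the pass): scaled load/store immediates are expressed in units of the access size, so both offsets are first converted to bytes before checking containment.

bool ldWithinSt(int LdImm, int LdSize, bool LdUnscaled,
                int StImm, int StSize, bool StUnscaled) {
  int LdByteOff = LdUnscaled ? LdImm : LdImm * LdSize;
  int StByteOff = StUnscaled ? StImm : StImm * StSize;
  // The loaded bytes must fall entirely inside the stored bytes.
  return StByteOff <= LdByteOff && LdByteOff + LdSize <= StByteOff + StSize;
}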
MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. - MachineOperand &BaseRegOp = - MergeForward ? Paired->getOperand(1) : I->getOperand(1); + const MachineOperand &BaseRegOp = + MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I); // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI, *Rt2MI; - if (I->getOperand(2).getImm() == - Paired->getOperand(2).getImm() + OffsetStride) { + if (getLdStOffsetOp(I).getImm() == + getLdStOffsetOp(Paired).getImm() + OffsetStride) { RtMI = Paired; Rt2MI = I; // Here we swapped the assumption made for SExtIdx. @@ -368,18 +672,135 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, RtMI = I; Rt2MI = Paired; } - // Handle Unscaled - int OffsetImm = RtMI->getOperand(2).getImm(); - if (IsUnscaled && EnableAArch64UnscaledMemOp) - OffsetImm /= OffsetStride; + + int OffsetImm = getLdStOffsetOp(RtMI).getImm(); + + if (isNarrowLoad(Opc)) { + // Change the scaled offset from small to large type. + if (!IsUnscaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + MachineInstr *RtNewDest = MergeForward ? I : Paired; + // When merging small (< 32 bit) loads for big-endian targets, the order of + // the component parts gets swapped. + if (!Subtarget->isLittleEndian()) + std::swap(RtMI, Rt2MI); + // Construct the new load instruction. + MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2; + NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(RtNewDest)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + + // Copy MachineMemOperands from the original loads. + concatenateMemOperands(NewMemMI, I, Paired); + + DEBUG( + dbgs() + << "Creating the new load and extract. Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(Paired->print(dbgs())); + DEBUG(dbgs() << " with instructions:\n "); + DEBUG((NewMemMI)->print(dbgs())); + + int Width = getMemScale(I) == 1 ? 8 : 16; + int LSBLow = 0; + int LSBHigh = Width; + int ImmsLow = LSBLow + Width - 1; + int ImmsHigh = LSBHigh + Width - 1; + MachineInstr *ExtDestMI = MergeForward ? Paired : I; + if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) { + // Create the bitfield extract for high bits. + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(Rt2MI))) + .addOperand(getLdStRegOp(Rt2MI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); + // Create the bitfield extract for low bits. + if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { + // For unsigned, prefer to use AND for low bits. + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(ImmsLow); + } else { + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(RtMI))) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); + } + } else { + // Create the bitfield extract for low bits. + if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { + // For unsigned, prefer to use AND for low bits. 
+ BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(ImmsLow); + } else { + BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(RtMI))) + .addOperand(getLdStRegOp(RtMI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); + } + + // Create the bitfield extract for high bits. + BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(getBitExtrOpcode(Rt2MI))) + .addOperand(getLdStRegOp(Rt2MI)) + .addReg(getLdStRegOp(RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); + } + DEBUG(dbgs() << " "); + DEBUG((BitExtMI1)->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG((BitExtMI2)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + I->eraseFromParent(); + Paired->eraseFromParent(); + return NextI; + } // Construct the new instruction. - MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint, - I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(RtMI->getOperand(0)) - .addOperand(Rt2MI->getOperand(0)) - .addOperand(BaseRegOp) - .addImm(OffsetImm); + MachineInstrBuilder MIB; + if (isNarrowStore(Opc)) { + // Change the scaled offset from small to large type. + if (!IsUnscaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(I)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + // Copy MachineMemOperands from the original stores. + concatenateMemOperands(MIB, I, Paired); + } else { + // Handle Unscaled + if (IsUnscaled) + OffsetImm /= OffsetStride; + MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), + TII->get(NewOpc)) + .addOperand(getLdStRegOp(RtMI)) + .addOperand(getLdStRegOp(Rt2MI)) + .addOperand(BaseRegOp) + .addImm(OffsetImm); + } + (void)MIB; // FIXME: Do we need/want to copy the mem operands from the source @@ -439,13 +860,112 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, return NextI; } +MachineBasicBlock::iterator +AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, + MachineBasicBlock::iterator StoreI) { + MachineBasicBlock::iterator NextI = LoadI; + ++NextI; + + int LoadSize = getMemScale(LoadI); + int StoreSize = getMemScale(StoreI); + unsigned LdRt = getLdStRegOp(LoadI).getReg(); + unsigned StRt = getLdStRegOp(StoreI).getReg(); + bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); + + assert((IsStoreXReg || + TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) && + "Unexpected RegClass"); + + MachineInstr *BitExtMI; + if (LoadSize == StoreSize && (LoadSize == 4 || LoadSize == 8)) { + // Remove the load, if the destination register of the loads is the same + // register for stored value. + if (StRt == LdRt && LoadSize == 8) { + DEBUG(dbgs() << "Remove load instruction:\n "); + DEBUG(LoadI->print(dbgs())); + DEBUG(dbgs() << "\n"); + LoadI->eraseFromParent(); + return NextI; + } + // Replace the load with a mov if the load and store are in the same size. + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt) + .addReg(IsStoreXReg ? 
AArch64::XZR : AArch64::WZR) + .addReg(StRt) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); + } else { + // FIXME: Currently we disable this transformation in big-endian targets as + // performance and correctness are verified only in little-endian. + if (!Subtarget->isLittleEndian()) + return NextI; + bool IsUnscaled = isUnscaledLdSt(LoadI); + assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match"); + assert(LoadSize <= StoreSize && "Invalid load size"); + int UnscaledLdOffset = IsUnscaled + ? getLdStOffsetOp(LoadI).getImm() + : getLdStOffsetOp(LoadI).getImm() * LoadSize; + int UnscaledStOffset = IsUnscaled + ? getLdStOffsetOp(StoreI).getImm() + : getLdStOffsetOp(StoreI).getImm() * StoreSize; + int Width = LoadSize * 8; + int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); + int Imms = Immr + Width - 1; + unsigned DestReg = IsStoreXReg + ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32, + &AArch64::GPR64RegClass) + : LdRt; + + assert((UnscaledLdOffset >= UnscaledStOffset && + (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) && + "Invalid offset"); + + Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); + Imms = Immr + Width - 1; + if (UnscaledLdOffset == UnscaledStOffset) { + uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N + | ((Immr) << 6) // immr + | ((Imms) << 0) // imms + ; + + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri), + DestReg) + .addReg(StRt) + .addImm(AndMaskEncoded); + } else { + BitExtMI = + BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(), + TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri), + DestReg) + .addReg(StRt) + .addImm(Immr) + .addImm(Imms); + } + } + + DEBUG(dbgs() << "Promoting load by replacing :\n "); + DEBUG(StoreI->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(LoadI->print(dbgs())); + DEBUG(dbgs() << " with instructions:\n "); + DEBUG(StoreI->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG((BitExtMI)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. + LoadI->eraseFromParent(); + return NextI; +} + /// trackRegDefsUses - Remember what registers the specified instruction uses /// and modifies. -static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, +static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, BitVector &UsedRegs, const TargetRegisterInfo *TRI) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (MO.isRegMask()) ModifiedRegs.setBitsNotInMask(MO.getRegMask()); @@ -464,16 +984,12 @@ static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, } static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { - if (!IsUnscaled && (Offset > 63 || Offset < -64)) - return false; - if (IsUnscaled) { - // Convert the byte-offset used by unscaled into an "element" offset used - // by the scaled pair load/store instructions. - int ElemOffset = Offset / OffsetStride; - if (ElemOffset > 63 || ElemOffset < -64) - return false; - } - return true; + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. 
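For the load-from-store promotion above, the Immr/Imms values describe a bitfield extract that recovers the narrower loaded value from the wider stored register. A hedged sketch of the narrow (LoadSize < 4), little-endian case the code handles; names are illustrative:

#include <cstdint>

uint32_t extractLoadedBits(uint32_t Stored, int DeltaBytes, int LoadSize) {
  // Mirrors Immr = 8 * (UnscaledLdOffset - UnscaledStOffset) and
  // Imms = Immr + Width - 1: a UBFX of Width bits starting at bit
  // 8 * DeltaBytes of the stored value.
  int Width = LoadSize * 8;              // 8 or 16 bits in the narrow case
  uint32_t Mask = (1u << Width) - 1u;
  return (Stored >> (8 * DeltaBytes)) & Mask;
}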
+ if (IsUnscaled) + Offset /= OffsetStride; + + return Offset <= 63 && Offset >= -64; } // Do alignment, specialized to power of 2 and for signed ints, @@ -507,12 +1023,65 @@ static bool mayAlias(MachineInstr *MIa, return false; } +bool AArch64LoadStoreOpt::findMatchingStore( + MachineBasicBlock::iterator I, unsigned Limit, + MachineBasicBlock::iterator &StoreI) { + MachineBasicBlock::iterator E = I->getParent()->begin(); + MachineBasicBlock::iterator MBBI = I; + MachineInstr *FirstMI = I; + unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + + // Track which registers have been modified and used between the first insn + // and the second insn. + BitVector ModifiedRegs, UsedRegs; + ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + + for (unsigned Count = 0; MBBI != E && Count < Limit;) { + --MBBI; + MachineInstr *MI = MBBI; + // Skip DBG_VALUE instructions. Otherwise debug info can affect the + // optimization by changing how far we scan. + if (MI->isDebugValue()) + continue; + // Now that we know this is a real instruction, count it. + ++Count; + + // If the load instruction reads directly from the address to which the + // store instruction writes and the stored value is not modified, we can + // promote the load. Since we do not handle stores with pre-/post-index, + // it's unnecessary to check if BaseReg is modified by the store itself. + if (MI->mayStore() && isMatchingStore(FirstMI, MI) && + BaseReg == getLdStBaseOp(MI).getReg() && + isLdOffsetInRangeOfSt(FirstMI, MI) && + !ModifiedRegs[getLdStRegOp(MI).getReg()]) { + StoreI = MBBI; + return true; + } + + if (MI->isCall()) + return false; + + // Update modified / uses register lists. + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + + // Otherwise, if the base register is modified, we have no match, so + // return early. + if (ModifiedRegs[BaseReg]) + return false; + + // If we encounter a store aliased with the load, return early. + if (MI->mayStore() && mayAlias(FirstMI, MI, TII)) + return false; + } + return false; +} + /// findMatchingInsn - Scan the instructions looking for a load/store that can /// be combined with the current instruction into a load/store pair. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - bool &MergeForward, int &SExtIdx, - unsigned Limit) { + LdStPairFlags &Flags, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; MachineInstr *FirstMI = I; @@ -520,21 +1089,27 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, unsigned Opc = FirstMI->getOpcode(); bool MayLoad = FirstMI->mayLoad(); - bool IsUnscaled = isUnscaledLdst(Opc); - unsigned Reg = FirstMI->getOperand(0).getReg(); - unsigned BaseReg = FirstMI->getOperand(1).getReg(); - int Offset = FirstMI->getOperand(2).getImm(); + bool IsUnscaled = isUnscaledLdSt(FirstMI); + unsigned Reg = getLdStRegOp(FirstMI).getReg(); + unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + int Offset = getLdStOffsetOp(FirstMI).getImm(); + bool IsNarrowStore = isNarrowStore(Opc); + + // For narrow stores, find only the case where the stored value is WZR. + if (IsNarrowStore && Reg != AArch64::WZR) + return E; // Early exit if the first instruction modifies the base register. // e.g., ldr x0, [x0] - // Early exit if the offset if not possible to match. 
(6 bits of positive - // range, plus allow an extra one in case we find a later insn that matches - // with Offset-1 if (FirstMI->modifiesRegister(BaseReg, TRI)) return E; - int OffsetStride = - IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1; - if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) + + // Early exit if the offset if not possible to match. (6 bits of positive + // range, plus allow an extra one in case we find a later insn that matches + // with Offset-1) + int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; + if (!(isNarrowLoad(Opc) || IsNarrowStore) && + !inBoundsForPair(IsUnscaled, Offset, OffsetStride)) return E; // Track which registers have been modified and used between the first insn @@ -557,18 +1132,19 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, ++Count; bool CanMergeOpc = Opc == MI->getOpcode(); - SExtIdx = -1; + Flags.setSExtIdx(-1); if (!CanMergeOpc) { bool IsValidLdStrOpc; unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc); - if (!IsValidLdStrOpc) - continue; + assert(IsValidLdStrOpc && + "Given Opc should be a Load or Store with an immediate"); // Opc will be the first instruction in the pair. - SExtIdx = NonSExtOpc == (unsigned)Opc ? 1 : 0; + Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0); CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode()); } - if (CanMergeOpc && MI->getOperand(2).isImm()) { + if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) { + assert(MI->mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. // These instructions all have scaled immediate operands, so we just @@ -579,8 +1155,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // Pairwise instructions have a 7-bit signed offset field. Single insns // have a 12-bit unsigned offset field. To be a valid combine, the // final offset must be in range. - unsigned MIBaseReg = MI->getOperand(1).getReg(); - int MIOffset = MI->getOperand(2).getImm(); + unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); + int MIOffset = getLdStOffsetOp(MI).getImm(); if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || (Offset + OffsetStride == MIOffset))) { int MinOffset = Offset < MIOffset ? Offset : MIOffset; @@ -591,30 +1167,43 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, return E; // If the resultant immediate offset of merging these instructions // is out of range for a pairwise instruction, bail and keep looking. - bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode()); - if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { + bool MIIsUnscaled = isUnscaledLdSt(MI); + bool IsNarrowLoad = isNarrowLoad(MI->getOpcode()); + if (!IsNarrowLoad && + !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + MemInsns.push_back(MI); continue; } - // If the alignment requirements of the paired (scaled) instruction - // can't express the offset of the unscaled input, bail and keep - // looking. 
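The two range conditions discussed above can be summarised in a small sketch (illustrative only, assuming the 7-bit signed, size-scaled immediate of the paired forms): the combined offset must fit the scaled immediate, and an unscaled byte offset must also be a multiple of the access size to be expressible after pairing.

bool fitsPairImmediate(int ByteOffset, int AccessSize) {
  if (ByteOffset % AccessSize != 0)      // the alignTo(MinOffset, stride) check
    return false;
  int Scaled = ByteOffset / AccessSize;  // element offset, as in inBoundsForPair
  return Scaled >= -64 && Scaled <= 63;
}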
- if (IsUnscaled && EnableAArch64UnscaledMemOp && - (alignTo(MinOffset, OffsetStride) != MinOffset)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) + + if (IsNarrowLoad || IsNarrowStore) { + // If the alignment requirements of the scaled wide load/store + // instruction can't express the offset of the scaled narrow + // input, bail and keep looking. + if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); MemInsns.push_back(MI); - continue; + continue; + } + } else { + // If the alignment requirements of the paired (scaled) instruction + // can't express the offset of the unscaled input, bail and keep + // looking. + if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(MI); + continue; + } } // If the destination register of the loads is the same register, bail // and keep looking. A load-pair instruction with both destination // registers the same is UNPREDICTABLE and will result in an exception. - if (MayLoad && Reg == MI->getOperand(0).getReg()) { + // For narrow stores, allow only when the stored value is the same + // (i.e., WZR). + if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) || + (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + MemInsns.push_back(MI); continue; } @@ -622,10 +1211,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // the two instructions and none of the instructions between the second // and first alias with the second, we can combine the second into the // first. - if (!ModifiedRegs[MI->getOperand(0).getReg()] && - !(MI->mayLoad() && UsedRegs[MI->getOperand(0).getReg()]) && + if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && + !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && !mayAlias(MI, MemInsns, TII)) { - MergeForward = false; + Flags.setMergeForward(false); return MBBI; } @@ -633,11 +1222,10 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // between the two instructions and none of the instructions between the // first and the second alias with the first, we can combine the first // into the second. - if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] && - !(FirstMI->mayLoad() && - UsedRegs[FirstMI->getOperand(0).getReg()]) && + if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] && + !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) && !mayAlias(FirstMI, MemInsns, TII)) { - MergeForward = true; + Flags.setMergeForward(true); return MBBI; } // Unable to combine these instructions due to interference in between. @@ -666,51 +1254,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, } MachineBasicBlock::iterator -AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Update) { - assert((Update->getOpcode() == AArch64::ADDXri || - Update->getOpcode() == AArch64::SUBXri) && - "Unexpected base register update instruction to merge!"); - MachineBasicBlock::iterator NextI = I; - // Return the instruction following the merged instruction, which is - // the instruction following our unmerged load. Unless that's the add/sub - // instruction we're merging, in which case it's the one after that. 
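The MergeForward flag set above records where the new pair may be placed: at the first instruction if nothing in between disturbs what the second instruction reads or writes, or at the second instruction if nothing in between disturbs the first. A toy, register-only model of that decision follows; names are illustrative, and the real code also tracks register uses for loads and memory aliasing between the two instructions.

#include <cassert>
#include <set>

struct MemOp {
  unsigned Reg;  // value register (Rt)
  unsigned Base; // base address register (Rn)
};

// A pair placed at the first instruction's position needs the second
// instruction's registers to be untouched in between; a pair placed at the
// second instruction's position (MergeForward) needs the first instruction's
// registers untouched.
static bool canPlacePairAtFirst(const MemOp &Second,
                                const std::set<unsigned> &ModifiedBetween) {
  return !ModifiedBetween.count(Second.Reg) &&
         !ModifiedBetween.count(Second.Base);
}

static bool canPlacePairAtSecond(const MemOp &First,
                                 const std::set<unsigned> &ModifiedBetween) {
  return !ModifiedBetween.count(First.Reg) &&
         !ModifiedBetween.count(First.Base);
}

int main() {
  enum { W0, W1, X2 };
  MemOp StrA{W0, X2};             // str w0, [x2, #4]
  MemOp StrB{W1, X2};             // str w1, [x2]
  std::set<unsigned> Between{W0}; // a "mov w0, ..." sits between the stores
  // w0 is redefined between the stores, so the stp must be emitted at the
  // first store's position, where w0 still holds the stored value.
  assert(canPlacePairAtFirst(StrB, Between));
  assert(!canPlacePairAtSecond(StrA, Between));
  return 0;
}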
- if (++NextI == Update) - ++NextI; - - int Value = Update->getOperand(2).getImm(); - assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into pre-indexed load / store"); - if (Update->getOpcode() == AArch64::SUBXri) - Value = -Value; - - unsigned NewOpc = getPreIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(Update->getOperand(0)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); - (void)MIB; - - DEBUG(dbgs() << "Creating pre-indexed load/store."); - DEBUG(dbgs() << " Replacing instructions:\n "); - DEBUG(I->print(dbgs())); - DEBUG(dbgs() << " "); - DEBUG(Update->print(dbgs())); - DEBUG(dbgs() << " with instruction:\n "); - DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); - - // Erase the old instructions for the block. - I->eraseFromParent(); - Update->eraseFromParent(); - - return NextI; -} - -MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( - MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) { +AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Update, + bool IsPreIdx) { assert((Update->getOpcode() == AArch64::ADDXri || Update->getOpcode() == AArch64::SUBXri) && "Unexpected base register update instruction to merge!"); @@ -723,20 +1269,36 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( int Value = Update->getOperand(2).getImm(); assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 && - "Can't merge 1 << 12 offset into post-indexed load / store"); + "Can't merge 1 << 12 offset into pre-/post-indexed load / store"); if (Update->getOpcode() == AArch64::SUBXri) Value = -Value; - unsigned NewOpc = getPostIndexedOpcode(I->getOpcode()); - MachineInstrBuilder MIB = - BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(Update->getOperand(0)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addImm(Value); + unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) + : getPostIndexedOpcode(I->getOpcode()); + MachineInstrBuilder MIB; + if (!isPairedLdSt(I)) { + // Non-paired instruction. + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value); + } else { + // Paired instruction. 
+ int Scale = getMemScale(I); + MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) + .addOperand(getLdStRegOp(Update)) + .addOperand(getLdStRegOp(I, 0)) + .addOperand(getLdStRegOp(I, 1)) + .addOperand(getLdStBaseOp(I)) + .addImm(Value / Scale); + } (void)MIB; - DEBUG(dbgs() << "Creating post-indexed load/store."); + if (IsPreIdx) + DEBUG(dbgs() << "Creating pre-indexed load/store."); + else + DEBUG(dbgs() << "Creating post-indexed load/store."); DEBUG(dbgs() << " Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); @@ -752,8 +1314,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn( return NextI; } -static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, - int Offset) { +bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, + MachineInstr *MI, + unsigned BaseReg, int Offset) { switch (MI->getOpcode()) { default: break; @@ -769,44 +1332,65 @@ static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg, // Watch out for 1 << 12 shifted value. if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) break; - // If the instruction has the base register as source and dest and the - // immediate will fit in a signed 9-bit integer, then we have a match. - if (MI->getOperand(0).getReg() == BaseReg && - MI->getOperand(1).getReg() == BaseReg && - MI->getOperand(2).getImm() <= 255 && - MI->getOperand(2).getImm() >= -256) { - // If we have a non-zero Offset, we check that it matches the amount - // we're adding to the register. - if (!Offset || Offset == MI->getOperand(2).getImm()) - return true; + + // The update instruction source and destination register must be the + // same as the load/store base register. + if (MI->getOperand(0).getReg() != BaseReg || + MI->getOperand(1).getReg() != BaseReg) + break; + + bool IsPairedInsn = isPairedLdSt(MemMI); + int UpdateOffset = MI->getOperand(2).getImm(); + // For non-paired load/store instructions, the immediate must fit in a + // signed 9-bit integer. + if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) + break; + + // For paired load/store instructions, the immediate must be a multiple of + // the scaling factor. The scaled offset must also fit into a signed 7-bit + // integer. + if (IsPairedInsn) { + int Scale = getMemScale(MemMI); + if (UpdateOffset % Scale != 0) + break; + + int ScaledOffset = UpdateOffset / Scale; + if (ScaledOffset > 64 || ScaledOffset < -64) + break; } + + // If we have a non-zero Offset, we check that it matches the amount + // we're adding to the register. + if (!Offset || Offset == MI->getOperand(2).getImm()) + return true; break; } return false; } MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( - MachineBasicBlock::iterator I, unsigned Limit, int Value) { + MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr *MemMI = I; MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm() * - TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); - // If the base register overlaps the destination register, we can't - // merge the update. 
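The immediate checks above distinguish the two writeback encodings: single-register pre/post-indexed loads and stores take a signed 9-bit byte offset, while the paired forms take a signed 7-bit offset scaled by the access size. A small standalone mirror of those bounds, with illustrative function names:

#include <cassert>

// UpdateOffset is the byte amount added to or subtracted from the base
// register; AccessSize is the access width of a paired load/store.
static bool updateFitsNonPaired(int UpdateOffset) {
  // Single-register pre/post-indexed forms use a signed 9-bit byte offset.
  return UpdateOffset >= -256 && UpdateOffset <= 255;
}

static bool updateFitsPaired(int UpdateOffset, int AccessSize) {
  // Paired forms encode a signed 7-bit multiple of the access size.
  if (UpdateOffset % AccessSize != 0)
    return false;
  int Scaled = UpdateOffset / AccessSize;
  return Scaled >= -64 && Scaled <= 64; // bounds as checked above
}

int main() {
  assert(updateFitsNonPaired(255) && !updateFitsNonPaired(256));
  assert(updateFitsPaired(32, 8));    // e.g. ldp x0, x1, [x2], #32
  assert(!updateFitsPaired(20, 8));   // not a multiple of the access size
  assert(!updateFitsPaired(1024, 8)); // 1024 / 8 = 128, out of range
  return 0;
}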
- if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + // Scan forward looking for post-index opportunities. Updating instructions + // can't be formed if the memory instruction doesn't have the offset we're + // looking for. + if (MIUnscaledOffset != UnscaledOffset) return E; - // Scan forward looking for post-index opportunities. - // Updating instructions can't be formed if the memory insn already - // has an offset other than the value we're looking for. - if (Offset != Value) - return E; + // If the base register overlaps a destination register, we can't + // merge the update. + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. @@ -825,7 +1409,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, Value)) + if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -845,21 +1429,22 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineBasicBlock::iterator E = I->getParent()->end(); MachineInstr *MemMI = I; MachineBasicBlock::iterator MBBI = I; - const MachineFunction &MF = *MemMI->getParent()->getParent(); - unsigned DestReg = MemMI->getOperand(0).getReg(); - unsigned BaseReg = MemMI->getOperand(1).getReg(); - int Offset = MemMI->getOperand(2).getImm(); - unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); + unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); + int Offset = getLdStOffsetOp(MemMI).getImm(); // If the load/store is the first instruction in the block, there's obviously // not any matching update. Ditto if the memory offset isn't zero. if (MBBI == B || Offset != 0) return E; - // If the base register overlaps the destination register, we can't + // If the base register overlaps a destination register, we can't // merge the update. - if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) - return E; + bool IsPairedInsn = isPairedLdSt(MemMI); + for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) { + unsigned DestReg = getLdStRegOp(MemMI, i).getReg(); + if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) + return E; + } // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. @@ -878,7 +1463,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) + if (isMatchingUpdateInsn(I, MI, BaseReg, Offset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -892,17 +1477,101 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( return E; } -bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { +bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = MBBI; + // If this is a volatile load, don't mess with it. + if (MI->hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm. + // FIXME: It is possible to extend it to handle reg+reg cases. 
+ if (!getLdStOffsetOp(MI).isImm()) + return false; + + // Look backward up to ScanLimit instructions. + MachineBasicBlock::iterator StoreI; + if (findMatchingStore(MBBI, ScanLimit, StoreI)) { + ++NumLoadsFromStoresPromoted; + // Promote the load. Keeping the iterator straight is a + // pain, so we let the merge routine tell us what the next instruction + // is after it's done mucking about. + MBBI = promoteLoadFromStore(MBBI, StoreI); + return true; + } + return false; +} + +bool AArch64LoadStoreOpt::tryToMergeLdStInst( + MachineBasicBlock::iterator &MBBI) { + MachineInstr *MI = MBBI; + MachineBasicBlock::iterator E = MI->getParent()->end(); + // If this is a volatile load/store, don't mess with it. + if (MI->hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm (as opposed to an address reloc). + if (!getLdStOffsetOp(MI).isImm()) + return false; + + // Check if this load/store has a hint to avoid pair formation. + // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. + if (TII->isLdStPairSuppressed(MI)) + return false; + + // Look ahead up to ScanLimit instructions for a pairable instruction. + LdStPairFlags Flags; + MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit); + if (Paired != E) { + if (isNarrowLoad(MI)) { + ++NumNarrowLoadsPromoted; + } else if (isNarrowStore(MI)) { + ++NumZeroStoresPromoted; + } else { + ++NumPairCreated; + if (isUnscaledLdSt(MI)) + ++NumUnscaledPairCreated; + } + + // Merge the loads into a pair. Keeping the iterator straight is a + // pain, so we let the merge routine tell us what the next instruction + // is after it's done mucking about. + MBBI = mergePairedInsns(MBBI, Paired, Flags); + return true; + } + return false; +} + +bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, + bool enableNarrowLdOpt) { bool Modified = false; - // Two tranformations to do here: - // 1) Find loads and stores that can be merged into a single load or store + // Three tranformations to do here: + // 1) Find loads that directly read from stores and promote them by + // replacing with mov instructions. If the store is wider than the load, + // the load will be replaced with a bitfield extract. + // e.g., + // str w1, [x0, #4] + // ldrh w2, [x0, #6] + // ; becomes + // str w1, [x0, #4] + // lsr w2, w1, #16 + // 2) Find narrow loads that can be converted into a single wider load + // with bitfield extract instructions. + // e.g., + // ldrh w0, [x2] + // ldrh w1, [x2, #2] + // ; becomes + // ldr w0, [x2] + // ubfx w1, w0, #16, #16 + // and w0, w0, #ffff + // 3) Find loads and stores that can be merged into a single load or store // pair instruction. // e.g., // ldr x0, [x2] // ldr x1, [x2, #8] // ; becomes // ldp x0, x1, [x2] - // 2) Find base register updates that can be merged into the load or store + // 4) Find base register updates that can be merged into the load or store // as a base-reg writeback. // e.g., // ldr x0, [x2] @@ -918,6 +1587,69 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. ++MBBI; break; + // Scaled instructions. + case AArch64::LDRBBui: + case AArch64::LDRHHui: + case AArch64::LDRWui: + case AArch64::LDRXui: + // Unscaled instructions. + case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURWi: + case AArch64::LDURXi: { + if (tryToPromoteLoadFromStore(MBBI)) { + Modified = true; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. 
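The promotion attempted above works whenever the bytes read by the load are fully covered by the bytes written by the earlier store; the load is then rewritten as a shift or bitfield extract of the stored register. A rough little-endian model of the coverage test and of the value that gets forwarded, with illustrative names:

#include <cassert>
#include <cstdint>

// The load can be rewritten if the bytes it reads lie entirely inside the
// bytes written by the earlier store; the forwarded value is then a shifted,
// masked slice of the stored register.
static bool loadCoveredByStore(int LdOff, int LdSize, int StOff, int StSize) {
  return LdOff >= StOff && LdOff + LdSize <= StOff + StSize;
}

static uint64_t forwardedValue(uint64_t Stored, int LdOff, int LdSize,
                               int StOff) {
  unsigned ShiftBits = unsigned(LdOff - StOff) * 8;
  uint64_t Mask =
      LdSize == 8 ? ~uint64_t{0} : ((uint64_t{1} << (8 * LdSize)) - 1);
  return (Stored >> ShiftBits) & Mask;
}

int main() {
  // str w1, [x0, #4]; ldrh w2, [x0, #6]  ->  lsr w2, w1, #16
  assert(loadCoveredByStore(/*LdOff=*/6, /*LdSize=*/2, /*StOff=*/4, /*StSize=*/4));
  assert(forwardedValue(0xAABBCCDDu, 6, 2, 4) == 0xAABB);
  // A load that straddles the stored bytes cannot be promoted this way.
  assert(!loadCoveredByStore(/*LdOff=*/2, /*LdSize=*/4, /*StOff=*/4, /*StSize=*/4));
  return 0;
}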
+ } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + enableNarrowLdOpt && MBBI != E;) { + MachineInstr *MI = MBBI; + switch (MI->getOpcode()) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + // Scaled instructions. + case AArch64::LDRBBui: + case AArch64::LDRHHui: + case AArch64::LDRSBWui: + case AArch64::LDRSHWui: + case AArch64::STRBBui: + case AArch64::STRHHui: + // Unscaled instructions. + case AArch64::LDURBBi: + case AArch64::LDURHHi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + case AArch64::STURBBi: + case AArch64::STURHHi: { + if (tryToMergeLdStInst(MBBI)) { + Modified = true; + break; + } + ++MBBI; + break; + } + // FIXME: Do the other instructions. + } + } + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E;) { + MachineInstr *MI = MBBI; + switch (MI->getOpcode()) { + default: + // Just move on to the next instruction. + ++MBBI; + break; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: @@ -929,7 +1661,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDRXui: case AArch64::LDRWui: case AArch64::LDRSWui: - // do the unscaled versions as well + // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: @@ -941,37 +1673,8 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDURWi: case AArch64::LDURXi: case AArch64::LDURSWi: { - // If this is a volatile load/store, don't mess with it. - if (MI->hasOrderedMemoryRef()) { - ++MBBI; - break; - } - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { - ++MBBI; - break; - } - // Check if this load/store has a hint to avoid pair formation. - // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. - if (TII->isLdStPairSuppressed(MI)) { - ++MBBI; - break; - } - // Look ahead up to ScanLimit instructions for a pairable instruction. - bool MergeForward = false; - int SExtIdx = -1; - MachineBasicBlock::iterator Paired = - findMatchingInsn(MBBI, MergeForward, SExtIdx, ScanLimit); - if (Paired != E) { - // Merge the loads into a pair. Keeping the iterator straight is a - // pain, so we let the merge routine tell us what the next instruction - // is after it's done mucking about. - MBBI = mergePairedInsns(MBBI, Paired, MergeForward, SExtIdx); - + if (tryToMergeLdStInst(MBBI)) { Modified = true; - ++NumPairCreated; - if (isUnscaledLdst(MI->getOpcode())) - ++NumUnscaledPairCreated; break; } ++MBBI; @@ -992,17 +1695,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { // Just move on to the next instruction. ++MBBI; break; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: case AArch64::STRXui: case AArch64::STRWui: + case AArch64::STRHHui: + case AArch64::STRBBui: case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: - // do the unscaled versions as well + case AArch64::LDRHHui: + case AArch64::LDRBBui: + // Unscaled instructions. case AArch64::STURSi: case AArch64::STURDi: case AArch64::STURQi: @@ -1012,25 +1720,41 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { case AArch64::LDURDi: case AArch64::LDURQi: case AArch64::LDURWi: - case AArch64::LDURXi: { + case AArch64::LDURXi: + // Paired instructions. 
+ case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPDi: + case AArch64::LDPQi: + case AArch64::LDPWi: + case AArch64::LDPXi: + case AArch64::STPSi: + case AArch64::STPDi: + case AArch64::STPQi: + case AArch64::STPWi: + case AArch64::STPXi: { // Make sure this is a reg+imm (as opposed to an address reloc). - if (!MI->getOperand(2).isImm()) { + if (!getLdStOffsetOp(MI).isImm()) { ++MBBI; break; } - // Look ahead up to ScanLimit instructions for a mergable instruction. + // Look forward to try to form a post-index instruction. For example, + // ldr x0, [x20] + // add x20, x20, #32 + // merged into: + // ldr x0, [x20], #32 MachineBasicBlock::iterator Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePostIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); Modified = true; ++NumPostFolded; break; } // Don't know how to handle pre/post-index versions, so move to the next // instruction. - if (isUnscaledLdst(Opc)) { + if (isUnscaledLdSt(Opc)) { ++MBBI; break; } @@ -1043,28 +1767,25 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); Modified = true; ++NumPreFolded; break; } + // The immediate in the load/store is scaled by the size of the memory + // operation. The immediate in the add we're looking for, + // however, is not, so adjust here. + int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); // Look forward to try to find a post-index instruction. For example, // ldr x1, [x0, #64] // add x0, x0, #64 // merged into: // ldr x1, [x0, #64]! - - // The immediate in the load/store is scaled by the size of the register - // being loaded. The immediate in the add we're looking for, - // however, is not, so adjust here. - int Value = MI->getOperand(2).getImm() * - TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent())) - ->getSize(); - Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value); + Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset); if (Update != E) { // Merge the update into the ld/st. - MBBI = mergePreIdxUpdateInsn(MBBI, Update); + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); Modified = true; ++NumPreFolded; break; @@ -1081,13 +1802,24 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) { return Modified; } +bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) { + bool ProfitableArch = Subtarget->isCortexA57(); + // FIXME: The benefit from converting narrow loads into a wider load could be + // microarchitectural as it assumes that a single load with two bitfield + // extracts is cheaper than two narrow loads. Currently, this conversion is + // enabled only in cortex-a57 on which performance benefits were verified. 
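Viewed from the driver loop above, three shapes of base-register update get folded into a writeback form. A toy classification of those shapes, simplified and with illustrative names; the real pass also verifies that the add/sub targets the same base register, that its immediate is encodable, and that nothing in between interferes:

#include <cassert>

enum class Writeback { None, PreIndex, PostIndex };

// MemByteOffset is the byte offset used by the load/store (its immediate
// times the access size); UpdateOffset is the byte amount added to the base;
// UpdateIsAfterMemOp says whether the add/sub follows the memory operation.
static Writeback classify(int MemByteOffset, int UpdateOffset,
                          bool UpdateIsAfterMemOp) {
  // ldr x0, [x20]; add x20, x20, #32    ->  ldr x0, [x20], #32
  if (UpdateIsAfterMemOp && MemByteOffset == 0)
    return Writeback::PostIndex;
  // ldr x1, [x0, #64]; add x0, x0, #64  ->  ldr x1, [x0, #64]!
  if (UpdateIsAfterMemOp && MemByteOffset == UpdateOffset)
    return Writeback::PreIndex;
  // add x0, x0, #16; ldr x1, [x0]       ->  ldr x1, [x0, #16]!
  if (!UpdateIsAfterMemOp && MemByteOffset == 0)
    return Writeback::PreIndex;
  return Writeback::None;
}

int main() {
  assert(classify(0, 32, true) == Writeback::PostIndex);
  assert(classify(64, 64, true) == Writeback::PreIndex);
  assert(classify(0, 16, false) == Writeback::PreIndex);
  assert(classify(8, 32, true) == Writeback::None);
  return 0;
}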
+ return ProfitableArch && !Subtarget->requiresStrictAlign(); +} + bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - TII = static_cast<const AArch64InstrInfo *>(Fn.getSubtarget().getInstrInfo()); - TRI = Fn.getSubtarget().getRegisterInfo(); + Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); + TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); + TRI = Subtarget->getRegisterInfo(); bool Modified = false; + bool enableNarrowLdOpt = enableNarrowLdMerge(Fn); for (auto &MBB : Fn) - Modified |= optimizeBlock(MBB); + Modified |= optimizeBlock(MBB, enableNarrowLdOpt); return Modified; } @@ -1095,8 +1827,8 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep // loads and stores near one another? -/// createARMLoadStoreOptimizationPass - returns an instance of the load / store -/// optimization pass. +/// createAArch64LoadStoreOptimizationPass - returns an instance of the +/// load / store optimization pass. FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { return new AArch64LoadStoreOpt(); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 580427a..2b4cdf1 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -207,9 +207,9 @@ bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO, void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; - if (lowerOperand(MI->getOperand(i), MCOp)) + if (lowerOperand(MO, MCOp)) OutMI.addOperand(MCOp); } } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h deleted file mode 100644 index 4164b33..0000000 --- a/contrib/llvm/lib/Target/AArch64/AArch64MachineCombinerPattern.h +++ /dev/null @@ -1,42 +0,0 @@ -//===- AArch64MachineCombinerPattern.h -===// -//===- AArch64 instruction pattern supported by combiner -===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -// This file defines instruction pattern supported by combiner -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H -#define LLVM_LIB_TARGET_AARCH64_AARCH64MACHINECOMBINERPATTERN_H - -namespace llvm { - -/// Enumeration of instruction pattern supported by machine combiner -/// -/// -namespace MachineCombinerPattern { -enum MC_PATTERN : int { - MC_NONE = 0, - MC_MULADDW_OP1 = 1, - MC_MULADDW_OP2 = 2, - MC_MULSUBW_OP1 = 3, - MC_MULSUBW_OP2 = 4, - MC_MULADDWI_OP1 = 5, - MC_MULSUBWI_OP1 = 6, - MC_MULADDX_OP1 = 7, - MC_MULADDX_OP2 = 8, - MC_MULSUBX_OP1 = 9, - MC_MULSUBX_OP2 = 10, - MC_MULADDXI_OP1 = 11, - MC_MULSUBXI_OP1 = 12 -}; -} // end namespace MachineCombinerPattern -} // end namespace llvm - -#endif diff --git a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 536a8d0..318f839 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -1,4 +1,4 @@ -//=- AArch64MachineFuctionInfo.h - AArch64 machine function info --*- C++ -*-=// +//=- AArch64MachineFunctionInfo.h - AArch64 machine function info -*- C++ -*-=// // // The LLVM Compiler Infrastructure // @@ -42,7 +42,7 @@ class AArch64FunctionInfo : public MachineFunctionInfo { unsigned ArgumentStackToRestore; /// HasStackFrame - True if this function has a stack frame. Set by - /// processFunctionBeforeCalleeSavedScan(). + /// determineCalleeSaves(). bool HasStackFrame; /// \brief Amount of stack frame size, not including callee-saved registers. @@ -72,16 +72,22 @@ class AArch64FunctionInfo : public MachineFunctionInfo { /// registers. unsigned VarArgsFPRSize; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR; + public: AArch64FunctionInfo() : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {} + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) {} explicit AArch64FunctionInfo(MachineFunction &MF) : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), - VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) { + VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), + IsSplitCSR(false) { (void)MF; } @@ -96,6 +102,9 @@ public: bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } + void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } unsigned getLocalStackSize() const { return LocalStackSize; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp index e1b93bf..79c09d9 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -489,7 +489,7 @@ bool AArch64PromoteConstant::insertDefinitions( for (const auto &IPI : InsertPts) { // Create the load of the global variable. 
- IRBuilder<> Builder(IPI.first->getParent(), IPI.first); + IRBuilder<> Builder(IPI.first); LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV); DEBUG(dbgs() << "**********\n"); DEBUG(dbgs() << "New def: "); @@ -540,7 +540,7 @@ bool AArch64PromoteConstant::runOnFunction(Function &F) { bool LocalChange = false; SmallPtrSet<Constant *, 8> AlreadyChecked; - for (Instruction &I : inst_range(&F)) { + for (Instruction &I : instructions(&F)) { // Traverse the operand, looking for constant vectors. Replace them by a // load of a global variable of constant vector type. for (Value *Op : I.operand_values()) { diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 841af55..32b4888 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -15,6 +15,7 @@ #include "AArch64RegisterInfo.h" #include "AArch64FrameLowering.h" #include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitVector.h" @@ -34,10 +35,6 @@ using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "AArch64GenRegisterInfo.inc" -static cl::opt<bool> -ReserveX18("aarch64-reserve-x18", cl::Hidden, - cl::desc("Reserve X18, making it unavailable as GPR")); - AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) : AArch64GenRegisterInfo(AArch64::LR), TT(TT) {} @@ -50,10 +47,23 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_NoRegs_SaveList; if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS) + return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ? 
+ CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : + CSR_AArch64_CXX_TLS_Darwin_SaveList; else return CSR_AArch64_AAPCS_SaveList; } +const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<AArch64FunctionInfo>()->isSplitCSR()) + return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { @@ -62,6 +72,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return CSR_AArch64_AllRegs_RegMask; + if (CC == CallingConv::CXX_FAST_TLS) + return CSR_AArch64_CXX_TLS_Darwin_RegMask; else return CSR_AArch64_AAPCS_RegMask; } @@ -104,7 +116,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AArch64::W29); } - if (TT.isOSDarwin() || ReserveX18) { + if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) { Reserved.set(AArch64::X18); // Platform register Reserved.set(AArch64::W18); } @@ -131,7 +143,7 @@ bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF, return true; case AArch64::X18: case AArch64::W18: - return TT.isOSDarwin() || ReserveX18; + return MF.getSubtarget<AArch64Subtarget>().isX18Reserved(); case AArch64::FP: case AArch64::W29: return TFI->hasFP(MF) || TT.isOSDarwin(); @@ -186,29 +198,6 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { return false; } -bool AArch64RegisterInfo::canRealignStack(const MachineFunction &MF) const { - - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) - return false; - - return true; -} - -// FIXME: share this with other backends with identical implementation? 
-bool -AArch64RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const AArch64FrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = - ((MFI->getMaxAlignment() > StackAlign) || - F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackAlignment)); - - return requiresRealignment && canRealignStack(MF); -} - unsigned AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const AArch64FrameLowering *TFI = getFrameLowering(MF); @@ -424,10 +413,11 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case AArch64::GPR64RegClassID: case AArch64::GPR32commonRegClassID: case AArch64::GPR64commonRegClassID: - return 32 - 1 // XZR/SP - - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP - - (TT.isOSDarwin() || ReserveX18) // X18 reserved as platform register - - hasBasePointer(MF); // X19 + return 32 - 1 // XZR/SP + - (TFI->hasFP(MF) || TT.isOSDarwin()) // FP + - MF.getSubtarget<AArch64Subtarget>() + .isX18Reserved() // X18 reserved as platform register + - hasBasePointer(MF); // X19 case AArch64::FPR8RegClassID: case AArch64::FPR16RegClassID: case AArch64::FPR32RegClassID: diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h index 8c379d9..f33f788 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.h @@ -35,6 +35,8 @@ public: /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; @@ -93,9 +95,6 @@ public: unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; - // Base pointer (stack realignment) support. - bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index b2efca0..a8c8b17 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/contrib/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -407,7 +407,7 @@ def FPR128 : RegisterClass<"AArch64", // The lower 16 vector registers. Some instructions can only take registers // in this range. def FPR128_lo : RegisterClass<"AArch64", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128, (trunc FPR128, 16)>; // Pairs, triples, and quads of 64-bit vector registers. diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 486efd6..f6ee8cf 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -31,6 +31,11 @@ static cl::opt<bool> EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if " "converter pass"), cl::init(true), cl::Hidden); +// If OS supports TBI, use this flag to enable it. 
+static cl::opt<bool> +UseAddressTopByteIgnored("aarch64-use-tbi", cl::desc("Assume that top byte of " + "an address is ignored"), cl::init(false), cl::Hidden); + AArch64Subtarget & AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) { // Determine default and user-specified characteristics @@ -46,9 +51,11 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasV8_1aOps(false), HasFPARMv8(false), HasNEON(false), HasCrypto(false), - HasCRC(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), - IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), + HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false), + HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false), + HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), + StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian), + CPUString(CPU), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(), TLInfo(TM, *this) {} @@ -113,12 +120,30 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, // bi-directional scheduling. 253.perlbmk. Policy.OnlyTopDown = false; Policy.OnlyBottomUp = false; + // Enabling or Disabling the latency heuristic is a close call: It seems to + // help nearly no benchmark on out-of-order architectures, on the other hand + // it regresses register pressure on a few benchmarking. + if (isCyclone()) + Policy.DisableLatencyHeuristic = true; } bool AArch64Subtarget::enableEarlyIfConversion() const { return EnableEarlyIfConvert; } +bool AArch64Subtarget::supportsAddressTopByteIgnored() const { + if (!UseAddressTopByteIgnored) + return false; + + if (TargetTriple.isiOS()) { + unsigned Major, Minor, Micro; + TargetTriple.getiOSVersion(Major, Minor, Micro); + return Major >= 8; + } + + return false; +} + std::unique_ptr<PBQPRAConstraint> AArch64Subtarget::getCustomPBQPConstraints() const { if (!isCortexA57()) diff --git a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h index 6bb0694..1b8b9b2 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -33,17 +33,21 @@ class Triple; class AArch64Subtarget : public AArch64GenSubtargetInfo { protected: - enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone}; + enum ARMProcFamilyEnum {Others, CortexA35, CortexA53, CortexA57, Cyclone}; /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. ARMProcFamilyEnum ARMProcFamily; bool HasV8_1aOps; + bool HasV8_2aOps; bool HasFPARMv8; bool HasNEON; bool HasCrypto; bool HasCRC; + bool HasPerfMon; + bool HasFullFP16; + bool HasSPE; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. bool HasZeroCycleRegMove; @@ -51,6 +55,12 @@ protected: // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. bool HasZeroCycleZeroing; + // StrictAlign - Disallow unaligned memory accesses. + bool StrictAlign; + + // ReserveX18 - X18 is not available as a general purpose register. + bool ReserveX18; + bool IsLittle; /// CPUString - String name of used CPU. 
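The supportsAddressTopByteIgnored() hook added above models the AArch64 TBI feature: when the OS enables it, bits [63:56] of a virtual address are ignored by address translation, so software may keep a tag in the top byte of a pointer. A minimal user-space sketch of that bit layout, assuming a 64-bit target; it only packs and strips the tag, since dereferencing a tagged pointer is only safe where the OS actually enables TBI, which is what the check above gates on:

#include <cassert>
#include <cstdint>

// Pack a small tag into the top byte of a pointer and strip it again. With
// TBI enabled, loads and stores through the tagged pointer behave as if the
// top byte were zero; this sketch only demonstrates the bit layout.
template <typename T> T *setTopByteTag(T *P, uint8_t Tag) {
  uintptr_t Bits = reinterpret_cast<uintptr_t>(P);
  Bits = (Bits & ~(uintptr_t{0xff} << 56)) | (uintptr_t{Tag} << 56);
  return reinterpret_cast<T *>(Bits);
}

template <typename T> T *clearTopByteTag(T *P) {
  return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(P) &
                               ~(uintptr_t{0xff} << 56));
}

int main() {
  int X = 42;
  int *Tagged = setTopByteTag(&X, 0x2a);
  assert(clearTopByteTag(Tagged) == &X);
  assert(*clearTopByteTag(Tagged) == 42);
  return 0;
}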
@@ -92,19 +102,30 @@ public: const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { - return isCortexA53() || isCortexA57(); + return isGeneric() || isCortexA53() || isCortexA57(); } bool hasV8_1aOps() const { return HasV8_1aOps; } + bool hasV8_2aOps() const { return HasV8_2aOps; } bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; } bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; } + bool requiresStrictAlign() const { return StrictAlign; } + + bool isX18Reserved() const { return ReserveX18; } bool hasFPARMv8() const { return HasFPARMv8; } bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } + /// CPU has TBI (top byte of addresses is ignored during HW address + /// translation) and OS enables it. + bool supportsAddressTopByteIgnored() const; + + bool hasPerfMon() const { return HasPerfMon; } + bool hasFullFP16() const { return HasFullFP16; } + bool hasSPE() const { return HasSPE; } bool isLittleEndian() const { return IsLittle; } @@ -112,11 +133,13 @@ public: bool isTargetIOS() const { return TargetTriple.isiOS(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetWindows() const { return TargetTriple.isOSWindows(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); } bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } + bool isGeneric() const { return CPUString == "generic"; } bool isCyclone() const { return CPUString == "cyclone"; } bool isCortexA57() const { return CPUString == "cortex-a57"; } bool isCortexA53() const { return CPUString == "cortex-a53"; } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index db6e244..c52c554 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -203,7 +203,7 @@ public: } // namespace TargetIRAnalysis AArch64TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(AArch64TTIImpl(this, F)); }); } diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e085cca..9af0e64 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -23,7 +23,7 @@ using namespace llvm; /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. -unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) { +int AArch64TTIImpl::getIntImmCost(int64_t Val) { // Check if the immediate can be encoded within an instruction. if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64)) return 0; @@ -37,7 +37,7 @@ unsigned AArch64TTIImpl::getIntImmCost(int64_t Val) { } /// \brief Calculate the cost of materializing the given constant. 
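The cost helper above counts roughly how many instructions are needed to build a 64-bit constant, which on AArch64 amounts to one MOVZ plus one MOVK per additional non-zero 16-bit chunk. A rough standalone count of that, ignoring the logical-immediate early-out shown above and other cheaper encodings the real helper may use:

#include <cassert>
#include <cstdint>

// One instruction per non-zero 16-bit chunk; zero itself is free, matching
// the early-out above.
static int movzMovkCount(uint64_t V) {
  int Count = 0;
  for (unsigned Shift = 0; Shift < 64; Shift += 16)
    if ((V >> Shift) & 0xffff)
      ++Count;
  return Count;
}

int main() {
  assert(movzMovkCount(0) == 0);                  // zero register / free
  assert(movzMovkCount(0x2a) == 1);               // movz
  assert(movzMovkCount(0x12340000) == 1);         // movz ..., lsl #16
  assert(movzMovkCount(0x123456789abcdef0) == 4); // movz + 3 x movk
  return 0;
}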
-unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -51,18 +51,18 @@ unsigned AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { // Split the constant into 64-bit chunks and calculate the cost for each // chunk. - unsigned Cost = 0; + int Cost = 0; for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); int64_t Val = Tmp.getSExtValue(); Cost += getIntImmCost(Val); } // We need at least one instruction to materialze the constant. - return std::max(1U, Cost); + return std::max(1, Cost); } -unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -118,17 +118,17 @@ unsigned AArch64TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, } if (Idx == ImmIdx) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + int NumConstants = (BitSize + 63) / 64; + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); return (Cost <= NumConstants * TTI::TCC_Basic) - ? static_cast<unsigned>(TTI::TCC_Free) + ? static_cast<int>(TTI::TCC_Free) : Cost; } return AArch64TTIImpl::getIntImmCost(Imm, Ty); } -unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -147,10 +147,10 @@ unsigned AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: if (Idx == 1) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); + int NumConstants = (BitSize + 63) / 64; + int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty); return (Cost <= NumConstants * TTI::TCC_Basic) - ? static_cast<unsigned>(TTI::TCC_Free) + ? static_cast<int>(TTI::TCC_Free) : Cost; } break; @@ -176,8 +176,7 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, - Type *Src) { +int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -187,7 +186,31 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (!SrcTy.isSimple() || !DstTy.isSimple()) return BaseT::getCastInstrCost(Opcode, Dst, Src); - static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = { + static const TypeConversionCostTblEntry + ConversionTbl[] = { + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 }, + + // The number of shll instructions for the extension. 
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, + // LowerVectorINT_TO_FP: { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, @@ -210,6 +233,16 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, + // Complex: to v8f32 + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, + + // Complex: to v16f32 + { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, + // Complex: to v2f64 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, @@ -250,22 +283,21 @@ unsigned AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, }; - int Idx = ConvertCostTableLookup<MVT>( - ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); - if (Idx != -1) - return ConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { +int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); // This type is legalized to a scalar type. if (!LT.second.isVector()) @@ -281,15 +313,15 @@ unsigned AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, } // All other insert/extracts cost this much. - return 2; + return 3; } -unsigned AArch64TTIImpl::getArithmeticInstrCost( +int AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -300,10 +332,9 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence ADD + CMP + SELECT + SRA. // The OperandValue properties many not be same as that of previous // operation; conservatively assume OP_None. 
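The expansion referred to in the comment above replaces a signed divide by a known power of two with an add of a bias for negative inputs, a compare-and-select, and an arithmetic shift right. The equivalent branch-free computation as a plain C++ sketch; it relies on >> being an arithmetic shift for negative values, as the emitted ASR is:

#include <cassert>
#include <cstdint>

// Branch-free signed division by 1 << K, mirroring the ADD + CMP + SELECT +
// SRA sequence: negative inputs are biased by (2^K - 1) so that the
// arithmetic shift rounds toward zero.
static int32_t sdivPow2(int32_t X, unsigned K) {
  int32_t Bias = (int32_t{1} << K) - 1;    // add
  int32_t Adjusted = X < 0 ? X + Bias : X; // cmp + csel
  return Adjusted >> K;                    // asr
}

int main() {
  for (int32_t X : {-9, -8, -7, -1, 0, 1, 7, 8, 9})
    assert(sdivPow2(X, 3) == X / 8);
  return 0;
}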
- unsigned Cost = - getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -331,7 +362,7 @@ unsigned AArch64TTIImpl::getArithmeticInstrCost( } } -unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -346,19 +377,20 @@ unsigned AArch64TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { return 1; } -unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) { int ISD = TLI->InstructionOpcodeToISD(Opcode); - // We don't lower vector selects well that are wider than the register width. + // We don't lower some vector selects well that are wider than the register + // width. if (ValTy->isVectorTy() && ISD == ISD::SELECT) { // We would need this many instructions to hide the scalarization happening. - const unsigned AmortizationCost = 20; - static const TypeConversionCostTblEntry<MVT::SimpleValueType> + const int AmortizationCost = 20; + static const TypeConversionCostTblEntry VectorSelectTbl[] = { - { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost }, - { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost }, - { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost }, + { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, + { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, + { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } @@ -367,20 +399,18 @@ unsigned AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, EVT SelCondTy = TLI->getValueType(DL, CondTy); EVT SelValTy = TLI->getValueType(DL, ValTy); if (SelCondTy.isSimple() && SelValTy.isSimple()) { - int Idx = - ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(), - SelValTy.getSimpleVT()); - if (Idx != -1) - return VectorSelectTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, + SelCondTy.getSimpleVT(), + SelValTy.getSimpleVT())) + return Entry->Cost; } } return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); +int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, unsigned AddressSpace) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 && Src->getVectorElementType()->isIntegerTy(64)) { @@ -389,7 +419,7 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // practice on inlined memcpy code. 
// We make v2i64 stores expensive so that we will only vectorize if there // are 6 other instructions getting vectorized. - unsigned AmortizationCost = 6; + int AmortizationCost = 6; return LT.first * 2 * AmortizationCost; } @@ -407,16 +437,18 @@ unsigned AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return LT.first; } -unsigned AArch64TTIImpl::getInterleavedMemoryOpCost( - unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, - unsigned Alignment, unsigned AddressSpace) { +int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa<VectorType>(VecTy) && "Expect a vector type"); if (Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned NumElts = VecTy->getVectorNumElements(); Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // ldN/stN only support legal vector types of size 64 or 128 in bits. if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) @@ -427,8 +459,8 @@ unsigned AArch64TTIImpl::getInterleavedMemoryOpCost( Alignment, AddressSpace); } -unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { - unsigned Cost = 0; +int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { + int Cost = 0; for (auto *I : Tys) { if (!I->isVectorTy()) continue; @@ -506,7 +538,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_ld4: Info.ReadMem = true; Info.WriteMem = false; - Info.Vol = false; + Info.IsSimple = true; Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(0); break; @@ -515,7 +547,7 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, case Intrinsic::aarch64_neon_st4: Info.ReadMem = false; Info.WriteMem = true; - Info.Vol = false; + Info.IsSimple = true; Info.NumMemRefs = 1; Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1); break; diff --git a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 444d3cc..ec58c4f 100644 --- a/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -48,7 +48,7 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { }; public: - explicit AArch64TTIImpl(const AArch64TargetMachine *TM, Function &F) + explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -63,12 +63,11 @@ public: /// @{ using BaseT::getIntImmCost; - unsigned getIntImmCost(int64_t Val); - unsigned getIntImmCost(const APInt &Imm, Type *Ty); - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + int getIntImmCost(int64_t Val); + int getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); /// @} @@ -76,6 +75,8 @@ public: /// \name Vector TTI Implementations /// @{ + bool enableInterleavedAccessVectorization() { 
return true; } + unsigned getNumberOfRegisters(bool Vector) { if (Vector) { if (ST->hasNEON()) @@ -96,25 +97,25 @@ public: unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getAddressComputationCost(Type *Ty, bool IsComplex); + int getAddressComputationCost(Type *Ty, bool IsComplex); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); - unsigned getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); + int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys); void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); @@ -123,11 +124,9 @@ public: bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef<unsigned> Indices, unsigned Alignment, + unsigned AddressSpace); /// @} }; diff --git a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 38e8b4d..394c8e7 100644 --- a/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/contrib/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -43,7 +43,6 @@ class AArch64Operand; class AArch64AsmParser : public MCTargetAsmParser { private: StringRef Mnemonic; ///< Instruction mnemonic. - MCSubtargetInfo &STI; // Map of register aliases registers via the .req directive. StringMap<std::pair<bool, unsigned> > RegisterReqs; @@ -101,6 +100,7 @@ private: OperandMatchResultTy tryParseSysReg(OperandVector &Operands); OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands); OperandMatchResultTy tryParsePrefetch(OperandVector &Operands); + OperandMatchResultTy tryParsePSBHint(OperandVector &Operands); OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands); OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands); OperandMatchResultTy tryParseFPImm(OperandVector &Operands); @@ -115,16 +115,16 @@ public: #define GET_OPERAND_DIAGNOSTIC_TYPES #include "AArch64GenAsmMatcher.inc" }; - AArch64AsmParser(MCSubtargetInfo &STI, MCAsmParser &Parser, + AArch64AsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI) { + : MCTargetAsmParser(Options, STI) { MCAsmParserExtension::Initialize(Parser); MCStreamer &S = getParser().getStreamer(); if (S.getTargetStreamer() == nullptr) new AArch64TargetStreamer(S); // Initialize the set of available features. 
- setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, @@ -160,7 +160,8 @@ private: k_Prefetch, k_ShiftExtend, k_FPImm, - k_Barrier + k_Barrier, + k_PSBHint, } Kind; SMLoc StartLoc, EndLoc; @@ -228,6 +229,12 @@ private: unsigned Length; }; + struct PSBHintOp { + unsigned Val; + const char *Data; + unsigned Length; + }; + struct ShiftExtendOp { AArch64_AM::ShiftExtendType Type; unsigned Amount; @@ -251,6 +258,7 @@ private: struct SysRegOp SysReg; struct SysCRImmOp SysCRImm; struct PrefetchOp Prefetch; + struct PSBHintOp PSBHint; struct ShiftExtendOp ShiftExtend; }; @@ -302,6 +310,9 @@ public: case k_Prefetch: Prefetch = o.Prefetch; break; + case k_PSBHint: + PSBHint = o.PSBHint; + break; case k_ShiftExtend: ShiftExtend = o.ShiftExtend; break; @@ -393,6 +404,16 @@ public: return Prefetch.Val; } + unsigned getPSBHint() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return PSBHint.Val; + } + + StringRef getPSBHintName() const { + assert(Kind == k_PSBHint && "Invalid access!"); + return StringRef(PSBHint.Data, PSBHint.Length); + } + StringRef getPrefetchName() const { assert(Kind == k_Prefetch && "Invalid access!"); return StringRef(Prefetch.Data, Prefetch.Length); @@ -497,6 +518,15 @@ public: return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000; } + bool isImm0_1() const { + if (!isImm()) + return false; + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm()); + if (!MCE) + return false; + int64_t Val = MCE->getValue(); + return (Val >= 0 && Val < 2); + } bool isImm0_7() const { if (!isImm()) return false; @@ -876,12 +906,15 @@ public: } bool isMSRSystemRegister() const { if (!isSysReg()) return false; - return SysReg.MSRReg != -1U; } - bool isSystemPStateField() const { + bool isSystemPStateFieldWithImm0_1() const { if (!isSysReg()) return false; - + return (SysReg.PStateField == AArch64PState::PAN || + SysReg.PStateField == AArch64PState::UAO); + } + bool isSystemPStateFieldWithImm0_15() const { + if (!isSysReg() || isSystemPStateFieldWithImm0_1()) return false; return SysReg.PStateField != -1U; } bool isReg() const override { return Kind == k_Register && !Reg.isVector; } @@ -950,6 +983,7 @@ public: } bool isSysCR() const { return Kind == k_SysCR; } bool isPrefetch() const { return Kind == k_Prefetch; } + bool isPSBHint() const { return Kind == k_PSBHint; } bool isShiftExtend() const { return Kind == k_ShiftExtend; } bool isShifter() const { if (!isShiftExtend()) @@ -1175,8 +1209,10 @@ public: template <unsigned NumRegs> void addVectorList64Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { AArch64::D0, AArch64::D0_D1, - AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 }; + static const unsigned FirstRegs[] = { AArch64::D0, + AArch64::D0_D1, + AArch64::D0_D1_D2, + AArch64::D0_D1_D2_D3 }; unsigned FirstReg = FirstRegs[NumRegs - 1]; Inst.addOperand( @@ -1186,8 +1222,10 @@ public: template <unsigned NumRegs> void addVectorList128Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); - static unsigned FirstRegs[] = { AArch64::Q0, AArch64::Q0_Q1, - AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 }; + static const unsigned FirstRegs[] = { AArch64::Q0, + AArch64::Q0_Q1, + AArch64::Q0_Q1_Q2, + AArch64::Q0_Q1_Q2_Q3 }; unsigned FirstReg = FirstRegs[NumRegs - 1]; Inst.addOperand( @@ -1304,6 +1342,12 @@ public: 
Inst.addOperand(MCOperand::createImm(MCE->getValue() / 16)); } + void addImm0_1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(MCE->getValue())); + } + void addImm0_7Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm()); @@ -1491,7 +1535,13 @@ public: Inst.addOperand(MCOperand::createImm(SysReg.MSRReg)); } - void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const { + void addSystemPStateFieldWithImm0_1Operands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + + Inst.addOperand(MCOperand::createImm(SysReg.PStateField)); + } + + void addSystemPStateFieldWithImm0_15Operands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createImm(SysReg.PStateField)); @@ -1507,6 +1557,11 @@ public: Inst.addOperand(MCOperand::createImm(getPrefetch())); } + void addPSBHintOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createImm(getPSBHint())); + } + void addShifterOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); unsigned Imm = @@ -1703,6 +1758,19 @@ public: return Op; } + static std::unique_ptr<AArch64Operand> CreatePSBHint(unsigned Val, + StringRef Str, + SMLoc S, + MCContext &Ctx) { + auto Op = make_unique<AArch64Operand>(k_PSBHint, Ctx); + Op->PSBHint.Val = Val; + Op->PSBHint.Data = Str.data(); + Op->PSBHint.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return Op; + } + static std::unique_ptr<AArch64Operand> CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp, unsigned Val, bool HasExplicitAmount, SMLoc S, SMLoc E, MCContext &Ctx) { @@ -1776,6 +1844,10 @@ void AArch64Operand::print(raw_ostream &OS) const { OS << "<prfop invalid #" << getPrefetch() << ">"; break; } + case k_PSBHint: { + OS << getPSBHintName(); + break; + } case k_ShiftExtend: { OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #" << getShiftExtendAmount(); @@ -1849,6 +1921,8 @@ static bool isValidVectorKind(StringRef Name) { .Case(".h", true) .Case(".s", true) .Case(".d", true) + // Needed for fp16 scalar pairwise reductions + .Case(".2h", true) .Default(false); } @@ -2016,7 +2090,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { bool Valid; auto Mapper = AArch64PRFM::PRFMMapper(); StringRef Name = - Mapper.toString(MCE->getValue(), STI.getFeatureBits(), Valid); + Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name, S, getContext())); return MatchOperand_Success; @@ -2030,7 +2104,7 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { bool Valid; auto Mapper = AArch64PRFM::PRFMMapper(); unsigned prfop = - Mapper.fromString(Tok.getString(), STI.getFeatureBits(), Valid); + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); if (!Valid) { TokError("pre-fetch hint expected"); return MatchOperand_ParseFail; @@ -2042,6 +2116,32 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_Success; } +/// tryParsePSBHint - Try to parse a PSB operand, mapped to Hint command +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) { + MCAsmParser &Parser = 
getParser(); + SMLoc S = getLoc(); + const AsmToken &Tok = Parser.getTok(); + if (Tok.isNot(AsmToken::Identifier)) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + bool Valid; + auto Mapper = AArch64PSBHint::PSBHintMapper(); + unsigned psbhint = + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); + if (!Valid) { + TokError("invalid operand for instruction"); + return MatchOperand_ParseFail; + } + + Parser.Lex(); // Eat identifier token. + Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(), + S, getContext())); + return MatchOperand_Success; +} + /// tryParseAdrpLabel - Parse and validate a source label for the ADRP /// instruction. AArch64AsmParser::OperandMatchResultTy @@ -2439,6 +2539,13 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, } else if (!Op.compare_lower("cisw")) { // SYS #0, C7, C14, #2 SYS_ALIAS(0, 7, 14, 2); + } else if (!Op.compare_lower("cvap")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #3, C7, C12, #1 + SYS_ALIAS(3, 7, 12, 1); + } else { + return TokError("DC CVAP requires ARMv8.2a"); + } } else { return TokError("invalid operand for DC instruction"); } @@ -2479,6 +2586,20 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, } else if (!Op.compare_lower("s12e0w")) { // SYS #4, C7, C8, #7 SYS_ALIAS(4, 7, 8, 7); + } else if (!Op.compare_lower("s1e1rp")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #0, C7, C9, #0 + SYS_ALIAS(0, 7, 9, 0); + } else { + return TokError("AT S1E1RP requires ARMv8.2a"); + } + } else if (!Op.compare_lower("s1e1wp")) { + if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) { + // SYS #0, C7, C9, #1 + SYS_ALIAS(0, 7, 9, 1); + } else { + return TokError("AT S1E1WP requires ARMv8.2a"); + } } else { return TokError("invalid operand for AT instruction"); } @@ -2644,7 +2765,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { bool Valid; auto Mapper = AArch64DB::DBarrierMapper(); StringRef Name = - Mapper.toString(MCE->getValue(), STI.getFeatureBits(), Valid); + Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name, ExprLoc, getContext())); return MatchOperand_Success; @@ -2658,7 +2779,7 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { bool Valid; auto Mapper = AArch64DB::DBarrierMapper(); unsigned Opt = - Mapper.fromString(Tok.getString(), STI.getFeatureBits(), Valid); + Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); if (!Valid) { TokError("invalid barrier option name"); return MatchOperand_ParseFail; @@ -2687,20 +2808,21 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { bool IsKnown; auto MRSMapper = AArch64SysReg::MRSMapper(); - uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), STI.getFeatureBits(), - IsKnown); + uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); assert(IsKnown == (MRSReg != -1U) && "register should be -1 if and only if it's unknown"); auto MSRMapper = AArch64SysReg::MSRMapper(); - uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), STI.getFeatureBits(), - IsKnown); + uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); assert(IsKnown == (MSRReg != -1U) && "register should be -1 if and only if it's unknown"); auto PStateMapper = AArch64PState::PStateMapper(); uint32_t PStateField = - 
PStateMapper.fromString(Tok.getString(), STI.getFeatureBits(), IsKnown); + PStateMapper.fromString(Tok.getString(), + getSTI().getFeatureBits(), IsKnown); assert(IsKnown == (PStateField != -1U) && "register should be -1 if and only if it's unknown"); @@ -3151,7 +3273,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, if (Operands.size() < 2 || !static_cast<AArch64Operand &>(*Operands[1]).isReg()) - return true; + return Error(Loc, "Only valid when first operand is register"); bool IsXReg = AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains( @@ -3183,7 +3305,7 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, } // If it is a label or an imm that cannot fit in a movz, put it into CP. const MCExpr *CPLoc = - getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4); + getTargetStreamer().addConstantPoolEntry(SubExprVal, IsXReg ? 8 : 4, Loc); Operands.push_back(AArch64Operand::CreateImm(CPLoc, S, E, Ctx)); return false; } @@ -3601,6 +3723,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { return Error(Loc, "index must be a multiple of 8 in range [0, 32760]."); case Match_InvalidMemoryIndexed16: return Error(Loc, "index must be a multiple of 16 in range [0, 65520]."); + case Match_InvalidImm0_1: + return Error(Loc, "immediate must be an integer in range [0, 1]."); case Match_InvalidImm0_7: return Error(Loc, "immediate must be an integer in range [0, 7]."); case Match_InvalidImm0_15: @@ -3912,7 +4036,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, AArch64Operand &ImmOp = static_cast<AArch64Operand &>(*Operands[2]); if (RegOp.isReg() && ImmOp.isFPImm() && ImmOp.getFPImm() == (unsigned)-1) { unsigned zreg = - AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains( + !AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains( RegOp.getReg()) ? AArch64::WZR : AArch64::XZR; @@ -3929,10 +4053,27 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // If that fails, try against the alternate table containing long-form NEON: // "fadd v0.2s, v1.2s, v2.2s" - if (MatchResult != Match_Success) + if (MatchResult != Match_Success) { + // But first, save the short-form match result: we can use it in case the + // long-form match also fails. + auto ShortFormNEONErrorInfo = ErrorInfo; + auto ShortFormNEONMatchResult = MatchResult; + MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0); + // Now, both matches failed, and the long-form match failed on the mnemonic + // suffix token operand. The short-form match failure is probably more + // relevant: use it instead. 
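+    // Illustrative case (editor's sketch, not lines from the upstream patch):
+    // for short-form input such as "fadd.2s v0, v1, v2" with a bad operand,
+    // the parser keeps ".2s" as a suffix token at Operands[1], so the
+    // long-form table can only report Match_InvalidOperand on that suffix
+    // token (ErrorInfo == 1); the saved short-form result instead points at
+    // the operand the user actually got wrong, which is the more useful
+    // diagnostic to surface.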
+ if (MatchResult == Match_InvalidOperand && ErrorInfo == 1 && + Operands.size() > 1 && ((AArch64Operand &)*Operands[1]).isToken() && + ((AArch64Operand &)*Operands[1]).isTokenSuffix()) { + MatchResult = ShortFormNEONMatchResult; + ErrorInfo = ShortFormNEONErrorInfo; + } + } + + switch (MatchResult) { case Match_Success: { // Perform range checking and other semantic validations @@ -3944,7 +4085,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return true; Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; } case Match_MissingFeature: { @@ -3966,6 +4107,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return showMatchError(IDLoc, MatchResult); case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0ULL) { if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); @@ -4011,6 +4153,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidMemoryIndexed8SImm7: case Match_InvalidMemoryIndexed16SImm7: case Match_InvalidMemoryIndexedSImm9: + case Match_InvalidImm0_1: case Match_InvalidImm0_7: case Match_InvalidImm0_15: case Match_InvalidImm0_31: @@ -4083,7 +4226,7 @@ bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { if (getParser().parseExpression(Value)) return true; - getParser().getStreamer().EmitValue(Value, Size); + getParser().getStreamer().EmitValue(Value, Size, L); if (getLexer().is(AsmToken::EndOfStatement)) break; @@ -4155,7 +4298,7 @@ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) { Inst.setOpcode(AArch64::TLSDESCCALL); Inst.addOperand(MCOperand::createExpr(Expr)); - getParser().getStreamer().EmitInstruction(Inst, STI); + getParser().getStreamer().EmitInstruction(Inst, getSTI()); return false; } diff --git a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index db9fb0e..f1f968e 100644 --- a/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/contrib/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -1516,6 +1516,10 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, uint64_t pstate_field = (op1 << 3) | op2; + if ((pstate_field == AArch64PState::PAN || + pstate_field == AArch64PState::UAO) && crm > 1) + return Fail; + Inst.addOperand(MCOperand::createImm(pstate_field)); Inst.addOperand(MCOperand::createImm(crm)); diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index 7f56c2c..d8a8108 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -55,7 +56,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, unsigned Opcode = MI->getOpcode(); if (Opcode == AArch64::SYSxt) - if (printSysAlias(MI, O)) { + if (printSysAlias(MI, STI, O)) { printAnnotation(O, Annot); return; } @@ -269,7 +270,7 @@ struct LdStNInstrDesc { int NaturalOffset; }; -static LdStNInstrDesc LdStNInstInfo[] = { +static const LdStNInstrDesc LdStNInstInfo[] = { { AArch64::LD1i8, "ld1", 
".b", 1, true, 0 }, { AArch64::LD1i16, "ld1", ".h", 1, true, 0 }, { AArch64::LD1i32, "ld1", ".s", 1, true, 0 }, @@ -612,7 +613,7 @@ static LdStNInstrDesc LdStNInstInfo[] = { { AArch64::ST4Fourv2s_POST, "st4", ".2s", 1, false, 32 }, }; -static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { +static const LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) { unsigned Idx; for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx) if (LdStNInstInfo[Idx].Opcode == Opcode) @@ -641,7 +642,7 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } - if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { + if (const LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) { O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t'; // Now onto the operands: first a vector list with possible lane @@ -674,7 +675,9 @@ void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O, AArch64InstPrinter::printInst(MI, O, Annot, STI); } -bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { +bool AArch64InstPrinter::printSysAlias(const MCInst *MI, + const MCSubtargetInfo &STI, + raw_ostream &O) { #ifndef NDEBUG unsigned Opcode = MI->getOpcode(); assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!"); @@ -729,6 +732,11 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { if (Op1Val == 3 && Op2Val == 1) Asm = "dc\tcvau"; break; + case 12: + if (Op1Val == 3 && Op2Val == 1 && + (STI.getFeatureBits()[AArch64::HasV8_2aOps])) + Asm = "dc\tcvap"; + break; case 14: if (Op1Val == 3 && Op2Val == 1) Asm = "dc\tcivac"; @@ -773,6 +781,21 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) { break; } break; + case 9: + switch (Op1Val) { + default: + break; + case 0: + if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) { + switch (Op2Val) { + default: + break; + case 0: Asm = "at\ts1e1rp"; break; + case 1: Asm = "at\ts1e1wp"; break; + } + } + break; + } } } else if (CnVal == 8) { // TLBI aliases @@ -1122,6 +1145,19 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, O << '#' << prfop; } +void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, + raw_ostream &O) { + unsigned psbhintop = MI->getOperand(OpNum).getImm(); + bool Valid; + StringRef Name = + AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid); + if (Valid) + O << Name; + else + O << '#' << psbhintop; +} + void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index 15dee97..ea68d98 100644 --- a/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/contrib/llvm/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -15,14 +15,10 @@ #define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H #include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class AArch64InstPrinter : public MCInstPrinter { public: AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, @@ -48,7 +44,8 @@ public: unsigned AltIdx = AArch64::NoRegAltName); protected: - bool printSysAlias(const MCInst *MI, raw_ostream &O); + bool printSysAlias(const MCInst *MI, const MCSubtargetInfo &STI, + 
raw_ostream &O); // Operand printers void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); @@ -122,6 +119,9 @@ protected: void printPrefetchOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); + void printPSBHintOp(const MCInst *MI, unsigned OpNum, + const MCSubtargetInfo &STI, raw_ostream &O); + void printFPImmOperand(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O); diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index ed24343..648b1df 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -364,6 +364,32 @@ static inline float getFPImmFloat(unsigned Imm) { return FPUnion.F; } +/// getFP16Imm - Return an 8-bit floating-point version of the 16-bit +/// floating-point value. If the value cannot be represented as an 8-bit +/// floating-point value, then return -1. +static inline int getFP16Imm(const APInt &Imm) { + uint32_t Sign = Imm.lshr(15).getZExtValue() & 1; + int32_t Exp = (Imm.lshr(10).getSExtValue() & 0x1f) - 15; // -14 to 15 + int32_t Mantissa = Imm.getZExtValue() & 0x3ff; // 10 bits + + // We can handle 4 bits of mantissa. + // mantissa = (16+UInt(e:f:g:h))/16. + if (Mantissa & 0x3f) + return -1; + Mantissa >>= 6; + + // We can handle 3 bits of exponent: exp == UInt(NOT(b):c:d)-3 + if (Exp < -3 || Exp > 4) + return -1; + Exp = ((Exp+3) & 0x7) ^ 4; + + return ((int)Sign << 7) | (Exp << 4) | Mantissa; +} + +static inline int getFP16Imm(const APFloat &FPImm) { + return getFP16Imm(FPImm.bitcastToAPInt()); +} + /// getFP32Imm - Return an 8-bit floating-point version of the 32-bit /// floating-point value. If the value cannot be represented as an 8-bit /// floating-point value, then return -1. diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 16d5356..d26604f 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -128,10 +128,9 @@ public: /// This is one of the functions used to emit data into an ELF section, so the /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d) /// if necessary. - void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override { + void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size); + MCELFStreamer::EmitValueImpl(Value, Size, Loc); } private: diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 921c4b9..fbce26e 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -48,10 +48,6 @@ AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { UseDataRegionDirectives = true; ExceptionsType = ExceptionHandling::DwarfCFI; - - // AArch64 Darwin doesn't have the baggage of X86/ARM, so it's fine to use - // LShr instead of AShr. 
- UseLogicalShr = true; } const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol( diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp index 2870341..a540f49 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp @@ -85,13 +85,13 @@ void AArch64MCExpr::visitUsedExpr(MCStreamer &Streamer) const { Streamer.visitUsedExpr(*getSubExpr()); } -MCSection *AArch64MCExpr::findAssociatedSection() const { +MCFragment *AArch64MCExpr::findAssociatedFragment() const { llvm_unreachable("FIXME: what goes here?"); } bool AArch64MCExpr::evaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const { + const MCAsmLayout *Layout, + const MCFixup *Fixup) const { if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup)) return false; diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h index 1165314..db36a65 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h @@ -149,11 +149,10 @@ public: void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override; + MCFragment *findAssociatedFragment() const override; - bool evaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout, - const MCFixup *Fixup) const override; + bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const override; void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; @@ -162,7 +161,6 @@ public: } static bool classof(const AArch64MCExpr *) { return true; } - }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp index 741b273..61c96f1 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp @@ -90,9 +90,11 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo( Log2Size = llvm::Log2_32(4); // This encompasses the relocation for the whole 21-bit value. switch (Sym->getKind()) { - default: - Asm.getContext().reportFatalError(Fixup.getLoc(), - "ADR/ADRP relocations must be GOT relative"); + default: { + Asm.getContext().reportError(Fixup.getLoc(), + "ADR/ADRP relocations must be GOT relative"); + return false; + } case MCSymbolRefExpr::VK_PAGE: RelocType = unsigned(MachO::ARM64_RELOC_PAGE21); return true; @@ -170,25 +172,25 @@ void AArch64MachObjectWriter::recordRelocation( // assembler local symbols. If we got here, that's not what we have, // so complain loudly. if (Kind == AArch64::fixup_aarch64_pcrel_branch19) { - Asm.getContext().reportFatalError(Fixup.getLoc(), - "conditional branch requires assembler-local" - " label. '" + - Target.getSymA()->getSymbol().getName() + - "' is external."); + Asm.getContext().reportError(Fixup.getLoc(), + "conditional branch requires assembler-local" + " label. '" + + Target.getSymA()->getSymbol().getName() + + "' is external."); return; } // 14-bit branch relocations should only target internal labels, and so // should never get here. 
if (Kind == AArch64::fixup_aarch64_pcrel_branch14) { - Asm.getContext().reportFatalError(Fixup.getLoc(), - "Invalid relocation on conditional branch!"); + Asm.getContext().reportError(Fixup.getLoc(), + "Invalid relocation on conditional branch!"); return; } if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size, - Asm)) { - Asm.getContext().reportFatalError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); + Asm)) { + Asm.getContext().reportError(Fixup.getLoc(), "unknown AArch64 fixup kind!"); return; } @@ -200,8 +202,9 @@ void AArch64MachObjectWriter::recordRelocation( Type = MachO::ARM64_RELOC_UNSIGNED; if (IsPCRel) { - Asm.getContext().reportFatalError(Fixup.getLoc(), - "PC relative absolute relocation!"); + Asm.getContext().reportError(Fixup.getLoc(), + "PC relative absolute relocation!"); + return; // FIXME: x86_64 sets the type to a branch reloc here. Should we do // something similar? @@ -229,16 +232,20 @@ void AArch64MachObjectWriter::recordRelocation( Writer->addRelocation(A_Base, Fragment->getParent(), MRE); return; } else if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { // Otherwise, neither symbol can be modified. - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported relocation of modified symbol"); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + return; + } // We don't support PCrel relocations of differences. - if (IsPCRel) - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported pc-relative relocation of " - "difference"); + if (IsPCRel) { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported pc-relative relocation of " + "difference"); + return; + } // AArch64 always uses external relocations. If there is no symbol to use as // a base address (a local symbol with no preceding non-local symbol), @@ -246,20 +253,26 @@ void AArch64MachObjectWriter::recordRelocation( // // FIXME: We should probably just synthesize an external symbol and use // that. - if (!A_Base) - Asm.getContext().reportFatalError( + if (!A_Base) { + Asm.getContext().reportError( Fixup.getLoc(), "unsupported relocation of local symbol '" + A->getName() + "'. Must have non-local symbol earlier in section."); - if (!B_Base) - Asm.getContext().reportFatalError( + return; + } + if (!B_Base) { + Asm.getContext().reportError( Fixup.getLoc(), "unsupported relocation of local symbol '" + B->getName() + "'. Must have non-local symbol earlier in section."); + return; + } - if (A_Base == B_Base && A_Base) - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported relocation with identical base"); + if (A_Base == B_Base && A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation with identical base"); + return; + } Value += (!A->getFragment() ? 0 : Writer->getSymbolAddress(*A, Layout)) - (!A_Base || !A_Base->getFragment() ? 0 : Writer->getSymbolAddress( @@ -309,10 +322,12 @@ void AArch64MachObjectWriter::recordRelocation( // we need to preserve and merge with the new Target? How about // the FixedValue? 
if (!Symbol->getVariableValue()->evaluateAsRelocatable(Target, &Layout, - &Fixup)) - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unable to resolve variable '" + - Symbol->getName() + "'"); + &Fixup)) { + Asm.getContext().reportError(Fixup.getLoc(), + "unable to resolve variable '" + + Symbol->getName() + "'"); + return; + } return recordRelocation(Writer, Asm, Layout, Fragment, Fixup, Target, FixedValue); } @@ -337,11 +352,13 @@ void AArch64MachObjectWriter::recordRelocation( Value += Layout.getSymbolOffset(*Symbol) - Layout.getSymbolOffset(*Base); } else if (Symbol->isInSection()) { - if (!CanUseLocalRelocation) - Asm.getContext().reportFatalError( + if (!CanUseLocalRelocation) { + Asm.getContext().reportError( Fixup.getLoc(), "unsupported relocation of local symbol '" + Symbol->getName() + "'. Must have non-local symbol earlier in section."); + return; + } // Adjust the relocation to be section-relative. // The index is the section ordinal (1-based). const MCSection &Sec = Symbol->getSection(); @@ -361,9 +378,10 @@ void AArch64MachObjectWriter::recordRelocation( return; } } - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation of variable '" + Symbol->getName() + "'"); + return; } } diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp index 52b000d..3e86a42 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp @@ -26,8 +26,9 @@ AArch64TargetStreamer::~AArch64TargetStreamer() {} // The constant pool handling is shared by all AArch64TargetStreamer // implementations. const MCExpr *AArch64TargetStreamer::addConstantPoolEntry(const MCExpr *Expr, - unsigned Size) { - return ConstantPools->addEntry(Streamer, Expr, Size); + unsigned Size, + SMLoc Loc) { + return ConstantPools->addEntry(Streamer, Expr, Size, Loc); } void AArch64TargetStreamer::emitCurrentConstantPool() { diff --git a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h index fcc0d05..51432830 100644 --- a/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h +++ b/contrib/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.h @@ -24,7 +24,7 @@ public: /// Callback used to implement the ldr= pseudo. /// Add a new entry to the constant pool for the current section and return an /// MCExpr that can be used to refer to the constant pool location. - const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size); + const MCExpr *addConstantPoolEntry(const MCExpr *, unsigned Size, SMLoc Loc); /// Callback used to implemnt the .ltorg directive. /// Emit contents of constant pool for the current section. 
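The asm-parser, disassembler, and instruction-printer hunks above, together with the AArch64BaseInfo changes that follow, all route the new ARMv8.2-A and Statistical Profiling operands (PSB hints, the UAO PState, the SPE system registers) through the shared AArch64NamedImmMapper tables. As a rough illustration of how those tables are consumed, here is a minimal C++ sketch built only from the fromString/toString calls visible in this diff; the include path, the FeatureBitset plumbing, and the helper names are assumptions of the sketch, not code from the patch.

  // Editor's sketch: resolving the SPE "psb csync" operand both ways via the
  // PSBHintMapper introduced in AArch64BaseInfo.{h,cpp} below. Assumes an
  // MCSubtargetInfo whose feature bits include FeatureSPE.
  #include "Utils/AArch64BaseInfo.h"      // assumed include path
  #include "llvm/ADT/StringRef.h"
  #include "llvm/MC/MCSubtargetInfo.h"
  using namespace llvm;

  static unsigned lookupPSBHint(StringRef Name, const MCSubtargetInfo &STI) {
    bool Valid;
    auto Mapper = AArch64PSBHint::PSBHintMapper();
    // "csync" resolves to 0x11 only when FeatureSPE is present; otherwise
    // Valid is left false and the asm parser reports an invalid operand.
    unsigned Val = Mapper.fromString(Name, STI.getFeatureBits(), Valid);
    return Valid ? Val : -1U;
  }

  static StringRef printPSBHint(unsigned Imm, const MCSubtargetInfo &STI) {
    bool Valid;
    // Reverse direction, as used by printPSBHintOp: 0x11 comes back as
    // "csync"; for unknown encodings the printer falls back to "#<imm>".
    return AArch64PSBHint::PSBHintMapper().toString(Imm, STI.getFeatureBits(),
                                                    Valid);
  }

The same feature-gated lookup pattern covers the new SPE registers (pmblimitr_el1 and friends) and the v8.2a-only entries added to the MRS/MSR and PState tables in the hunks below.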
diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index ee85b65b..78f5289 100644 --- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -146,11 +146,22 @@ const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings // v8.1a "Privileged Access Never" extension-specific PStates {"pan", PAN, {AArch64::HasV8_1aOps}}, + + // v8.2a + {"uao", UAO, {AArch64::HasV8_2aOps}}, }; AArch64PState::PStateMapper::PStateMapper() : AArch64NamedImmMapper(PStateMappings, 0) {} +const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = { + // v8.2a "Statistical Profiling" extension-specific PSB operand + {"csync", CSync, {AArch64::FeatureSPE}}, +}; + +AArch64PSBHint::PSBHintMapper::PSBHintMapper() + : AArch64NamedImmMapper(PSBHintMappings, 0) {} + const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { {"mdccsr_el0", MDCCSR_EL0, {}}, {"dbgdtrrx_el0", DBGDTRRX_EL0, {}}, @@ -192,6 +203,7 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { {"id_aa64isar1_el1", ID_A64ISAR1_EL1, {}}, {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1, {}}, {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1, {}}, + {"id_aa64mmfr2_el1", ID_A64MMFR2_EL1, {AArch64::HasV8_2aOps}}, {"mvfr0_el1", MVFR0_EL1, {}}, {"mvfr1_el1", MVFR1_EL1, {}}, {"mvfr2_el1", MVFR2_EL1, {}}, @@ -275,9 +287,6 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = { {"icc_sgi1r_el1", ICC_SGI1R_EL1, {}}, {"icc_asgi1r_el1", ICC_ASGI1R_EL1, {}}, {"icc_sgi0r_el1", ICC_SGI0R_EL1, {}}, - - // v8.1a "Privileged Access Never" extension-specific system registers - {"pan", PAN, {AArch64::HasV8_1aOps}}, }; AArch64SysReg::MSRMapper::MSRMapper() { @@ -804,6 +813,24 @@ const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings {"cntv_cval_el02", CNTV_CVAL_EL02, {AArch64::HasV8_1aOps}}, {"spsr_el12", SPSR_EL12, {AArch64::HasV8_1aOps}}, {"elr_el12", ELR_EL12, {AArch64::HasV8_1aOps}}, + + // v8.2a registers + {"uao", UAO, {AArch64::HasV8_2aOps}}, + + // v8.2a "Statistical Profiling extension" registers + {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}}, + {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}}, + {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}}, + {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}}, + {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}}, + {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}}, + {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}}, + {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}}, + {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}}, + {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}}, + {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}}, + {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}}, + {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}}, }; uint32_t diff --git a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 7e42f8e..f649cb9 100644 --- a/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -337,7 +337,9 @@ namespace AArch64AT { S12E1R = 0x63c4, // 01 100 0111 1000 100 S12E1W = 0x63c5, // 01 100 0111 1000 101 S12E0R = 0x63c6, // 01 100 0111 1000 110 - S12E0W = 0x63c7 // 01 100 0111 1000 111 + S12E0W = 0x63c7, // 01 100 0111 1000 111 + S1E1RP = 0x43c8, // 01 000 0111 1001 000 + S1E1WP = 0x43c9 // 01 000 0111 1001 
001 }; struct ATMapper : AArch64NamedImmMapper { @@ -463,6 +465,9 @@ namespace AArch64PState { // v8.1a "Privileged Access Never" extension-specific PStates PAN = 0x04, + + // v8.2a "User Access Override" extension-specific PStates + UAO = 0x03 }; struct PStateMapper : AArch64NamedImmMapper { @@ -473,6 +478,21 @@ namespace AArch64PState { } +namespace AArch64PSBHint { + enum PSBHintValues { + Invalid = -1, + // v8.2a "Statistical Profiling" extension-specific PSB operands + CSync = 0x11, // psb csync = hint #0x11 + }; + + struct PSBHintMapper : AArch64NamedImmMapper { + const static Mapping PSBHintMappings[]; + + PSBHintMapper(); + }; + +} + namespace AArch64SE { enum ShiftExtSpecifiers { Invalid = -1, @@ -594,6 +614,7 @@ namespace AArch64SysReg { ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 + ID_A64MMFR2_EL1 = 0xc03a, // 11 000 0000 0111 010 MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 @@ -1190,6 +1211,24 @@ namespace AArch64SysReg { SPSR_EL12 = 0xea00, // 11 101 0100 0000 000 ELR_EL12 = 0xea01, // 11 101 0100 0000 001 + // v8.2a registers + UAO = 0xc214, // 11 000 0100 0010 100 + + // v8.2a "Statistical Profiling extension" registers + PMBLIMITR_EL1 = 0xc4d0, // 11 000 1001 1010 000 + PMBPTR_EL1 = 0xc4d1, // 11 000 1001 1010 001 + PMBSR_EL1 = 0xc4d3, // 11 000 1001 1010 011 + PMBIDR_EL1 = 0xc4d7, // 11 000 1001 1010 111 + PMSCR_EL2 = 0xe4c8, // 11 100 1001 1001 000 + PMSCR_EL12 = 0xecc8, // 11 101 1001 1001 000 + PMSCR_EL1 = 0xc4c8, // 11 000 1001 1001 000 + PMSICR_EL1 = 0xc4ca, // 11 000 1001 1001 010 + PMSIRR_EL1 = 0xc4cb, // 11 000 1001 1001 011 + PMSFCR_EL1 = 0xc4cc, // 11 000 1001 1001 100 + PMSEVFR_EL1 = 0xc4cd, // 11 000 1001 1001 101 + PMSLATFR_EL1 = 0xc4ce, // 11 000 1001 1001 110 + PMSIDR_EL1 = 0xc4cf, // 11 000 1001 1001 111 + // Cyclone specific system registers CPM_IOACC_CTL_EL3 = 0xff90, }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h index 0a05d25..8c3cb56 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -44,15 +44,21 @@ FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSIFixControlFlowLiveIntervalsPass(); -FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm); +FunctionPass *createSIFixSGPRCopiesPass(); FunctionPass *createSIFixSGPRLiveRangesPass(); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSIInsertWaits(TargetMachine &tm); -FunctionPass *createSIPrepareScratchRegs(); + +ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); +void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); +extern char &AMDGPUAnnotateKernelFeaturesID; void initializeSIFoldOperandsPass(PassRegistry &); extern char &SIFoldOperandsID; +void initializeSIFixSGPRCopiesPass(PassRegistry &); +extern char &SIFixSGPRCopiesID; + void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; @@ -64,6 +70,8 @@ FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); ModulePass *createAMDGPUAlwaysInlinePass(); +ModulePass *createAMDGPUOpenCLImageTypeLoweringPass(); +FunctionPass 
*createAMDGPUAnnotateUniformValues(); void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&); extern char &SIFixControlFlowLiveIntervalsID; @@ -71,6 +79,8 @@ extern char &SIFixControlFlowLiveIntervalsID; void initializeSIFixSGPRLiveRangesPass(PassRegistry&); extern char &SIFixSGPRLiveRangesID; +void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&); +extern char &AMDGPUAnnotateUniformValuesPassID; extern Target TheAMDGPUTarget; extern Target TheGCNTarget; @@ -85,8 +95,6 @@ enum TargetIndex { }; } -#define END_OF_TEXT_LABEL_NAME "EndOfTextLabel" - } // End namespace llvm namespace ShaderType { diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td index 68b5050..d4af8d2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -108,6 +108,11 @@ def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature <"unsafe-ds-offset-fol "true", "Force using DS instruction immediate offsets on SI">; +def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global", + "FlatForGlobal", + "true", + "Force to generate flat instruction for global">; + def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "FlatAddressSpace", "true", @@ -272,9 +277,14 @@ def isSICI : Predicate< "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS" >, AssemblerPredicate<"FeatureGCN1Encoding">; +def isVI : Predicate < + "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, + AssemblerPredicate<"FeatureGCN3Encoding">; + class PredicateControl { Predicate SubtargetPredicate; Predicate SIAssemblerPredicate = isSICI; + Predicate VIAssemblerPredicate = isVI; list<Predicate> AssemblerPredicates = []; Predicate AssemblerPredicate = TruePredicate; list<Predicate> OtherPredicates = []; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp new file mode 100644 index 0000000..3781839 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -0,0 +1,126 @@ +//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This pass adds target attributes to functions which use intrinsics +/// which will impact calling convention lowering. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" + +#define DEBUG_TYPE "amdgpu-annotate-kernel-features" + +using namespace llvm; + +namespace { + +class AMDGPUAnnotateKernelFeatures : public ModulePass { +private: + void addAttrToCallers(Function *Intrin, StringRef AttrName); + bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>); + +public: + static char ID; + + AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { } + bool runOnModule(Module &M) override; + const char *getPassName() const override { + return "AMDGPU Annotate Kernel Features"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + ModulePass::getAnalysisUsage(AU); + } +}; + +} + +char AMDGPUAnnotateKernelFeatures::ID = 0; + +char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID; + + +INITIALIZE_PASS_BEGIN(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, + "Add AMDGPU function attributes", false, false) +INITIALIZE_PASS_END(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE, + "Add AMDGPU function attributes", false, false) + + +void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin, + StringRef AttrName) { + SmallPtrSet<Function *, 4> SeenFuncs; + + for (User *U : Intrin->users()) { + // CallInst is the only valid user for an intrinsic. + CallInst *CI = cast<CallInst>(U); + + Function *CallingFunction = CI->getParent()->getParent(); + if (SeenFuncs.insert(CallingFunction).second) + CallingFunction->addFnAttr(AttrName); + } +} + +bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics( + Module &M, + ArrayRef<StringRef[2]> IntrinsicToAttr) { + bool Changed = false; + + for (const StringRef *Arr : IntrinsicToAttr) { + if (Function *Fn = M.getFunction(Arr[0])) { + addAttrToCallers(Fn, Arr[1]); + Changed = true; + } + } + + return Changed; +} + +bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { + Triple TT(M.getTargetTriple()); + + static const StringRef IntrinsicToAttr[][2] = { + // .x omitted + { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" }, + { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" }, + + // .x omitted + { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" }, + { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" } + + }; + + static const StringRef HSAIntrinsicToAttr[][2] = { + { "llvm.r600.read.local.size.x", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.local.size.y", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.local.size.z", "amdgpu-dispatch-ptr" }, + + { "llvm.r600.read.global.size.x", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.global.size.y", "amdgpu-dispatch-ptr" }, + { "llvm.r600.read.global.size.z", "amdgpu-dispatch-ptr" }, + { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" } + }; + + // TODO: Intrinsics that require queue ptr. + + // We do not need to note the x workitem or workgroup id because they are + // always initialized. 
+ + bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr); + if (TT.getOS() == Triple::AMDHSA) + Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr); + + return Changed; +} + +ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { + return new AMDGPUAnnotateKernelFeatures(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp new file mode 100644 index 0000000..dfddc34 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -0,0 +1,84 @@ +//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass adds amdgpu.uniform metadata to IR values so this information +/// can be used during instruction selection. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUIntrinsicInfo.h" +#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-annotate-uniform" + +using namespace llvm; + +namespace { + +class AMDGPUAnnotateUniformValues : public FunctionPass, + public InstVisitor<AMDGPUAnnotateUniformValues> { + DivergenceAnalysis *DA; + +public: + static char ID; + AMDGPUAnnotateUniformValues() : + FunctionPass(ID) { } + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { return "AMDGPU Annotate Uniform Values"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<DivergenceAnalysis>(); + AU.setPreservesAll(); + } + + void visitLoadInst(LoadInst &I); + +}; + +} // End anonymous namespace + +INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE, + "Add AMDGPU uniform metadata", false, false) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) +INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, + "Add AMDGPU uniform metadata", false, false) + +char AMDGPUAnnotateUniformValues::ID = 0; + +void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) { + Value *Ptr = I.getPointerOperand(); + if (!DA->isUniform(Ptr)) + return; + + if (Instruction *PtrI = dyn_cast<Instruction>(Ptr)) + PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {})); + +} + +bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) { + return false; +} + +bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) { + DA = &getAnalysis<DivergenceAnalysis>(); + visit(F); + + return true; +} + +FunctionPass * +llvm::createAMDGPUAnnotateUniformValues() { + return new AMDGPUAnnotateUniformValues(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 0a5309b..ba71dc0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -100,14 +100,63 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { } } -void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) { - - // This label is used to mark the end of the .text section. 
- const TargetLoweringObjectFile &TLOF = getObjFileLowering(); - OutStreamer->SwitchSection(TLOF.getTextSection()); - MCSymbol *EndOfTextLabel = - OutContext.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - OutStreamer->EmitLabel(EndOfTextLabel); +void AMDGPUAsmPrinter::EmitFunctionEntryLabel() { + const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); + if (MFI->isKernel() && STM.isAmdHsaOS()) { + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + TS->EmitAMDGPUSymbolType(CurrentFnSym->getName(), + ELF::STT_AMDGPU_HSA_KERNEL); + } + + AsmPrinter::EmitFunctionEntryLabel(); +} + +static bool isModuleLinkage(const GlobalValue *GV) { + switch (GV->getLinkage()) { + case GlobalValue::InternalLinkage: + case GlobalValue::CommonLinkage: + return true; + case GlobalValue::ExternalLinkage: + return false; + default: llvm_unreachable("unknown linkage type"); + } +} + +void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + + if (TM.getTargetTriple().getOS() != Triple::AMDHSA) { + AsmPrinter::EmitGlobalVariable(GV); + return; + } + + if (GV->isDeclaration() || GV->getLinkage() == GlobalValue::PrivateLinkage) { + AsmPrinter::EmitGlobalVariable(GV); + return; + } + + // Group segment variables aren't emitted in HSA. + if (AMDGPU::isGroupSegment(GV)) + return; + + AMDGPUTargetStreamer *TS = + static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); + if (isModuleLinkage(GV)) { + TS->EmitAMDGPUHsaModuleScopeGlobal(GV->getName()); + } else { + TS->EmitAMDGPUHsaProgramScopeGlobal(GV->getName()); + } + + const DataLayout &DL = getDataLayout(); + OutStreamer->PushSection(); + OutStreamer->SwitchSection( + getObjFileLowering().SectionForGlobal(GV, *Mang, TM)); + MCSymbol *GVSym = getSymbol(GV); + const Constant *C = GV->getInitializer(); + OutStreamer->EmitLabel(GVSym); + EmitGlobalConstant(DL, C); + OutStreamer->PopSection(); } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { @@ -125,8 +174,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>(); SIProgramInfo KernelInfo; if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { + getSIProgramInfo(KernelInfo, MF); if (!STM.isAmdHsaOS()) { - getSIProgramInfo(KernelInfo, MF); EmitProgramInfoSI(MF, KernelInfo); } // Emit directives @@ -165,6 +214,23 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), false); + + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + + Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " + + Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " + + Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " + + Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + + Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)), + false); + } else { R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); OutStreamer->emitRawComment( @@ -278,27 +344,30 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, unsigned width = 0; bool 
isSGPR = false; - if (!MO.isReg()) { + if (!MO.isReg()) continue; - } + unsigned reg = MO.getReg(); - if (reg == AMDGPU::VCC || reg == AMDGPU::VCC_LO || - reg == AMDGPU::VCC_HI) { + switch (reg) { + case AMDGPU::EXEC: + case AMDGPU::SCC: + case AMDGPU::M0: + continue; + + case AMDGPU::VCC: + case AMDGPU::VCC_LO: + case AMDGPU::VCC_HI: VCCUsed = true; continue; - } else if (reg == AMDGPU::FLAT_SCR || - reg == AMDGPU::FLAT_SCR_LO || - reg == AMDGPU::FLAT_SCR_HI) { + + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + case AMDGPU::FLAT_SCR_HI: FlatUsed = true; continue; - } - switch (reg) { - default: break; - case AMDGPU::SCC: - case AMDGPU::EXEC: - case AMDGPU::M0: - continue; + default: + break; } if (AMDGPU::SReg_32RegClass.contains(reg)) { @@ -348,11 +417,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, } } - if (VCCUsed) + if (VCCUsed || FlatUsed) MaxSGPR += 2; - if (FlatUsed) + if (FlatUsed) { MaxSGPR += 2; + // 2 additional for VI+. + if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + MaxSGPR += 2; + } // We found the maximum register index. They start at 0, so add one to get the // number of registers. @@ -368,6 +441,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.NumSGPR = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG; } + if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) { + LLVMContext &Ctx = MF.getFunction()->getContext(); + Ctx.emitError("too many user SGPRs used"); + } + ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4; ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8; // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode @@ -419,18 +497,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, S_00B848_FLOAT_MODE(ProgInfo.FloatMode) | S_00B848_PRIV(ProgInfo.Priv) | S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | - S_00B848_IEEE_MODE(ProgInfo.DebugMode) | + S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + // 0 = X, 1 = XY, 2 = XYZ + unsigned TIDIGCompCnt = 0; + if (MFI->hasWorkItemIDZ()) + TIDIGCompCnt = 2; + else if (MFI->hasWorkItemIDY()) + TIDIGCompCnt = 1; + ProgInfo.ComputePGMRSrc2 = S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) | - S_00B84C_USER_SGPR(MFI->NumUserSGPRs) | - S_00B84C_TGID_X_EN(1) | - S_00B84C_TGID_Y_EN(1) | - S_00B84C_TGID_Z_EN(1) | - S_00B84C_TG_SIZE_EN(1) | - S_00B84C_TIDIG_COMP_CNT(2) | - S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks); + S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) | + S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) | + S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) | + S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) | + S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) | + S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) | + S_00B84C_EXCP_EN_MSB(0) | + S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) | + S_00B84C_EXCP_EN(0); } static unsigned getRsrcReg(unsigned ShaderType) { @@ -491,14 +578,53 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, header.compute_pgm_resource_registers = KernelInfo.ComputePGMRSrc1 | (KernelInfo.ComputePGMRSrc2 << 32); - header.code_properties = - AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR | - AMD_CODE_PROPERTY_IS_PTR64; + header.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + + if (MFI->hasPrivateSegmentBuffer()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER; + } + + if (MFI->hasDispatchPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + + if (MFI->hasQueuePtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; + + 
if (MFI->hasKernargSegmentPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR; + + if (MFI->hasDispatchID()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID; + + if (MFI->hasFlatScratchInit()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT; + + // TODO: Private segment size + + if (MFI->hasGridWorkgroupCountX()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X; + } + + if (MFI->hasGridWorkgroupCountY()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y; + } + + if (MFI->hasGridWorkgroupCountZ()) { + header.code_properties |= + AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z; + } + + if (MFI->hasDispatchPtr()) + header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; header.kernarg_segment_byte_size = MFI->ABIArgOffset; header.wavefront_sgpr_count = KernelInfo.NumSGPR; header.workitem_vgpr_count = KernelInfo.NumVGPR; - + header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; + header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; AMDGPUTargetStreamer *TS = static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 345af9b..817cbfc 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -99,7 +99,9 @@ public: void EmitFunctionBodyStart() override; - void EmitEndOfAsmFile(Module &M) override; + void EmitFunctionEntryLabel() override; + + void EmitGlobalVariable(const GlobalVariable *GV) override; bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp new file mode 100644 index 0000000..2f6b302 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.cpp @@ -0,0 +1,26 @@ +//===-- AMDGPUDiagnosticInfoUnsupported.cpp -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUDiagnosticInfoUnsupported.h" + +using namespace llvm; + +DiagnosticInfoUnsupported::DiagnosticInfoUnsupported( + const Function &Fn, + const Twine &Desc, + DiagnosticSeverity Severity) + : DiagnosticInfo(getKindID(), Severity), + Description(Desc), + Fn(Fn) { } + +int DiagnosticInfoUnsupported::KindID = 0; + +void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const { + DP << "unsupported " << getDescription() << " in " << Fn.getName(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h new file mode 100644 index 0000000..0fd37e1 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUDiagnosticInfoUnsupported.h @@ -0,0 +1,48 @@ +//===-- AMDGPUDiagnosticInfoUnsupported.h - Error reporting -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUDIAGNOSTICINFOUNSUPPORTED_H + +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" + +namespace llvm { + +/// Diagnostic information for unimplemented or unsupported feature reporting. +class DiagnosticInfoUnsupported : public DiagnosticInfo { +private: + const Twine &Description; + const Function &Fn; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, + DiagnosticSeverity Severity = DS_Error); + + const Function &getFunction() const { return Fn; } + const Twine &getDescription() const { return Description; } + + void print(DiagnosticPrinter &DP) const override; + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp index 8175786..4d84d28 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp @@ -71,9 +71,15 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const { } /// \returns The number of registers allocated for \p FI. -int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { +int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + + // Fill in FrameReg output argument. + FrameReg = RI->getFrameRegister(MF); + // Start the offset at 2 so we don't overwrite work group information. // XXX: We should only do this when the shader actually uses this // information. diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h index 9f31be1..257a3da 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h @@ -8,14 +8,12 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Interface to describe a layout of a stack frame on a AMDIL target -/// machine. +/// \brief Interface to describe a layout of a stack frame on an AMDGPU target. // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H -#define LLVM_LIB_TARGET_R600_AMDGPUFRAMELOWERING_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUFRAMELOWERING_H -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/Target/TargetFrameLowering.h" namespace llvm { @@ -34,7 +32,8 @@ public: /// \returns The number of 32-bit sub-registers that are used when storing /// values to the stack. 
unsigned getStackWidth(const MachineFunction &MF) const; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; const SpillSlot * getCalleeSavedSpillSlots(unsigned &NumEntries) const override; void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 64c54cc..b33040b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -11,6 +11,8 @@ /// \brief Defines an instruction selector for the AMDGPU target. // //===----------------------------------------------------------------------===// + +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPURegisterInfo.h" @@ -20,9 +22,9 @@ #include "SIISelLowering.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/Function.h" @@ -40,12 +42,14 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can // make the right decision when generating code for different targets. const AMDGPUSubtarget *Subtarget; + public: AMDGPUDAGToDAGISel(TargetMachine &TM); virtual ~AMDGPUDAGToDAGISel(); bool runOnMachineFunction(MachineFunction &MF) override; SDNode *Select(SDNode *N) override; const char *getPassName() const override; + void PreprocessISelDAG() override; void PostprocessISelDAG() override; private: @@ -91,7 +95,7 @@ private: bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const; bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0, SDValue &Offset1) const; - void SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, + bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, SDValue &TFE) const; @@ -108,6 +112,16 @@ private: SDValue &TFE) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &GLC) const; + bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, + bool &Imm) const; + bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, + bool &Imm) const; + bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; + bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const; + bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const; + bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const; + bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; + bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const; SDNode *SelectAddrSpaceCast(SDNode *N); bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; @@ -273,6 +287,23 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const { return N; } +static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) { + switch 
(NumVectorElts) { + case 1: + return AMDGPU::SReg_32RegClassID; + case 2: + return AMDGPU::SReg_64RegClassID; + case 4: + return AMDGPU::SReg_128RegClassID; + case 8: + return AMDGPU::SReg_256RegClassID; + case 16: + return AMDGPU::SReg_512RegClassID; + } + + llvm_unreachable("invalid vector size"); +} + SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -306,38 +337,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { EVT EltVT = VT.getVectorElementType(); assert(EltVT.bitsEq(MVT::i32)); if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - bool UseVReg = true; - for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); - U != E; ++U) { - if (!U->isMachineOpcode()) { - continue; - } - const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); - if (!RC) { - continue; - } - if (static_cast<const SIRegisterInfo *>(TRI)->isSGPRClass(RC)) { - UseVReg = false; - } - } - switch(NumVectorElts) { - case 1: RegClassID = UseVReg ? AMDGPU::VGPR_32RegClassID : - AMDGPU::SReg_32RegClassID; - break; - case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID : - AMDGPU::SReg_64RegClassID; - break; - case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID : - AMDGPU::SReg_128RegClassID; - break; - case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID : - AMDGPU::SReg_256RegClassID; - break; - case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID : - AMDGPU::SReg_512RegClassID; - break; - default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); - } + RegClassID = selectSGPRVectorRegClassID(NumVectorElts); } else { // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG // that adds a 128 bits reg copy when going through TwoAddressInstructions @@ -455,98 +455,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, N->getValueType(0), Ops); } - - case ISD::LOAD: { - LoadSDNode *LD = cast<LoadSDNode>(N); - SDLoc SL(N); - EVT VT = N->getValueType(0); - - if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) { - N = glueCopyToM0(N); - break; - } - - // To simplify the TableGen patters, we replace all i64 loads with - // v2i32 loads. Alternatively, we could promote i64 loads to v2i32 - // during DAG legalization, however, so places (ExpandUnalignedLoad) - // in the DAG legalizer assume that if i64 is legal, so doing this - // promotion early can cause problems. - - SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SDLoc(N), LD->getChain(), - LD->getBasePtr(), LD->getMemOperand()); - SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, - MVT::i64, NewLoad); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLoad.getValue(1)); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), BitCast); - SDNode *Load = glueCopyToM0(NewLoad.getNode()); - SelectCode(Load); - N = BitCast.getNode(); - break; - } - + case ISD::LOAD: case ISD::STORE: { - // Handle i64 stores here for the same reason mentioned above for loads. 
- StoreSDNode *ST = cast<StoreSDNode>(N); - SDValue Value = ST->getValue(); - if (Value.getValueType() == MVT::i64 && !ST->isTruncatingStore()) { - - SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(N), - MVT::v2i32, Value); - SDValue NewStore = CurDAG->getStore(ST->getChain(), SDLoc(N), NewValue, - ST->getBasePtr(), ST->getMemOperand()); - - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewStore); - - if (NewValue.getOpcode() == ISD::BITCAST) { - Select(NewStore.getNode()); - return SelectCode(NewValue.getNode()); - } - - // getNode() may fold the bitcast if its input was another bitcast. If that - // happens we should only select the new store. - N = NewStore.getNode(); - } - N = glueCopyToM0(N); break; } - case AMDGPUISD::REGISTER_LOAD: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - - SDLoc DL(N); - SelectADDRIndirect(N->getOperand(1), Addr, Offset); - const SDValue Ops[] = { - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, DL, - CurDAG->getVTList(MVT::i32, MVT::i64, - MVT::Other), - Ops); - } - case AMDGPUISD::REGISTER_STORE: { - if (Subtarget->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) - break; - SDValue Addr, Offset; - SelectADDRIndirect(N->getOperand(2), Addr, Offset); - SDLoc DL(N); - const SDValue Ops[] = { - N->getOperand(1), - Addr, - Offset, - CurDAG->getTargetConstant(0, DL, MVT::i32), - N->getOperand(0), - }; - return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, DL, - CurDAG->getVTList(MVT::Other), - Ops); - } - case AMDGPUISD::BFE_I32: case AMDGPUISD::BFE_U32: { if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS) @@ -575,7 +489,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, SDLoc(N), N->getOperand(0), OffsetVal, WidthVal); - } case AMDGPUISD::DIV_SCALE: { return SelectDIV_SCALE(N); @@ -601,7 +514,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return SelectCode(N); } - bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) { assert(AS != 0 && "Use checkPrivateAddress instead."); if (!Ptr) @@ -681,7 +593,7 @@ bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const { if (checkPrivateAddress(N->getMemOperand())) { if (MMO) { const PseudoSourceValue *PSV = MMO->getPseudoValue(); - if (PSV && PSV == PseudoSourceValue::getConstantPool()) { + if (PSV && PSV->isConstantPool()) { return true; } } @@ -847,7 +759,8 @@ SDNode *AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) { unsigned Opc = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32; - // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod + // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, + // omod SDValue Ops[8]; SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]); @@ -883,15 +796,39 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, Offset = N1; return true; } - } + } else if (Addr.getOpcode() == ISD::SUB) { + // sub C, x -> add (sub 0, x), C + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { + int64_t ByteOffset = C->getSExtValue(); + if (isUInt<16>(ByteOffset)) { + SDLoc DL(Addr); + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + + // XXX - This is kind of hacky. Create a dummy sub node so we can check + // the known bits in isDSOffsetLegal. 
We need to emit the selected node + // here, so this is thrown away. + SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + if (isDSOffsetLegal(Sub, ByteOffset, 16)) { + MachineSDNode *MachineSub + = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + Base = SDValue(MachineSub, 0); + Offset = Addr.getOperand(0); + return true; + } + } + } + } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + // If we have a constant address, prefer to put the constant into the + // offset. This can save moves to load the constant address since multiple + // operations can share the zero base address register, and enables merging + // into read2 / write2 instructions. - SDLoc DL(Addr); + SDLoc DL(Addr); - // If we have a constant address, prefer to put the constant into the - // offset. This can save moves to load the constant address since multiple - // operations can share the zero base address register, and enables merging - // into read2 / write2 instructions. - if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { if (isUInt<16>(CAddr->getZExtValue())) { SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, @@ -904,10 +841,11 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base, // default case Base = Addr; - Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16); return true; } +// TODO: If offset is too big, put low 16-bit into offset. bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, SDValue &Offset0, SDValue &Offset1) const { @@ -926,9 +864,35 @@ bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base, Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); return true; } - } - - if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { + } else if (Addr.getOpcode() == ISD::SUB) { + // sub C, x -> add (sub 0, x), C + if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) { + unsigned DWordOffset0 = C->getZExtValue() / 4; + unsigned DWordOffset1 = DWordOffset0 + 1; + + if (isUInt<8>(DWordOffset0)) { + SDLoc DL(Addr); + SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); + + // XXX - This is kind of hacky. Create a dummy sub node so we can check + // the known bits in isDSOffsetLegal. We need to emit the selected node + // here, so this is thrown away. 
+ SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + if (isDSOffsetLegal(Sub, DWordOffset1, 8)) { + MachineSDNode *MachineSub + = CurDAG->getMachineNode(AMDGPU::V_SUB_I32_e32, DL, MVT::i32, + Zero, Addr.getOperand(1)); + + Base = SDValue(MachineSub, 0); + Offset0 = CurDAG->getTargetConstant(DWordOffset0, DL, MVT::i8); + Offset1 = CurDAG->getTargetConstant(DWordOffset1, DL, MVT::i8); + return true; + } + } + } + } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) { unsigned DWordOffset0 = CAddr->getZExtValue() / 4; unsigned DWordOffset1 = DWordOffset0 + 1; assert(4 * DWordOffset0 == CAddr->getZExtValue()); @@ -956,12 +920,16 @@ static bool isLegalMUBUFImmOffset(const ConstantSDNode *Imm) { return isUInt<12>(Imm->getZExtValue()); } -void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, +bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, SDValue &TFE) const { + // Subtarget prefers to use flat instruction + if (Subtarget->useFlatForGlobal()) + return false; + SDLoc DL(Addr); GLC = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -994,14 +962,14 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, if (isLegalMUBUFImmOffset(C1)) { Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16); - return; + return true; } else if (isUInt<32>(C1->getZExtValue())) { // Illegal offset, store it in soffset. Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)), 0); - return; + return true; } } @@ -1013,7 +981,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, Ptr = N0; VAddr = N1; Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); - return; + return true; } // default case -> offset @@ -1021,6 +989,7 @@ void AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, Ptr = Addr; Offset = CurDAG->getTargetConstant(0, DL, MVT::i16); + return true; } bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, @@ -1033,8 +1002,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return false; - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE); + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE)) + return false; ConstantSDNode *C = cast<ConstantSDNode>(Addr64); if (C->getSExtValue()) { @@ -1052,8 +1022,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, - SDValue &SLC) const { + SDValue &Offset, + SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); SDValue GLC, TFE; @@ -1066,36 +1036,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc, SDLoc DL(Addr); MachineFunction &MF = CurDAG->getMachineFunction(); - const SIRegisterInfo *TRI = - static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const SITargetLowering& Lowering = - *static_cast<const SITargetLowering*>(getTargetLowering()); - - unsigned ScratchOffsetReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - 
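// Aside (illustrative, not from the patch): the "sub C, x -> add (sub 0, x), C"
// rewrite used for DS addressing above rests on a small unsigned-arithmetic
// identity: the address C - x equals (0 - x) + C modulo 2^32, so the register
// base becomes 0 - x and the constant C can live in the immediate offset field
// when it fits (e.g. isUInt<16> for DS1Addr1Offset). Values here are made up.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t C = 400, x = 64;                      // hypothetical operands
  uint32_t Base = 0u - x;                        // what the materialized V_SUB_I32 produces
  assert(uint32_t(C - x) == uint32_t(Base + C)); // same effective address
  return 0;
}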
Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass, - ScratchOffsetReg, MVT::i32); - SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32); - SDValue ScratchRsrcDword0 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32); - SDValue ScratchRsrcDword1 = - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0); - - const SDValue RsrcOps[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), - ScratchRsrcDword0, - CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - ScratchRsrcDword1, - CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32), - }; - SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, RsrcOps), 0); - Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0); - SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL, - MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32); + Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); + SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32); // (add n0, c1) if (CurDAG->isBaseWithConstantOffset(Addr)) { @@ -1126,8 +1070,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE); + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, + GLC, SLC, TFE)) + return false; if (!cast<ConstantSDNode>(Offen)->getSExtValue() && !cast<ConstantSDNode>(Idxen)->getSExtValue() && @@ -1153,18 +1098,134 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); } +/// +/// \param EncodedOffset This is the immediate value that will be encoded +/// directly into the instruction. On SI/CI the \p EncodedOffset +/// will be in units of dwords and on VI+ it will be units of bytes. +static bool isLegalSMRDImmOffset(const AMDGPUSubtarget *ST, + int64_t EncodedOffset) { + return ST->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ? + isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, + SDValue &Offset, bool &Imm) const { + + // FIXME: Handle non-constant offsets. + ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); + if (!C) + return false; + + SDLoc SL(ByteOffsetNode); + AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration(); + int64_t ByteOffset = C->getSExtValue(); + int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ? + ByteOffset >> 2 : ByteOffset; + + if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) { + Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); + Imm = true; + return true; + } + + if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset)) + return false; + + if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) { + // 32-bit Immediates are supported on Sea Islands. 
+ Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); + } else { + SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32); + Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, + C32Bit), 0); + } + Imm = false; + return true; +} + +bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, + SDValue &Offset, bool &Imm) const { + + SDLoc SL(Addr); + if (CurDAG->isBaseWithConstantOffset(Addr)) { + SDValue N0 = Addr.getOperand(0); + SDValue N1 = Addr.getOperand(1); + + if (SelectSMRDOffset(N1, Offset, Imm)) { + SBase = N0; + return true; + } + } + SBase = Addr; + Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); + Imm = true; + return true; +} + +bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase, + SDValue &Offset) const { + bool Imm; + return SelectSMRD(Addr, SBase, Offset, Imm) && Imm; +} + +bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase, + SDValue &Offset) const { + + if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS) + return false; + + bool Imm; + if (!SelectSMRD(Addr, SBase, Offset, Imm)) + return false; + + return !Imm && isa<ConstantSDNode>(Offset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase, + SDValue &Offset) const { + bool Imm; + return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm && + !isa<ConstantSDNode>(Offset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, + SDValue &Offset) const { + bool Imm; + return SelectSMRDOffset(Addr, Offset, Imm) && Imm; +} + +bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr, + SDValue &Offset) const { + if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS) + return false; + + bool Imm; + if (!SelectSMRDOffset(Addr, Offset, Imm)) + return false; + + return !Imm && isa<ConstantSDNode>(Offset); +} + +bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr, + SDValue &Offset) const { + bool Imm; + return SelectSMRDOffset(Addr, Offset, Imm) && !Imm && + !isa<ConstantSDNode>(Offset); +} + // FIXME: This is incorrect and only enough to be able to compile. SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(N); SDLoc DL(N); + const MachineFunction &MF = CurDAG->getMachineFunction(); + DiagnosticInfoUnsupported NotImplemented(*MF.getFunction(), + "addrspacecast not implemented"); + CurDAG->getContext()->diagnose(NotImplemented); + assert(Subtarget->hasFlatAddressSpace() && "addrspacecast only supported with flat address space!"); - assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && - ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && - "Cannot cast address space to / from constant address!"); - assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && "Can only cast to / from flat address space!"); @@ -1190,7 +1251,6 @@ SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32)); } - if (DestSize > SrcSize) { assert(SrcSize == 32 && DestSize == 64); @@ -1371,6 +1431,65 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src, return SelectVOP3Mods(In, Src, SrcMods); } +void AMDGPUDAGToDAGISel::PreprocessISelDAG() { + bool Modified = false; + + // XXX - Other targets seem to be able to do this without a worklist. 
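// Aside (sketch, not backend code): the offset-legality rule implemented by the
// new SelectSMRDOffset above, reduced to a single generation flag. Before
// Volcanic Islands the encoded SMRD offset is in dwords (byte offset >> 2) and
// must fit in 8 bits; on VI it is in bytes and must fit in 20 bits. Sea Islands
// can additionally fall back to a 32-bit literal, and anything else is
// materialized into an SGPR with S_MOV_B32.
#include <cstdint>
#include <cstdio>

static bool fitsSMRDImm(bool IsVIPlus, int64_t ByteOffset) {
  int64_t Encoded = IsVIPlus ? ByteOffset : ByteOffset >> 2;
  unsigned Bits = IsVIPlus ? 20 : 8;
  return Encoded >= 0 && Encoded < (int64_t(1) << Bits);
}

int main() {
  std::printf("%d\n", fitsSMRDImm(false, 1020)); // SI/CI: 255 dwords, fits
  std::printf("%d\n", fitsSMRDImm(false, 1024)); // SI/CI: 256 dwords, does not fit
  std::printf("%d\n", fitsSMRDImm(true, 1024));  // VI: 1024 bytes fits in 20 bits
  return 0;
}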
+ SmallVector<LoadSDNode *, 8> LoadsToReplace; + SmallVector<StoreSDNode *, 8> StoresToReplace; + + for (SDNode &Node : CurDAG->allnodes()) { + if (LoadSDNode *LD = dyn_cast<LoadSDNode>(&Node)) { + EVT VT = LD->getValueType(0); + if (VT != MVT::i64 || LD->getExtensionType() != ISD::NON_EXTLOAD) + continue; + + // To simplify the TableGen patters, we replace all i64 loads with v2i32 + // loads. Alternatively, we could promote i64 loads to v2i32 during DAG + // legalization, however, so places (ExpandUnalignedLoad) in the DAG + // legalizer assume that if i64 is legal, so doing this promotion early + // can cause problems. + LoadsToReplace.push_back(LD); + } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(&Node)) { + // Handle i64 stores here for the same reason mentioned above for loads. + SDValue Value = ST->getValue(); + if (Value.getValueType() != MVT::i64 || ST->isTruncatingStore()) + continue; + StoresToReplace.push_back(ST); + } + } + + for (LoadSDNode *LD : LoadsToReplace) { + SDLoc SL(LD); + + SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(), + LD->getBasePtr(), LD->getMemOperand()); + SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, + MVT::i64, NewLoad); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1)); + CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast); + Modified = true; + } + + for (StoreSDNode *ST : StoresToReplace) { + SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST), + MVT::v2i32, ST->getValue()); + const SDValue StoreOps[] = { + ST->getChain(), + NewValue, + ST->getBasePtr(), + ST->getOffset() + }; + + CurDAG->UpdateNodeOperands(ST, StoreOps); + Modified = true; + } + + // XXX - Is this necessary? + if (Modified) + CurDAG->RemoveDeadNodes(); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = *static_cast<const AMDGPUTargetLowering*>(getTargetLowering()); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 3a65f3b..222f631 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPUISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPURegisterInfo.h" @@ -27,50 +28,9 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/DiagnosticPrinter.h" using namespace llvm; -namespace { - -/// Diagnostic information for unimplemented or unsupported feature reporting. 
-class DiagnosticInfoUnsupported : public DiagnosticInfo { -private: - const Twine &Description; - const Function &Fn; - - static int KindID; - - static int getKindID() { - if (KindID == 0) - KindID = llvm::getNextAvailablePluginDiagnosticKind(); - return KindID; - } - -public: - DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc, - DiagnosticSeverity Severity = DS_Error) - : DiagnosticInfo(getKindID(), Severity), - Description(Desc), - Fn(Fn) { } - - const Function &getFunction() const { return Fn; } - const Twine &getDescription() const { return Description; } - - void print(DiagnosticPrinter &DP) const override { - DP << "unsupported " << getDescription() << " in " << Fn.getName(); - } - - static bool classof(const DiagnosticInfo *DI) { - return DI->getKind() == getKindID(); - } -}; - -int DiagnosticInfoUnsupported::KindID = 0; -} - - static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { @@ -113,6 +73,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::BRIND, MVT::Other, Expand); + // This is totally unsupported, just custom lower to produce an error. + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); + // We need to custom lower some of the intrinsics setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -352,7 +315,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Custom); - setOperationAction(ISD::UDIVREM, VT, Custom); + setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::ADDC, VT, Expand); setOperationAction(ISD::SUBC, VT, Expand); setOperationAction(ISD::ADDE, VT, Expand); @@ -429,12 +392,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM, setSelectIsExpensive(false); PredictableSelectIsExpensive = false; - // There are no integer divide instructions, and these expand to a pretty - // large sequence of instructions. - setIntDivIsCheap(false); - setPow2SDivIsCheap(false); setFsqrtIsCheap(true); + // We want to find all load dependencies for long chains of stores to enable + // merging into very wide vectors. The problem is with vectors with > 4 + // elements. MergeConsecutiveStores will attempt to merge these because x8/x16 + // vectors are a legal type, even though we have to split the loads + // usually. When we can more precisely specify load legality per address + // space, we should be able to make FindBetterChain/MergeConsecutiveStores + // smarter so that they can figure out what to do in 2 iterations without all + // N > 4 stores on the same chain. + GatherAllAliasesMaxDepth = 16; + // FIXME: Need to really handle these. MaxStoresPerMemcpy = 4096; MaxStoresPerMemmove = 4096; @@ -534,6 +503,18 @@ bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT, return true; } +bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const { + // There are few operations which truly have vector input operands. Any vector + // operation is going to involve operations on each component, and a + // build_vector will be a copy per element, so it always makes sense to use a + // build_vector input in place of the extracted element to avoid a copy into a + // super register. 
+ // + // We should probably only do this if all users are extracts only, but this + // should be the common case. + return true; +} + bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const { // Truncate is just accessing a subregister. return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0); @@ -617,6 +598,15 @@ SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI, return SDValue(); } +SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + const Function &Fn = *DAG.getMachineFunction().getFunction(); + + DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "dynamic alloca"); + DAG.getContext()->diagnose(NoDynamicAlloca); + return SDValue(); +} + SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { @@ -643,6 +633,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } return Op; } @@ -892,7 +883,9 @@ SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op, FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op); unsigned FrameIndex = FIN->getIndex(); - unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex); + unsigned IgnoredFrameReg; + unsigned Offset = + TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op), Op.getValueType()); } @@ -1043,9 +1036,6 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); - case AMDGPUIntrinsic::AMDGPU_brev: - return DAG.getNode(AMDGPUISD::BREV, DL, VT, Op.getOperand(1)); - case Intrinsic::AMDGPU_class: return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -1057,6 +1047,8 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); case AMDGPUIntrinsic::AMDGPU_trunc: // Legacy name. return DAG.getNode(ISD::FTRUNC, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_brev: // Legacy name + return DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(1)); } } @@ -1077,6 +1069,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); + // TODO: Should this propagate fast-math-flags? SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, DAG.getConstantFP(1.0f, DL, MVT::f32), Op.getOperand(1)); @@ -1167,45 +1160,6 @@ SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL, return SDValue(); } -// FIXME: Remove this when combines added to DAGCombiner. -SDValue AMDGPUTargetLowering::CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const { - if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) - return SDValue(); - - ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); - switch (CCOpcode) { - case ISD::SETULE: - case ISD::SETULT: { - unsigned Opc = (LHS == True) ? ISD::UMIN : ISD::UMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETLE: - case ISD::SETLT: { - unsigned Opc = (LHS == True) ? ISD::SMIN : ISD::SMAX; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETGT: - case ISD::SETGE: { - unsigned Opc = (LHS == True) ? 
ISD::SMAX : ISD::SMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - case ISD::SETUGE: - case ISD::SETUGT: { - unsigned Opc = (LHS == True) ? ISD::UMAX : ISD::UMIN; - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - default: - return SDValue(); - } -} - SDValue AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op, SelectionDAG &DAG) const { LoadSDNode *Load = cast<LoadSDNode>(Op); @@ -1260,7 +1214,8 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, EVT PtrVT = BasePtr.getValueType(); EVT MemVT = Load->getMemoryVT(); SDLoc SL(Op); - MachinePointerInfo SrcValue(Load->getMemOperand()->getValue()); + + const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo(); EVT LoVT, HiVT; EVT LoMemVT, HiMemVT; @@ -1269,23 +1224,27 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op, std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT); std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT); + + unsigned Size = LoMemVT.getStoreSize(); + unsigned BaseAlign = Load->getAlignment(); + unsigned HiAlign = MinAlign(BaseAlign, Size); + SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue, LoMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); + Load->isInvariant(), BaseAlign); SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, - DAG.getConstant(LoMemVT.getStoreSize(), SL, - PtrVT)); + DAG.getConstant(Size, SL, PtrVT)); SDValue HiLoad = DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, Load->isVolatile(), Load->isNonTemporal(), - Load->isInvariant(), Load->getAlignment()); + Load->isInvariant(), HiAlign); SDValue Ops[] = { DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad), @@ -1415,7 +1374,11 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, DAG.getConstant(LoMemVT.getStoreSize(), SL, PtrVT)); - MachinePointerInfo SrcValue(Store->getMemOperand()->getValue()); + const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo(); + unsigned BaseAlign = Store->getAlignment(); + unsigned Size = LoMemVT.getStoreSize(); + unsigned HiAlign = MinAlign(BaseAlign, Size); + SDValue LoStore = DAG.getTruncStore(Chain, SL, Lo, BasePtr, @@ -1423,15 +1386,15 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op, LoMemVT, Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); + BaseAlign); SDValue HiStore = DAG.getTruncStore(Chain, SL, Hi, HiPtr, - SrcValue.getWithOffset(LoMemVT.getStoreSize()), + SrcValue.getWithOffset(Size), HiMemVT, Store->isNonTemporal(), Store->isVolatile(), - Store->getAlignment()); + HiAlign); return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore); } @@ -1529,7 +1492,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) && Store->getValue().getValueType().isVector()) { - return ScalarizeVectorStore(Op, DAG); + return SplitVectorStore(Op, DAG); } EVT MemVT = Store->getMemoryVT(); @@ -1630,6 +1593,7 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool // float fb = (float)ib; SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib); + // TODO: Should this propagate fast-math-flags? 
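// Aside (not part of the patch): the HiAlign computation in the split-load and
// split-store hunks above uses MinAlign(BaseAlign, Size), i.e. the largest
// power of two dividing both values, because the high half sits Size bytes
// past the original pointer. A quick numeric check with invented values:
#include <cassert>
#include <cstdint>

static uint64_t minAlign(uint64_t A, uint64_t B) {
  return (A | B) & (~(A | B) + 1);   // lowest set bit of (A | B)
}

int main() {
  // A 16-byte-aligned v4i32 load split into two 8-byte halves: the second half
  // can only be assumed 8-byte aligned.
  assert(minAlign(16, 8) == 8);
  assert(minAlign(32, 8) == 8);
  assert(minAlign(4, 8) == 4);
  return 0;
}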
// float fq = native_divide(fa, fb); SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT, fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb)); @@ -1940,6 +1904,8 @@ SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const { SDValue X = Op.getOperand(0); SDValue Y = Op.getOperand(1); + // TODO: Should this propagate fast-math-flags? + SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y); SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div); SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y); @@ -1968,6 +1934,7 @@ SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const { SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero); + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } @@ -2045,6 +2012,8 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const { SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64); SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src); + // TODO: Should this propagate fast-math-flags? + SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign); SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign); @@ -2074,6 +2043,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + // TODO: Should this propagate fast-math-flags? + SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); @@ -2184,6 +2155,7 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc); SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero); + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } @@ -2206,7 +2178,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi, DAG.getConstant(32, SL, MVT::i32)); - + // TODO: Should this propagate fast-math-flags? return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo); } @@ -2231,6 +2203,7 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0, DAG.getConstant(1, DL, MVT::i32)); SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi); + // TODO: Should this propagate fast-math-flags? FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi, DAG.getConstantFP(4294967296.0f, DL, MVT::f32)); // 2^32 return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi); @@ -2257,7 +2230,7 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, MVT::f64); SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, MVT::f64); - + // TODO: Should this propagate fast-math-flags? SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0); SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul); @@ -2511,12 +2484,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, if (VT == MVT::f32) return CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI); - - // TODO: Implement min / max Evergreen instructions. 
- if (VT == MVT::i32 && - Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - return CombineIMinMax(DL, VT, LHS, RHS, True, False, CC, DAG); - } } break; @@ -2652,20 +2619,14 @@ bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { return CFP->isExactlyValue(1.0); } - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - return C->isAllOnesValue(); - } - return false; + return isAllOnesConstant(Op); } bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { return CFP->getValueAPF().isZero(); } - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - return C->isNullValue(); - } - return false; + return isNullConstant(Op); } SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, @@ -2738,7 +2699,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) NODE_NAME_CASE(BFM) - NODE_NAME_CASE(BREV) NODE_NAME_CASE(MUL_U24) NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) @@ -2893,8 +2853,7 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( return 1; unsigned SignBits = 32 - Width->getZExtValue() + 1; - ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (!Offset || !Offset->isNullValue()) + if (!isNullConstant(Op.getOperand(1))) return SignBits; // TODO: Could probably figure something out with non-0 offsets. diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 478b203..7314cc0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -138,6 +138,7 @@ public: bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, unsigned AS) const override; + bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override; bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; @@ -149,6 +150,9 @@ public: SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const; + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; void ReplaceNodeResults(SDNode * N, @@ -165,14 +169,6 @@ public: SDValue False, SDValue CC, DAGCombinerInfo &DCI) const; - SDValue CombineIMinMax(SDLoc DL, - EVT VT, - SDValue LHS, - SDValue RHS, - SDValue True, - SDValue False, - SDValue CC, - SelectionDAG &DAG) const; const char* getTargetNodeName(unsigned Opcode) const override; @@ -216,7 +212,7 @@ public: /// \brief Helper function that returns the byte offset of the given /// type of implicit parameter. - unsigned getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, + uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const; }; @@ -267,7 +263,6 @@ enum NodeType : unsigned { BFE_I32, // Extract range of bits with sign extension to 32-bits. BFI, // (src0 & src1) | (~src0 & src2) BFM, // Insert a range of bits into a 32-bit word. - BREV, // Reverse bits. 
MUL_U24, MUL_I24, MAD_U24, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp index 15a3d54..a266e71 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp @@ -164,11 +164,6 @@ MachineInstr *AMDGPUInstrInfo::foldMemoryOperandImpl( // TODO: Implement this function return nullptr; } -bool AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const { - // TODO: Implement this function - return false; -} bool AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, @@ -312,7 +307,9 @@ int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const { return -1; } - Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexOffset(MF, -1); + unsigned IgnoredFrameReg; + Offset = MF.getSubtarget().getFrameLowering()->getFrameIndexReference( + MF, -1, IgnoredFrameReg); return getIndirectIndexBegin(MF) + Offset; } @@ -367,3 +364,14 @@ int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const { return MCOp; } + +ArrayRef<std::pair<int, const char *>> +AMDGPUInstrInfo::getSerializableTargetIndices() const { + static const std::pair<int, const char *> TargetIndices[] = { + {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; + return makeArrayRef(TargetIndices); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h index 86d3962..53e8b23 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h @@ -103,8 +103,6 @@ public: /// read or write or -1 if indirect addressing is not used by this program. int getIndirectIndexEnd(const MachineFunction &MF) const; - bool canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const override; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const override; @@ -147,6 +145,9 @@ public: return get(pseudoToMCOpcode(Opcode)); } + ArrayRef<std::pair<int, const char *>> + getSerializableTargetIndices() const override; + //===---------------------------------------------------------------------===// // Pure virtual funtions to be implemented by sub-classes. //===---------------------------------------------------------------------===// @@ -195,6 +196,7 @@ public: }; namespace AMDGPU { + LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex); } // End namespace AMDGPU diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td index b413897..70e589c 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -191,8 +191,6 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>; def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>; def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>; -def AMDGPUbrev : SDNode<"AMDGPUISD::BREV", SDTIntUnaryOp>; - // Signed and unsigned 24-bit mulitply. The highest 8-bits are ignore when // performing the mulitply. The result is a 32-bit value. 
def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp, diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 72cab39..11f6139 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -514,7 +514,7 @@ class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul> class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, SubRegIndex sub_reg> : Pat< - (sub_type (vector_extract vec_type:$src, sub_idx)), + (sub_type (extractelt vec_type:$src, sub_idx)), (EXTRACT_SUBREG $src, sub_reg) >; @@ -522,7 +522,7 @@ class Extract_Element <ValueType sub_type, ValueType vec_type, int sub_idx, class Insert_Element <ValueType elem_type, ValueType vec_type, int sub_idx, SubRegIndex sub_reg> : Pat < - (vector_insert vec_type:$vec, elem_type:$elem, sub_idx), + (insertelt vec_type:$vec, elem_type:$elem, sub_idx), (INSERT_SUBREG $vec, $elem, sub_reg) >; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td index ab489cd..1de3546 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -69,8 +69,8 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; - def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; + def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>; } // Legacy names for compatibility. 
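The AMDGPUISD::BREV node, its SDNode definition, and the NODE_NAME_CASE entry are all dropped because the legacy llvm.AMDGPU.brev intrinsic now lowers through the generic ISD::BITREVERSE node (see the LowerINTRINSIC_WO_CHAIN hunk above). For reference, the operation itself is plain 32-bit bit reversal; the sketch below is a standalone illustration, not code from the patch.

#include <cstdint>
#include <cstdio>

// Reverse the bit order of a 32-bit value, the semantics kept for the legacy
// llvm.AMDGPU.brev intrinsic via ISD::BITREVERSE.
static uint32_t brev32(uint32_t V) {
  uint32_t R = 0;
  for (int I = 0; I < 32; ++I)
    R |= ((V >> I) & 1u) << (31 - I);
  return R;
}

int main() {
  std::printf("0x%08x\n", brev32(0x00000001u)); // 0x80000000
  std::printf("0x%08x\n", brev32(0x12345678u)); // 0x1e6a2c48
  return 0;
}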
diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 2083146..dfc652f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -61,7 +61,7 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::createImm(MO.getImm()); break; case MachineOperand::MO_Register: - MCOp = MCOperand::createReg(MO.getReg()); + MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST)); break; case MachineOperand::MO_MachineBasicBlock: MCOp = MCOperand::createExpr(MCSymbolRefExpr::create( @@ -73,13 +73,6 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { MCOp = MCOperand::createExpr(MCSymbolRefExpr::create(Sym, Ctx)); break; } - case MachineOperand::MO_TargetIndex: { - assert(MO.getIndex() == AMDGPU::TI_CONSTDATA_START); - MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); - MCOp = MCOperand::createExpr(Expr); - break; - } case MachineOperand::MO_ExternalSymbol: { MCSymbol *Sym = Ctx.getOrCreateSymbol(StringRef(MO.getSymbolName())); const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); @@ -104,10 +97,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { #endif if (MI->isBundle()) { const MachineBasicBlock *MBB = MI->getParent(); - MachineBasicBlock::const_instr_iterator I = MI; - ++I; - while (I != MBB->end() && I->isInsideBundle()) { - EmitInstruction(I); + MachineBasicBlock::const_instr_iterator I = ++MI->getIterator(); + while (I != MBB->instr_end() && I->isInsideBundle()) { + EmitInstruction(&*I); ++I; } } else { @@ -136,8 +128,6 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter(); InstEmitter.encodeInstruction(TmpInst, CodeStream, Fixups, MF->getSubtarget<MCSubtargetInfo>()); - CodeStream.flush(); - HexLines.resize(HexLines.size() + 1); std::string &HexLine = HexLines.back(); raw_string_ostream HexStream(HexLine); diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 21c7da6..5413717 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -1,11 +1,10 @@ #include "AMDGPUMachineFunction.h" #include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" using namespace llvm; -static const char *const ShaderTypeAttribute = "ShaderType"; - // Pin the vtable to this file. 
void AMDGPUMachineFunction::anchor() {} @@ -13,13 +12,9 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), ShaderType(ShaderType::COMPUTE), LDSSize(0), + ABIArgOffset(0), ScratchSize(0), IsKernel(true) { - Attribute A = MF.getFunction()->getFnAttribute(ShaderTypeAttribute); - if (A.isStringAttribute()) { - StringRef Str = A.getValueAsString(); - if (Str.getAsInteger(0, ShaderType)) - llvm_unreachable("Can't parse shader type!"); - } + ShaderType = AMDGPU::getShaderType(*MF.getFunction()); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h index f5e4694..46fcee8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -37,6 +37,11 @@ public: return ShaderType; } + bool isKernel() const { + // FIXME: Assume everything is a kernel until function calls are supported. + return true; + } + unsigned ScratchSize; bool IsKernel; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp new file mode 100644 index 0000000..554bf1d --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUOpenCLImageTypeLoweringPass.cpp @@ -0,0 +1,373 @@ +//===-- AMDGPUOpenCLImageTypeLoweringPass.cpp -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This pass resolves calls to OpenCL image attribute, image resource ID and +/// sampler resource ID getter functions. +/// +/// Image attributes (size and format) are expected to be passed to the kernel +/// as kernel arguments immediately following the image argument itself, +/// therefore this pass adds image size and format arguments to the kernel +/// functions in the module. The kernel functions with image arguments are +/// re-created using the new signature. The new arguments are added to the +/// kernel metadata with kernel_arg_type set to "image_size" or "image_format". +/// Note: this pass may invalidate pointers to functions. +/// +/// Resource IDs of read-only images, write-only images and samplers are +/// defined to be their index among the kernel arguments of the same +/// type and access qualifier. 
+//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" + +using namespace llvm; + +namespace { + +StringRef GetImageSizeFunc = "llvm.OpenCL.image.get.size"; +StringRef GetImageFormatFunc = "llvm.OpenCL.image.get.format"; +StringRef GetImageResourceIDFunc = "llvm.OpenCL.image.get.resource.id"; +StringRef GetSamplerResourceIDFunc = "llvm.OpenCL.sampler.get.resource.id"; + +StringRef ImageSizeArgMDType = "__llvm_image_size"; +StringRef ImageFormatArgMDType = "__llvm_image_format"; + +StringRef KernelsMDNodeName = "opencl.kernels"; +StringRef KernelArgMDNodeNames[] = { + "kernel_arg_addr_space", + "kernel_arg_access_qual", + "kernel_arg_type", + "kernel_arg_base_type", + "kernel_arg_type_qual"}; +const unsigned NumKernelArgMDNodes = 5; + +typedef SmallVector<Metadata *, 8> MDVector; +struct KernelArgMD { + MDVector ArgVector[NumKernelArgMDNodes]; +}; + +} // end anonymous namespace + +static inline bool +IsImageType(StringRef TypeString) { + return TypeString == "image2d_t" || TypeString == "image3d_t"; +} + +static inline bool +IsSamplerType(StringRef TypeString) { + return TypeString == "sampler_t"; +} + +static Function * +GetFunctionFromMDNode(MDNode *Node) { + if (!Node) + return nullptr; + + size_t NumOps = Node->getNumOperands(); + if (NumOps != NumKernelArgMDNodes + 1) + return nullptr; + + auto F = mdconst::dyn_extract<Function>(Node->getOperand(0)); + if (!F) + return nullptr; + + // Sanity checks. + size_t ExpectNumArgNodeOps = F->arg_size() + 1; + for (size_t i = 0; i < NumKernelArgMDNodes; ++i) { + MDNode *ArgNode = dyn_cast_or_null<MDNode>(Node->getOperand(i + 1)); + if (ArgNode->getNumOperands() != ExpectNumArgNodeOps) + return nullptr; + if (!ArgNode->getOperand(0)) + return nullptr; + + // FIXME: It should be possible to do image lowering when some metadata + // args missing or not in the expected order. 
+ MDString *StringNode = dyn_cast<MDString>(ArgNode->getOperand(0)); + if (!StringNode || StringNode->getString() != KernelArgMDNodeNames[i]) + return nullptr; + } + + return F; +} + +static StringRef +AccessQualFromMD(MDNode *KernelMDNode, unsigned ArgIdx) { + MDNode *ArgAQNode = cast<MDNode>(KernelMDNode->getOperand(2)); + return cast<MDString>(ArgAQNode->getOperand(ArgIdx + 1))->getString(); +} + +static StringRef +ArgTypeFromMD(MDNode *KernelMDNode, unsigned ArgIdx) { + MDNode *ArgTypeNode = cast<MDNode>(KernelMDNode->getOperand(3)); + return cast<MDString>(ArgTypeNode->getOperand(ArgIdx + 1))->getString(); +} + +static MDVector +GetArgMD(MDNode *KernelMDNode, unsigned OpIdx) { + MDVector Res; + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) { + MDNode *Node = cast<MDNode>(KernelMDNode->getOperand(i + 1)); + Res.push_back(Node->getOperand(OpIdx)); + } + return Res; +} + +static void +PushArgMD(KernelArgMD &MD, const MDVector &V) { + assert(V.size() == NumKernelArgMDNodes); + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) { + MD.ArgVector[i].push_back(V[i]); + } +} + +namespace { + +class AMDGPUOpenCLImageTypeLoweringPass : public ModulePass { + static char ID; + + LLVMContext *Context; + Type *Int32Type; + Type *ImageSizeType; + Type *ImageFormatType; + SmallVector<Instruction *, 4> InstsToErase; + + bool replaceImageUses(Argument &ImageArg, uint32_t ResourceID, + Argument &ImageSizeArg, + Argument &ImageFormatArg) { + bool Modified = false; + + for (auto &Use : ImageArg.uses()) { + auto Inst = dyn_cast<CallInst>(Use.getUser()); + if (!Inst) { + continue; + } + + Function *F = Inst->getCalledFunction(); + if (!F) + continue; + + Value *Replacement = nullptr; + StringRef Name = F->getName(); + if (Name.startswith(GetImageResourceIDFunc)) { + Replacement = ConstantInt::get(Int32Type, ResourceID); + } else if (Name.startswith(GetImageSizeFunc)) { + Replacement = &ImageSizeArg; + } else if (Name.startswith(GetImageFormatFunc)) { + Replacement = &ImageFormatArg; + } else { + continue; + } + + Inst->replaceAllUsesWith(Replacement); + InstsToErase.push_back(Inst); + Modified = true; + } + + return Modified; + } + + bool replaceSamplerUses(Argument &SamplerArg, uint32_t ResourceID) { + bool Modified = false; + + for (const auto &Use : SamplerArg.uses()) { + auto Inst = dyn_cast<CallInst>(Use.getUser()); + if (!Inst) { + continue; + } + + Function *F = Inst->getCalledFunction(); + if (!F) + continue; + + Value *Replacement = nullptr; + StringRef Name = F->getName(); + if (Name == GetSamplerResourceIDFunc) { + Replacement = ConstantInt::get(Int32Type, ResourceID); + } else { + continue; + } + + Inst->replaceAllUsesWith(Replacement); + InstsToErase.push_back(Inst); + Modified = true; + } + + return Modified; + } + + bool replaceImageAndSamplerUses(Function *F, MDNode *KernelMDNode) { + uint32_t NumReadOnlyImageArgs = 0; + uint32_t NumWriteOnlyImageArgs = 0; + uint32_t NumSamplerArgs = 0; + + bool Modified = false; + InstsToErase.clear(); + for (auto ArgI = F->arg_begin(); ArgI != F->arg_end(); ++ArgI) { + Argument &Arg = *ArgI; + StringRef Type = ArgTypeFromMD(KernelMDNode, Arg.getArgNo()); + + // Handle image types. 
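// Illustrative sketch (not part of this change): the resource-ID numbering the
// loop below implements -- read-only images, write-only images and samplers are
// each counted separately, in argument order, so a resource ID is the
// argument's index within its own group. The struct and helper are hypothetical.
struct KernelArgKind {
  bool IsImage;
  bool IsReadOnly;  // image access qualifier
};

static unsigned nextResourceID(const KernelArgKind &Arg, unsigned &NumReadOnly,
                               unsigned &NumWriteOnly, unsigned &NumSamplers) {
  if (Arg.IsImage)
    return Arg.IsReadOnly ? NumReadOnly++ : NumWriteOnly++;
  return NumSamplers++;  // callers only pass images and samplers
}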
+ if (IsImageType(Type)) { + StringRef AccessQual = AccessQualFromMD(KernelMDNode, Arg.getArgNo()); + uint32_t ResourceID; + if (AccessQual == "read_only") { + ResourceID = NumReadOnlyImageArgs++; + } else if (AccessQual == "write_only") { + ResourceID = NumWriteOnlyImageArgs++; + } else { + llvm_unreachable("Wrong image access qualifier."); + } + + Argument &SizeArg = *(++ArgI); + Argument &FormatArg = *(++ArgI); + Modified |= replaceImageUses(Arg, ResourceID, SizeArg, FormatArg); + + // Handle sampler type. + } else if (IsSamplerType(Type)) { + uint32_t ResourceID = NumSamplerArgs++; + Modified |= replaceSamplerUses(Arg, ResourceID); + } + } + for (unsigned i = 0; i < InstsToErase.size(); ++i) { + InstsToErase[i]->eraseFromParent(); + } + + return Modified; + } + + std::tuple<Function *, MDNode *> + addImplicitArgs(Function *F, MDNode *KernelMDNode) { + bool Modified = false; + + FunctionType *FT = F->getFunctionType(); + SmallVector<Type *, 8> ArgTypes; + + // Metadata operands for new MDNode. + KernelArgMD NewArgMDs; + PushArgMD(NewArgMDs, GetArgMD(KernelMDNode, 0)); + + // Add implicit arguments to the signature. + for (unsigned i = 0; i < FT->getNumParams(); ++i) { + ArgTypes.push_back(FT->getParamType(i)); + MDVector ArgMD = GetArgMD(KernelMDNode, i + 1); + PushArgMD(NewArgMDs, ArgMD); + + if (!IsImageType(ArgTypeFromMD(KernelMDNode, i))) + continue; + + // Add size implicit argument. + ArgTypes.push_back(ImageSizeType); + ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageSizeArgMDType); + PushArgMD(NewArgMDs, ArgMD); + + // Add format implicit argument. + ArgTypes.push_back(ImageFormatType); + ArgMD[2] = ArgMD[3] = MDString::get(*Context, ImageFormatArgMDType); + PushArgMD(NewArgMDs, ArgMD); + + Modified = true; + } + if (!Modified) { + return std::make_tuple(nullptr, nullptr); + } + + // Create function with new signature and clone the old body into it. + auto NewFT = FunctionType::get(FT->getReturnType(), ArgTypes, false); + auto NewF = Function::Create(NewFT, F->getLinkage(), F->getName()); + ValueToValueMapTy VMap; + auto NewFArgIt = NewF->arg_begin(); + for (auto &Arg: F->args()) { + auto ArgName = Arg.getName(); + NewFArgIt->setName(ArgName); + VMap[&Arg] = &(*NewFArgIt++); + if (IsImageType(ArgTypeFromMD(KernelMDNode, Arg.getArgNo()))) { + (NewFArgIt++)->setName(Twine("__size_") + ArgName); + (NewFArgIt++)->setName(Twine("__format_") + ArgName); + } + } + SmallVector<ReturnInst*, 8> Returns; + CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns); + + // Build new MDNode. + SmallVector<llvm::Metadata *, 6> KernelMDArgs; + KernelMDArgs.push_back(ConstantAsMetadata::get(NewF)); + for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) + KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i])); + MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs); + + return std::make_tuple(NewF, NewMDNode); + } + + bool transformKernels(Module &M) { + NamedMDNode *KernelsMDNode = M.getNamedMetadata(KernelsMDNodeName); + if (!KernelsMDNode) + return false; + + bool Modified = false; + for (unsigned i = 0; i < KernelsMDNode->getNumOperands(); ++i) { + MDNode *KernelMDNode = KernelsMDNode->getOperand(i); + Function *F = GetFunctionFromMDNode(KernelMDNode); + if (!F) + continue; + + Function *NewF; + MDNode *NewMDNode; + std::tie(NewF, NewMDNode) = addImplicitArgs(F, KernelMDNode); + if (NewF) { + // Replace old function and metadata with new ones. 
+ F->eraseFromParent(); + M.getFunctionList().push_back(NewF); + M.getOrInsertFunction(NewF->getName(), NewF->getFunctionType(), + NewF->getAttributes()); + KernelsMDNode->setOperand(i, NewMDNode); + + F = NewF; + KernelMDNode = NewMDNode; + Modified = true; + } + + Modified |= replaceImageAndSamplerUses(F, KernelMDNode); + } + + return Modified; + } + + public: + AMDGPUOpenCLImageTypeLoweringPass() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + Context = &M.getContext(); + Int32Type = Type::getInt32Ty(M.getContext()); + ImageSizeType = ArrayType::get(Int32Type, 3); + ImageFormatType = ArrayType::get(Int32Type, 2); + + return transformKernels(M); + } + + const char *getPassName() const override { + return "AMDGPU OpenCL Image Type Pass"; + } +}; + +char AMDGPUOpenCLImageTypeLoweringPass::ID = 0; + +} // end anonymous namespace + +ModulePass *llvm::createAMDGPUOpenCLImageTypeLoweringPass() { + return new AMDGPUOpenCLImageTypeLoweringPass(); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 57b7a73..87d50d5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -54,7 +54,7 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) { bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { - const FunctionType *FTy = F.getFunctionType(); + FunctionType *FTy = F.getFunctionType(); LocalMemAvailable = ST.getLocalMemorySize(); @@ -63,7 +63,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // possible these arguments require the entire local memory space, so // we cannot use local memory in the pass. for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { - const Type *ParamTy = FTy->getParamType(i); + Type *ParamTy = FTy->getParamType(i); if (ParamTy->isPointerTy() && ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { LocalMemAvailable = 0; @@ -77,7 +77,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { // Check how much local memory is being used by global objects for (Module::global_iterator I = Mod->global_begin(), E = Mod->global_end(); I != E; ++I) { - GlobalVariable *GV = I; + GlobalVariable *GV = &*I; PointerType *GVTy = GV->getType(); if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) continue; @@ -101,7 +101,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { return false; } -static VectorType *arrayTypeToVecType(const Type *ArrayTy) { +static VectorType *arrayTypeToVecType(Type *ArrayTy) { return VectorType::get(ArrayTy->getArrayElementType(), ArrayTy->getArrayNumElements()); } @@ -276,6 +276,9 @@ static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { } void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { + if (!I.isStaticAlloca()) + return; + IRBuilder<> Builder(&I); // First try to replace the alloca with a vector diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h index cfd800b..0344834 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h @@ -37,10 +37,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { assert(!"Unimplemented"); return BitVector(); } - virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { - assert(!"Unimplemented"); return nullptr; - } - virtual unsigned getHWRegIndex(unsigned Reg) const { assert(!"Unimplemented"); return 0; } diff --git 
a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 5f32a65..44e0c47 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -16,6 +16,7 @@ #include "R600ISelLowering.h" #include "R600InstrInfo.h" #include "R600MachineScheduler.h" +#include "SIFrameLowering.h" #include "SIISelLowering.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -44,6 +45,8 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, // disable it. SmallString<256> FullFS("+promote-alloca,+fp64-denormals,"); + if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. + FullFS += "+flat-for-global,"; FullFS += FS; if (GPU == "" && TT.getArch() == Triple::amdgcn) @@ -67,26 +70,36 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, DumpCode(false), R600ALUInst(false), HasVertexCache(false), TexVTXClauseSize(0), Gen(AMDGPUSubtarget::R600), FP64(false), FP64Denormals(false), FP32Denormals(false), FastFMAF32(false), - CaymanISA(false), FlatAddressSpace(false), EnableIRStructurizer(true), - EnablePromoteAlloca(false), EnableIfCvt(true), EnableLoadStoreOpt(false), - EnableUnsafeDSOffsetFolding(false), + CaymanISA(false), FlatAddressSpace(false), FlatForGlobal(false), + EnableIRStructurizer(true), EnablePromoteAlloca(false), EnableIfCvt(true), + EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), WavefrontSize(0), CFALUBug(false), LocalMemorySize(0), EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false), GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0), IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), - FrameLowering(TargetFrameLowering::StackGrowsUp, - 64 * 16, // Maximum stack alignment (long16) - 0), + FrameLowering(nullptr), InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { initializeSubtargetDependencies(TT, GPU, FS); + const unsigned MaxStackAlign = 64 * 16; // Maximum stack alignment (long16) + if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { InstrInfo.reset(new R600InstrInfo(*this)); TLInfo.reset(new R600TargetLowering(TM, *this)); + + // FIXME: Should have R600 specific FrameLowering + FrameLowering.reset(new AMDGPUFrameLowering( + TargetFrameLowering::StackGrowsUp, + MaxStackAlign, + 0)); } else { InstrInfo.reset(new SIInstrInfo(*this)); TLInfo.reset(new SITargetLowering(TM, *this)); + FrameLowering.reset(new SIFrameLowering( + TargetFrameLowering::StackGrowsUp, + MaxStackAlign, + 0)); } } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 735f01d..9c7bb88 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -1,4 +1,4 @@ -//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// +//=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// // // The LLVM Compiler Infrastructure // @@ -12,17 +12,15 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H -#define LLVM_LIB_TARGET_R600_AMDGPUSUBTARGET_H +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H + #include "AMDGPU.h" #include "AMDGPUFrameLowering.h" #include "AMDGPUInstrInfo.h" -#include "AMDGPUIntrinsicInfo.h" +#include "AMDGPUISelLowering.h" #include "AMDGPUSubtarget.h" -#include "R600ISelLowering.h" -#include 
"AMDKernelCodeT.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Target/TargetSubtargetInfo.h" @@ -72,6 +70,7 @@ private: bool FastFMAF32; bool CaymanISA; bool FlatAddressSpace; + bool FlatForGlobal; bool EnableIRStructurizer; bool EnablePromoteAlloca; bool EnableIfCvt; @@ -88,10 +87,10 @@ private: bool CIInsts; bool FeatureDisable; int LDSBankCount; - unsigned IsaVersion; + unsigned IsaVersion; bool EnableHugeScratchBuffer; - AMDGPUFrameLowering FrameLowering; + std::unique_ptr<AMDGPUFrameLowering> FrameLowering; std::unique_ptr<AMDGPUTargetLowering> TLInfo; std::unique_ptr<AMDGPUInstrInfo> InstrInfo; InstrItineraryData InstrItins; @@ -104,7 +103,7 @@ public: StringRef GPU, StringRef FS); const AMDGPUFrameLowering *getFrameLowering() const override { - return &FrameLowering; + return FrameLowering.get(); } const AMDGPUInstrInfo *getInstrInfo() const override { return InstrInfo.get(); @@ -161,6 +160,10 @@ public: return FlatAddressSpace; } + bool useFlatForGlobal() const { + return FlatForGlobal; + } + bool hasBFE() const { return (getGeneration() >= EVERGREEN); } @@ -305,6 +308,9 @@ public: return isAmdHsaOS() ? 0 : 36; } + unsigned getMaxNumUserSGPRs() const { + return 16; + } }; } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 2297b52..22f85b3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUTargetMachine.h" +#include "AMDGPUTargetObjectFile.h" #include "AMDGPU.h" #include "AMDGPUTargetTransformInfo.h" #include "R600ISelLowering.h" @@ -41,6 +42,23 @@ extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); RegisterTargetMachine<GCNTargetMachine> Y(TheGCNTarget); + + PassRegistry *PR = PassRegistry::getPassRegistry(); + initializeSILowerI1CopiesPass(*PR); + initializeSIFixSGPRCopiesPass(*PR); + initializeSIFoldOperandsPass(*PR); + initializeSIFixSGPRLiveRangesPass(*PR); + initializeSIFixControlFlowLiveIntervalsPass(*PR); + initializeSILoadStoreOptimizerPass(*PR); + initializeAMDGPUAnnotateKernelFeaturesPass(*PR); + initializeAMDGPUAnnotateUniformValuesPass(*PR); +} + +static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { + if (TT.getOS() == Triple::AMDHSA) + return make_unique<AMDGPUHSATargetObjectFile>(); + + return make_unique<AMDGPUTargetObjectFile>(); } static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) { @@ -72,15 +90,13 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OptLevel) : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, RM, CM, OptLevel), - TLOF(new TargetLoweringObjectFileELF()), Subtarget(TT, CPU, FS, *this), + TLOF(createTLOF(getTargetTriple())), Subtarget(TT, CPU, FS, *this), IntrinsicInfo() { setRequiresStructuredCFG(true); initAsmInfo(); } -AMDGPUTargetMachine::~AMDGPUTargetMachine() { - delete TLOF; -} +AMDGPUTargetMachine::~AMDGPUTargetMachine() { } //===----------------------------------------------------------------------===// // R600 Target Machine (R600 -> Cayman) @@ -110,7 +126,13 @@ namespace { class AMDGPUPassConfig : public TargetPassConfig { public: AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) 
- : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + + // Exceptions and StackMaps are not supported, so these passes will never do + // anything. + disablePass(&StackMapLivenessID); + disablePass(&FuncletLayoutID); + } AMDGPUTargetMachine &getAMDGPUTargetMachine() const { return getTM<AMDGPUTargetMachine>(); @@ -126,8 +148,9 @@ public: void addIRPasses() override; void addCodeGenPrepare() override; - virtual bool addPreISel() override; - virtual bool addInstSelector() override; + bool addPreISel() override; + bool addInstSelector() override; + bool addGCPasses() override; }; class R600PassConfig : public AMDGPUPassConfig { @@ -147,6 +170,8 @@ public: : AMDGPUPassConfig(TM, PM) { } bool addPreISel() override; bool addInstSelector() override; + void addFastRegAlloc(FunctionPass *RegAllocPass) override; + void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreSched2() override; @@ -156,7 +181,7 @@ public: } // End of anonymous namespace TargetIRAnalysis AMDGPUTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo( AMDGPUTTIImpl(this, F.getParent()->getDataLayout())); }); @@ -172,6 +197,10 @@ void AMDGPUPassConfig::addIRPasses() { // functions, then we will generate code for the first function // without ever running any passes on the second. addPass(createBarrierNoopPass()); + + // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. + addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + TargetPassConfig::addIRPasses(); } @@ -198,6 +227,11 @@ bool AMDGPUPassConfig::addInstSelector() { return false; } +bool AMDGPUPassConfig::addGCPasses() { + // Do nothing. GC is not supported. + return false; +} + //===----------------------------------------------------------------------===// // R600 Pass Setup //===----------------------------------------------------------------------===// @@ -238,16 +272,23 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); + + // FIXME: We need to run a pass to propagate the attributes when calls are + // supported. + addPass(&AMDGPUAnnotateKernelFeaturesID); + addPass(createSinkingPass()); addPass(createSITypeRewriter()); addPass(createSIAnnotateControlFlowPass()); + addPass(createAMDGPUAnnotateUniformValues()); + return false; } bool GCNPassConfig::addInstSelector() { AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); - addPass(createSIFixSGPRCopiesPass(*TM)); + addPass(&SIFixSGPRCopiesID); addPass(createSIFoldOperandsPass()); return false; } @@ -259,7 +300,6 @@ void GCNPassConfig::addPreRegAlloc() { // earlier passes might recompute live intervals. // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass if (getOptLevel() > CodeGenOpt::None) { - initializeSIFixControlFlowLiveIntervalsPass(*PassRegistry::getPassRegistry()); insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); } @@ -269,16 +309,27 @@ void GCNPassConfig::addPreRegAlloc() { // This should be run after scheduling, but before register allocation. It // also need extra copies to the address operand to be eliminated. 
- initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); insertPass(&MachineSchedulerID, &RegisterCoalescerID); } addPass(createSIShrinkInstructionsPass(), false); - addPass(createSIFixSGPRLiveRangesPass(), false); +} + +void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { + addPass(&SIFixSGPRLiveRangesID); + TargetPassConfig::addFastRegAlloc(RegAllocPass); +} + +void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { + // We want to run this after LiveVariables is computed to avoid computing them + // twice. + // FIXME: We shouldn't disable the verifier here. r249087 introduced a failure + // that needs to be fixed. + insertPass(&LiveVariablesID, &SIFixSGPRLiveRangesID, /*VerifyAfter=*/false); + TargetPassConfig::addOptimizedRegAlloc(RegAllocPass); } void GCNPassConfig::addPostRegAlloc() { - addPass(createSIPrepareScratchRegs(), false); addPass(createSIShrinkInstructionsPass(), false); } diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 14792e3..236e3f8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -32,7 +32,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine { private: protected: - TargetLoweringObjectFile *TLOF; + std::unique_ptr<TargetLoweringObjectFile> TLOF; AMDGPUSubtarget Subtarget; AMDGPUIntrinsicInfo IntrinsicInfo; @@ -52,7 +52,7 @@ public: TargetIRAnalysis getTargetIRAnalysis() override; TargetLoweringObjectFile *getObjFileLowering() const override { - return TLOF; + return TLOF.get(); } }; diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp new file mode 100644 index 0000000..e050f21 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp @@ -0,0 +1,87 @@ +//===-- AMDGPUHSATargetObjectFile.cpp - AMDGPU Object Files ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetObjectFile.h" +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Generic Object File +//===----------------------------------------------------------------------===// + +MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, + SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const { + if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GV)) + return TextSection; + + return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM); +} + +//===----------------------------------------------------------------------===// +// HSA Object File +//===----------------------------------------------------------------------===// + + +void AMDGPUHSATargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM){ + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); + + TextSection = AMDGPU::getHSATextSection(Ctx); + + DataGlobalAgentSection = AMDGPU::getHSADataGlobalAgentSection(Ctx); + DataGlobalProgramSection = AMDGPU::getHSADataGlobalProgramSection(Ctx); + + RodataReadonlyAgentSection = AMDGPU::getHSARodataReadonlyAgentSection(Ctx); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocationSection( + const char *SectionName) const { + return cast<MCSectionELF>(DataGlobalAgentSection) + ->getSectionName() + .equals(SectionName); +} + +bool AMDGPUHSATargetObjectFile::isAgentAllocation(const GlobalValue *GV) const { + // Read-only segments can only have agent allocation. + return AMDGPU::isReadOnlySegment(GV) || + (AMDGPU::isGlobalSegment(GV) && GV->hasSection() && + isAgentAllocationSection(GV->getSection())); +} + +bool AMDGPUHSATargetObjectFile::isProgramAllocation( + const GlobalValue *GV) const { + // The default for global segments is program allocation. + return AMDGPU::isGlobalSegment(GV) && !isAgentAllocation(GV); +} + +MCSection *AMDGPUHSATargetObjectFile::SelectSectionForGlobal( + const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const { + if (Kind.isText() && !GV->hasComdat()) + return getTextSection(); + + if (AMDGPU::isGlobalSegment(GV)) { + if (isAgentAllocation(GV)) + return DataGlobalAgentSection; + + if (isProgramAllocation(GV)) + return DataGlobalProgramSection; + } + + return AMDGPUTargetObjectFile::SelectSectionForGlobal(GV, Kind, Mang, TM); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h new file mode 100644 index 0000000..921341e --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h @@ -0,0 +1,51 @@ +//===-- AMDGPUTargetObjectFile.h - AMDGPU Object Info ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file declares the AMDGPU-specific subclass of +/// TargetLoweringObjectFile. 
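// Illustrative sketch (not part of this change): the HSA section-selection
// policy of SelectSectionForGlobal above, reduced to one decision function.
// Comdat handling is omitted and the enum/function names are hypothetical.
enum class HSASection { Text, DataGlobalAgent, DataGlobalProgram, Fallback };

static HSASection pickHSASection(bool IsText, bool IsGlobalSegment,
                                 bool IsAgentAllocation) {
  if (IsText)
    return HSASection::Text;          // kernel code goes to the HSA text section
  if (IsGlobalSegment)
    return IsAgentAllocation ? HSASection::DataGlobalAgent
                             : HSASection::DataGlobalProgram;
  return HSASection::Fallback;        // defer to the generic ELF selection
}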
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H + +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class AMDGPUTargetObjectFile : public TargetLoweringObjectFileELF { + public: + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +class AMDGPUHSATargetObjectFile final : public AMDGPUTargetObjectFile { +private: + MCSection *DataGlobalAgentSection; + MCSection *DataGlobalProgramSection; + MCSection *RodataReadonlyAgentSection; + + bool isAgentAllocationSection(const char *SectionName) const; + bool isAgentAllocation(const GlobalValue *GV) const; + bool isProgramAllocation(const GlobalValue *GV) const; + +public: + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; + + MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, + Mangler &Mang, + const TargetMachine &TM) const override; +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 6dacc74..54a003d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -74,9 +74,109 @@ unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) { return 4 * 128; // XXX - 4 channels. Should these count as vector instead? } -unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool) { return 32; } +unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) { + return Vector ? 0 : 32; +} unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { // Semi-arbitrary large amount. return 64; } + +unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) { + // XXX - For some reason this isn't called for switch. + switch (Opcode) { + case Instruction::Br: + case Instruction::Ret: + return 10; + default: + return BaseT::getCFInstrCost(Opcode); + } +} + +int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { + switch (Opcode) { + case Instruction::ExtractElement: + // Dynamic indexing isn't free and is best avoided. + return Index == ~0u ? 2 : 0; + default: + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } +} + +static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII, + const IntrinsicInst *I) { + switch (I->getIntrinsicID()) { + default: + return false; + case Intrinsic::not_intrinsic: + // This means we have an intrinsic that isn't defined in + // IntrinsicsAMDGPU.td + break; + + case Intrinsic::amdgcn_interp_p1: + case Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_mbcnt_hi: + case Intrinsic::amdgcn_mbcnt_lo: + case Intrinsic::r600_read_tidig_x: + case Intrinsic::r600_read_tidig_y: + case Intrinsic::r600_read_tidig_z: + return true; + } + + StringRef Name = I->getCalledFunction()->getName(); + switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) { + default: + return false; + case AMDGPUIntrinsic::SI_tid: + case AMDGPUIntrinsic::SI_fs_interp: + return true; + } +} + +static bool isArgPassedInSGPR(const Argument *A) { + const Function *F = A->getParent(); + unsigned ShaderType = AMDGPU::getShaderType(*F); + + // Arguments to compute shaders are never a source of divergence. 
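// Illustrative sketch (not part of this change): intuition for the rule above.
// A value computed only from kernel arguments is identical in every lane of a
// wavefront (uniform, SGPR-friendly), while anything that mixes in the workitem
// id differs per lane (divergent) and must live in VGPRs. Names are hypothetical.
static int uniformAcrossWavefront(int KernelArg) {
  return KernelArg * 2;            // same result in every workitem
}
static int divergentAcrossWavefront(int KernelArg, int WorkitemIdX) {
  return KernelArg + WorkitemIdX;  // depends on the per-lane workitem id
}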
+ if (ShaderType == ShaderType::COMPUTE) + return true; + + // For non-compute shaders, SGPR inputs are marked with either inreg or byval. + if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) || + F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal)) + return true; + + // Everything else is in VGPRs. + return false; +} + +/// +/// \returns true if the result of the value could potentially be +/// different across workitems in a wavefront. +bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { + + if (const Argument *A = dyn_cast<Argument>(V)) + return !isArgPassedInSGPR(A); + + // Loads from the private address space are divergent, because threads + // can execute the load instruction with the same inputs and get different + // results. + // + // All other loads are not divergent, because if threads issue loads with the + // same arguments, they will always get the same result. + if (const LoadInst *Load = dyn_cast<LoadInst>(V)) + return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; + + if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { + const TargetMachine &TM = getTLI()->getTargetMachine(); + return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic); + } + + // Assume all function calls are a source of divergence. + if (isa<CallInst>(V) || isa<InvokeInst>(V)) + return true; + + return false; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index dee0a69..976afb0 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -60,6 +60,11 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); + + unsigned getCFInstrCost(unsigned Opcode); + + int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index); + bool isSourceOfDivergence(const Value *V) const; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index d918ac3..917efd1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -185,7 +185,7 @@ protected: MachinePostDominatorTree *PDT; MachineLoopInfo *MLI; const R600InstrInfo *TII; - const AMDGPURegisterInfo *TRI; + const R600RegisterInfo *TRI; // PRINT FUNCTIONS /// Print the ordered Blocks. @@ -881,7 +881,7 @@ bool AMDGPUCFGStructurizer::run() { } //while, "one iteration" over the function. MachineBasicBlock *EntryMBB = - GraphTraits<MachineFunction *>::nodes_begin(FuncRep); + &*GraphTraits<MachineFunction *>::nodes_begin(FuncRep); if (EntryMBB->succ_size() == 0) { Finish = true; DEBUG( @@ -904,7 +904,7 @@ bool AMDGPUCFGStructurizer::run() { } while (!Finish && MakeProgress); // Misc wrap up to maintain the consistency of the Function representation. - wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); + wrapup(&*GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); // Detach retired Block, release memory. 
for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); @@ -1164,7 +1164,7 @@ int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep, for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(), E = ContMBB.end(); It != E; ++It) { - (*It)->removeSuccessor(LoopHeader); + (*It)->removeSuccessor(LoopHeader, true); } numLoopcontPatternMatch += NumCont; @@ -1353,7 +1353,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // If MigrateTrue is true, then TrueBB is the block being "branched into" // and if MigrateFalse is true, then FalseBB is the block being // "branched into" - // + // // Here is the pseudo code for how I think the optimization should work: // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head. // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from. @@ -1372,7 +1372,7 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB, // the late machine optimization passes, however if we implement // bool TargetRegisterInfo::requiresRegisterScavenging( // const MachineFunction &MF) - // and have it return true, liveness will be tracked correctly + // and have it return true, liveness will be tracked correctly // by generic optimization passes. We will also need to make sure that // all of our target-specific passes that run after regalloc and before // the CFGStructurizer track liveness and we will need to modify this pass @@ -1487,7 +1487,7 @@ void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB, ); DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end()); - DstMBB->removeSuccessor(SrcMBB); + DstMBB->removeSuccessor(SrcMBB, true); cloneSuccessorList(DstMBB, SrcMBB); removeSuccessor(SrcMBB); @@ -1537,9 +1537,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, if (TrueMBB) { MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end()); - MBB->removeSuccessor(TrueMBB); + MBB->removeSuccessor(TrueMBB, true); if (LandMBB && TrueMBB->succ_size()!=0) - TrueMBB->removeSuccessor(LandMBB); + TrueMBB->removeSuccessor(LandMBB, true); retireBlock(TrueMBB); MLI->removeBlock(TrueMBB); } @@ -1548,9 +1548,9 @@ void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI, insertInstrBefore(I, AMDGPU::ELSE); MBB->splice(I, FalseMBB, FalseMBB->begin(), FalseMBB->end()); - MBB->removeSuccessor(FalseMBB); + MBB->removeSuccessor(FalseMBB, true); if (LandMBB && FalseMBB->succ_size() != 0) - FalseMBB->removeSuccessor(LandMBB); + FalseMBB->removeSuccessor(LandMBB, true); retireBlock(FalseMBB); MLI->removeBlock(FalseMBB); } @@ -1570,8 +1570,7 @@ void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk, insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc()); insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc()); - DstBlk->addSuccessor(LandMBB); - DstBlk->removeSuccessor(DstBlk); + DstBlk->replaceSuccessor(DstBlk, LandMBB); } @@ -1592,7 +1591,7 @@ void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB, //now branchInst can be erase safely BranchMI->eraseFromParent(); //now take care of successors, retire blocks - ExitingMBB->removeSuccessor(LandMBB); + ExitingMBB->removeSuccessor(LandMBB, true); } void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB, @@ -1666,8 +1665,7 @@ AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB, replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB); //srcBlk, oldBlk, newBlk - PredMBB->removeSuccessor(MBB); - 
PredMBB->addSuccessor(CloneMBB); + PredMBB->replaceSuccessor(MBB, CloneMBB); // add all successor to cloneBlk cloneSuccessorList(CloneMBB, MBB); @@ -1695,10 +1693,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, ); SpliceEnd = SrcMBB->end(); } else { - DEBUG( - dbgs() << "migrateInstruction see branch instr\n" ; - BranchMI->dump(); - ); + DEBUG(dbgs() << "migrateInstruction see branch instr: " << *BranchMI); SpliceEnd = BranchMI; } DEBUG( @@ -1711,7 +1706,7 @@ void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB, DEBUG( dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size() - << "srcSize = " << SrcMBB->size() << "\n"; + << "srcSize = " << SrcMBB->size() << '\n'; ); } @@ -1743,7 +1738,7 @@ void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) { // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. while ((BranchMI = getLoopendBlockBranchInstr(MBB)) && isUncondBranch(BranchMI)) { - DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump();); + DEBUG(dbgs() << "Removing uncond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); } } @@ -1759,10 +1754,10 @@ void AMDGPUCFGStructurizer::removeRedundantConditionalBranch( MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB); assert(BranchMI && isCondBranch(BranchMI)); - DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump();); + DEBUG(dbgs() << "Removing unneeded cond branch instr: " << *BranchMI); BranchMI->eraseFromParent(); SHOWNEWBLK(MBB1, "Removing redundant successor"); - MBB->removeSuccessor(MBB1); + MBB->removeSuccessor(MBB1, true); } void AMDGPUCFGStructurizer::addDummyExitBlock( diff --git a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2018983..d9f753f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -28,7 +28,9 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" @@ -83,6 +85,7 @@ public: unsigned RegNo; int Modifiers; const MCRegisterInfo *TRI; + const MCSubtargetInfo *STI; bool IsForcedVOP3; }; @@ -102,7 +105,7 @@ public: } void addRegOperands(MCInst &Inst, unsigned N) const { - Inst.addOperand(MCOperand::createReg(getReg())); + Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), *Reg.STI))); } void addRegOrImmOperands(MCInst &Inst, unsigned N) const { @@ -215,6 +218,10 @@ public: (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)); } + bool isSCSrc64() const { + return (isReg() && isRegClass(AMDGPU::SReg_64RegClassID)) || isInlineImm(); + } + bool isVCSrc32() const { return isInlineImm() || (isReg() && isRegClass(AMDGPU::VS_32RegClassID)); } @@ -251,7 +258,22 @@ public: return EndLoc; } - void print(raw_ostream &OS) const override { } + void print(raw_ostream &OS) const override { + switch (Kind) { + case Register: + OS << "<register " << getReg() << " mods: " << Reg.Modifiers << '>'; + break; + case Immediate: + OS << getImm(); + break; + case Token: + OS << '\'' << getToken() << '\''; + break; + case Expression: + OS << "<expr " << *Expr << '>'; + break; + } + } static std::unique_ptr<AMDGPUOperand> CreateImm(int64_t Val, SMLoc 
Loc, enum ImmTy Type = ImmTyNone, @@ -278,10 +300,12 @@ public: static std::unique_ptr<AMDGPUOperand> CreateReg(unsigned RegNo, SMLoc S, SMLoc E, const MCRegisterInfo *TRI, + const MCSubtargetInfo *STI, bool ForceVOP3) { auto Op = llvm::make_unique<AMDGPUOperand>(Register); Op->Reg.RegNo = RegNo; Op->Reg.TRI = TRI; + Op->Reg.STI = STI; Op->Reg.Modifiers = -1; Op->Reg.IsForcedVOP3 = ForceVOP3; Op->StartLoc = S; @@ -301,14 +325,32 @@ public: bool isDSOffset01() const; bool isSWaitCnt() const; bool isMubufOffset() const; + bool isSMRDOffset() const; + bool isSMRDLiteralOffset() const; }; class AMDGPUAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; MCAsmParser &Parser; unsigned ForcedEncodingSize; + + bool isSI() const { + return AMDGPU::isSI(getSTI()); + } + + bool isCI() const { + return AMDGPU::isCI(getSTI()); + } + + bool isVI() const { + return AMDGPU::isVI(getSTI()); + } + + bool hasSGPR102_SGPR103() const { + return !isVI(); + } + /// @name Auto-generated Match Functions /// { @@ -323,20 +365,34 @@ private: bool ParseDirectiveHSACodeObjectISA(); bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header); bool ParseDirectiveAMDKernelCodeT(); + bool ParseSectionDirectiveHSAText(); + bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const; + bool ParseDirectiveAMDGPUHsaKernel(); + bool ParseDirectiveAMDGPUHsaModuleGlobal(); + bool ParseDirectiveAMDGPUHsaProgramGlobal(); + bool ParseSectionDirectiveHSADataGlobalAgent(); + bool ParseSectionDirectiveHSADataGlobalProgram(); + bool ParseSectionDirectiveHSARodataReadonlyAgent(); public: - AMDGPUAsmParser(MCSubtargetInfo &STI, MCAsmParser &_Parser, +public: + enum AMDGPUMatchResultTy { + Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY + }; + + AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI), MII(MII), Parser(_Parser), - ForcedEncodingSize(0){ + : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser), + ForcedEncodingSize(0) { + MCAsmParserExtension::Initialize(Parser); - if (STI.getFeatureBits().none()) { + if (getSTI().getFeatureBits().none()) { // Set default features. 
- STI.ToggleFeature("SOUTHERN_ISLANDS"); + copySTI().ToggleFeature("SOUTHERN_ISLANDS"); } - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } AMDGPUTargetStreamer &getTargetStreamer() { @@ -420,10 +476,10 @@ struct OptionalOperand { } -static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { +static int getRegClass(bool IsVgpr, unsigned RegWidth) { if (IsVgpr) { switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); + default: return -1; case 1: return AMDGPU::VGPR_32RegClassID; case 2: return AMDGPU::VReg_64RegClassID; case 3: return AMDGPU::VReg_96RegClassID; @@ -434,7 +490,7 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { } switch (RegWidth) { - default: llvm_unreachable("Unknown register width"); + default: return -1; case 1: return AMDGPU::SGPR_32RegClassID; case 2: return AMDGPU::SGPR_64RegClassID; case 4: return AMDGPU::SReg_128RegClassID; @@ -443,16 +499,16 @@ static unsigned getRegClass(bool IsVgpr, unsigned RegWidth) { } } -static unsigned getRegForName(const StringRef &RegName) { +static unsigned getRegForName(StringRef RegName) { return StringSwitch<unsigned>(RegName) .Case("exec", AMDGPU::EXEC) .Case("vcc", AMDGPU::VCC) - .Case("flat_scr", AMDGPU::FLAT_SCR) + .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("m0", AMDGPU::M0) .Case("scc", AMDGPU::SCC) - .Case("flat_scr_lo", AMDGPU::FLAT_SCR_LO) - .Case("flat_scr_hi", AMDGPU::FLAT_SCR_HI) + .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) + .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) .Case("vcc_lo", AMDGPU::VCC_LO) .Case("vcc_hi", AMDGPU::VCC_HI) .Case("exec_lo", AMDGPU::EXEC_LO) @@ -464,12 +520,14 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End const AsmToken Tok = Parser.getTok(); StartLoc = Tok.getLoc(); EndLoc = Tok.getEndLoc(); - const StringRef &RegName = Tok.getString(); + const MCRegisterInfo *TRI = getContext().getRegisterInfo(); + + StringRef RegName = Tok.getString(); RegNo = getRegForName(RegName); if (RegNo) { Parser.Lex(); - return false; + return !subtargetHasRegister(*TRI, RegNo); } // Match vgprs and sgprs @@ -514,16 +572,24 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &End RegIndexInClass = RegLo; } else { // SGPR registers are aligned. Max alignment is 4 dwords. 
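// Illustrative example (not part of this change): under the alignment rule
// below, s[4:7] (RegLo = 4, RegWidth = 4) is accepted and maps to index
// 4 / 4 = 1 in the 128-bit register class, while s[5:8] is rejected because
// 5 % min(RegWidth, 4) != 0. The helper name is hypothetical.
static bool isAlignedSGPRRange(unsigned RegLo, unsigned RegWidth) {
  unsigned Size = RegWidth < 4 ? RegWidth : 4;  // max alignment is 4 dwords
  return RegLo % Size == 0;                     // mirrors the check added below
}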
- RegIndexInClass = RegLo / std::min(RegWidth, 4u); + unsigned Size = std::min(RegWidth, 4u); + if (RegLo % Size != 0) + return true; + + RegIndexInClass = RegLo / Size; } } - const MCRegisterInfo *TRC = getContext().getRegisterInfo(); - unsigned RC = getRegClass(IsVgpr, RegWidth); - if (RegIndexInClass > TRC->getRegClass(RC).getNumRegs()) + int RCID = getRegClass(IsVgpr, RegWidth); + if (RCID == -1) return true; - RegNo = TRC->getRegClass(RC).getRegister(RegIndexInClass); - return false; + + const MCRegisterClass RC = TRI->getRegClass(RCID); + if (RegIndexInClass >= RC.getNumRegs()) + return true; + + RegNo = RC.getRegister(RegIndexInClass); + return !subtargetHasRegister(*TRI, RegNo); } unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { @@ -534,6 +600,11 @@ unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) { (getForcedEncodingSize() == 64 && !(TSFlags & SIInstrFlags::VOP3))) return Match_InvalidOperand; + if ((TSFlags & SIInstrFlags::VOP3) && + (TSFlags & SIInstrFlags::VOPAsmPrefer32Bit) && + getForcedEncodingSize() != 64) + return Match_PreferE32; + return Match_Success; } @@ -549,7 +620,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, default: break; case Match_Success: Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: return Error(IDLoc, "instruction not supported on this GPU"); @@ -592,6 +663,9 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } return Error(ErrorLoc, "invalid operand for instruction"); } + case Match_PreferE32: + return Error(IDLoc, "internal error: instruction without _e64 suffix " + "should be encoded as e32"); } llvm_unreachable("Implement any new match types added!"); } @@ -640,7 +714,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { // If this directive has no arguments, then use the ISA version for the // targeted GPU. 
if (getLexer().is(AsmToken::EndOfStatement)) { - AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(STI.getFeatureBits()); + AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits()); getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor, Isa.Stepping, "AMD", "AMDGPU"); @@ -852,7 +926,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { amd_kernel_code_t Header; - AMDGPU::initDefaultAMDKernelCodeT(Header, STI.getFeatureBits()); + AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits()); while (true) { @@ -882,6 +956,64 @@ bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() { return false; } +bool AMDGPUAsmParser::ParseSectionDirectiveHSAText() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSATextSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaKernel() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef KernelName = Parser.getTok().getString(); + + getTargetStreamer().EmitAMDGPUSymbolType(KernelName, + ELF::STT_AMDGPU_HSA_KERNEL); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaModuleGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaModuleScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseDirectiveAMDGPUHsaProgramGlobal() { + if (getLexer().isNot(AsmToken::Identifier)) + return TokError("expected symbol name"); + + StringRef GlobalName = Parser.getTok().getIdentifier(); + + getTargetStreamer().EmitAMDGPUHsaProgramScopeGlobal(GlobalName); + Lex(); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalAgentSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSADataGlobalProgram() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSADataGlobalProgramSection(getContext())); + return false; +} + +bool AMDGPUAsmParser::ParseSectionDirectiveHSARodataReadonlyAgent() { + getParser().getStreamer().SwitchSection( + AMDGPU::getHSARodataReadonlyAgentSection(getContext())); + return false; +} + bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getString(); @@ -894,6 +1026,55 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amd_kernel_code_t") return ParseDirectiveAMDKernelCodeT(); + if (IDVal == ".hsatext" || IDVal == ".text") + return ParseSectionDirectiveHSAText(); + + if (IDVal == ".amdgpu_hsa_kernel") + return ParseDirectiveAMDGPUHsaKernel(); + + if (IDVal == ".amdgpu_hsa_module_global") + return ParseDirectiveAMDGPUHsaModuleGlobal(); + + if (IDVal == ".amdgpu_hsa_program_global") + return ParseDirectiveAMDGPUHsaProgramGlobal(); + + if (IDVal == ".hsadata_global_agent") + return ParseSectionDirectiveHSADataGlobalAgent(); + + if (IDVal == ".hsadata_global_program") + return ParseSectionDirectiveHSADataGlobalProgram(); + + if (IDVal == ".hsarodata_readonly_agent") + return ParseSectionDirectiveHSARodataReadonlyAgent(); + + return true; +} + +bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI, + unsigned RegNo) const { + if (isCI()) + return true; + + if (isSI()) { + // No flat_scr + switch (RegNo) { + case AMDGPU::FLAT_SCR: + case AMDGPU::FLAT_SCR_LO: + 
case AMDGPU::FLAT_SCR_HI: + return false; + default: + return true; + } + } + + // VI only has 102 SGPRs, so make sure we aren't trying to use the 2 more that + // SI/CI have. + for (MCRegAliasIterator R(AMDGPU::SGPR102_SGPR103, &MRI, true); + R.isValid(); ++R) { + if (*R == RegNo) + return false; + } + return true; } @@ -943,13 +1124,11 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { int64_t IntVal; if (getParser().parseAbsoluteExpression(IntVal)) return MatchOperand_ParseFail; - APInt IntVal32(32, IntVal); - if (IntVal32.getSExtValue() != IntVal) { + if (!isInt<32>(IntVal) && !isUInt<32>(IntVal)) { Error(S, "invalid immediate: only 32-bit values are legal"); return MatchOperand_ParseFail; } - IntVal = IntVal32.getSExtValue(); if (Negate) IntVal *= -1; Operands.push_back(AMDGPUOperand::CreateImm(IntVal, S)); @@ -1002,7 +1181,7 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { Operands.push_back(AMDGPUOperand::CreateReg( - RegNo, S, E, getContext().getRegisterInfo(), + RegNo, S, E, getContext().getRegisterInfo(), &getSTI(), isForcedVOP3())); if (HasModifiers || Modifiers) { @@ -1571,6 +1750,23 @@ AMDGPUAsmParser::parseR128(OperandVector &Operands) { } //===----------------------------------------------------------------------===// +// smrd +//===----------------------------------------------------------------------===// + +bool AMDGPUOperand::isSMRDOffset() const { + + // FIXME: Support 20-bit offsets on VI. We need to to pass subtarget + // information here. + return isImm() && isUInt<8>(getImm()); +} + +bool AMDGPUOperand::isSMRDLiteralOffset() const { + // 32-bit literals are only supported on CI and we only want to use them + // when the offset is > 8-bits. + return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm()); +} + +//===----------------------------------------------------------------------===// // vop3 //===----------------------------------------------------------------------===// @@ -1653,8 +1849,12 @@ AMDGPUAsmParser::parseVOP3OptionalOps(OperandVector &Operands) { } void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { - ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); - unsigned i = 2; + + unsigned i = 1; + const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); + if (Desc.getNumDefs() > 0) { + ((AMDGPUOperand &)*Operands[i++]).addRegOperands(Inst, 1); + } std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx; diff --git a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td index 2f5fdbe..88a090d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/CIInstructions.td @@ -8,6 +8,22 @@ //===----------------------------------------------------------------------===// // Instruction definitions for CI and newer. 
//===----------------------------------------------------------------------===// +// Remaining instructions: +// S_CBRANCH_CDBGUSER +// S_CBRANCH_CDBGSYS +// S_CBRANCH_CDBGSYS_OR_USER +// S_CBRANCH_CDBGSYS_AND_USER +// DS_NOP +// DS_GWS_SEMA_RELEASE_ALL +// DS_WRAP_RTN_B32 +// DS_CNDXCHG32_RTN_B64 +// DS_WRITE_B96 +// DS_WRITE_B128 +// DS_CONDXCHG32_RTN_B128 +// DS_READ_B96 +// DS_READ_B128 +// BUFFER_LOAD_DWORDX3 +// BUFFER_STORE_DWORDX3 def isCIVI : Predicate < @@ -23,6 +39,7 @@ def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">; let SubtargetPredicate = isCIVI in { +let SchedRW = [WriteDoubleAdd] in { defm V_TRUNC_F64 : VOP1Inst <vop1<0x17>, "v_trunc_f64", VOP_F64_F64, ftrunc >; @@ -35,82 +52,218 @@ defm V_FLOOR_F64 : VOP1Inst <vop1<0x1A>, "v_floor_f64", defm V_RNDNE_F64 : VOP1Inst <vop1<0x19>, "v_rndne_f64", VOP_F64_F64, frint >; +} // End SchedRW = [WriteDoubleAdd] + +let SchedRW = [WriteQuarterRate32] in { defm V_LOG_LEGACY_F32 : VOP1Inst <vop1<0x45, 0x4c>, "v_log_legacy_f32", VOP_F32_F32 >; defm V_EXP_LEGACY_F32 : VOP1Inst <vop1<0x46, 0x4b>, "v_exp_legacy_f32", VOP_F32_F32 >; +} // End SchedRW = [WriteQuarterRate32] + +//===----------------------------------------------------------------------===// +// VOP3 Instructions +//===----------------------------------------------------------------------===// + +defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", + VOP_I32_I32_I32 +>; +defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", + VOP_I32_I32_I32 +>; + +let isCommutable = 1 in { +defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", + VOP_I64_I32_I32_I64 +>; + +// XXX - Does this set VCC? +defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", + VOP_I64_I32_I32_I64 +>; +} // End isCommutable = 1 + + +//===----------------------------------------------------------------------===// +// DS Instructions +//===----------------------------------------------------------------------===// +defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; + +// DS_CONDXCHG32_RTN_B64 +// DS_CONDXCHG32_RTN_B128 + +//===----------------------------------------------------------------------===// +// SMRD Instructions +//===----------------------------------------------------------------------===// + +defm S_DCACHE_INV_VOL : SMRD_Inval <smrd<0x1d, 0x22>, + "s_dcache_inv_vol", int_amdgcn_s_dcache_inv_vol>; + +//===----------------------------------------------------------------------===// +// MUBUF Instructions +//===----------------------------------------------------------------------===// + +defm BUFFER_WBINVL1_VOL : MUBUF_Invalidate <mubuf<0x70, 0x3f>, + "buffer_wbinvl1_vol", int_amdgcn_buffer_wbinvl1_vol +>; //===----------------------------------------------------------------------===// // Flat Instructions //===----------------------------------------------------------------------===// -def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x8, "flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x9, "flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Helper <0xa, "flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0xb, "flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Helper <0xc, "flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0xd, "flat_load_dwordx2", VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0xe, "flat_load_dwordx4", VReg_128>; -def 
FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0xf, "flat_load_dwordx3", VReg_96>; -def FLAT_STORE_BYTE : FLAT_Store_Helper <0x18, "flat_store_byte", VGPR_32>; -def FLAT_STORE_SHORT : FLAT_Store_Helper <0x1a, "flat_store_short", VGPR_32>; -def FLAT_STORE_DWORD : FLAT_Store_Helper <0x1c, "flat_store_dword", VGPR_32>; -def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < - 0x1d, "flat_store_dwordx2", VReg_64 +defm FLAT_LOAD_UBYTE : FLAT_Load_Helper < + flat<0x8, 0x10>, "flat_load_ubyte", VGPR_32 +>; +defm FLAT_LOAD_SBYTE : FLAT_Load_Helper < + flat<0x9, 0x11>, "flat_load_sbyte", VGPR_32 +>; +defm FLAT_LOAD_USHORT : FLAT_Load_Helper < + flat<0xa, 0x12>, "flat_load_ushort", VGPR_32 +>; +defm FLAT_LOAD_SSHORT : FLAT_Load_Helper < + flat<0xb, 0x13>, "flat_load_sshort", VGPR_32> +; +defm FLAT_LOAD_DWORD : FLAT_Load_Helper < + flat<0xc, 0x14>, "flat_load_dword", VGPR_32 +>; +defm FLAT_LOAD_DWORDX2 : FLAT_Load_Helper < + flat<0xd, 0x15>, "flat_load_dwordx2", VReg_64 +>; +defm FLAT_LOAD_DWORDX4 : FLAT_Load_Helper < + flat<0xe, 0x17>, "flat_load_dwordx4", VReg_128 +>; +defm FLAT_LOAD_DWORDX3 : FLAT_Load_Helper < + flat<0xf, 0x16>, "flat_load_dwordx3", VReg_96 +>; +defm FLAT_STORE_BYTE : FLAT_Store_Helper < + flat<0x18>, "flat_store_byte", VGPR_32 +>; +defm FLAT_STORE_SHORT : FLAT_Store_Helper < + flat <0x1a>, "flat_store_short", VGPR_32 +>; +defm FLAT_STORE_DWORD : FLAT_Store_Helper < + flat<0x1c>, "flat_store_dword", VGPR_32 +>; +defm FLAT_STORE_DWORDX2 : FLAT_Store_Helper < + flat<0x1d>, "flat_store_dwordx2", VReg_64 +>; +defm FLAT_STORE_DWORDX4 : FLAT_Store_Helper < + flat<0x1e, 0x1f>, "flat_store_dwordx4", VReg_128 >; -def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < - 0x1e, "flat_store_dwordx4", VReg_128 +defm FLAT_STORE_DWORDX3 : FLAT_Store_Helper < + flat<0x1f, 0x1e>, "flat_store_dwordx3", VReg_96 >; -def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < - 0x1f, "flat_store_dwordx3", VReg_96 +defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC < + flat<0x30, 0x40>, "flat_atomic_swap", VGPR_32 >; -defm FLAT_ATOMIC_SWAP : FLAT_ATOMIC <0x30, "flat_atomic_swap", VGPR_32>; defm FLAT_ATOMIC_CMPSWAP : FLAT_ATOMIC < - 0x31, "flat_atomic_cmpswap", VGPR_32, VReg_64 ->; -defm FLAT_ATOMIC_ADD : FLAT_ATOMIC <0x32, "flat_atomic_add", VGPR_32>; -defm FLAT_ATOMIC_SUB : FLAT_ATOMIC <0x33, "flat_atomic_sub", VGPR_32>; -defm FLAT_ATOMIC_RSUB : FLAT_ATOMIC <0x34, "flat_atomic_rsub", VGPR_32>; -defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC <0x35, "flat_atomic_smin", VGPR_32>; -defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC <0x36, "flat_atomic_umin", VGPR_32>; -defm FLAT_ATOMIC_SMAX : FLAT_ATOMIC <0x37, "flat_atomic_smax", VGPR_32>; -defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC <0x38, "flat_atomic_umax", VGPR_32>; -defm FLAT_ATOMIC_AND : FLAT_ATOMIC <0x39, "flat_atomic_and", VGPR_32>; -defm FLAT_ATOMIC_OR : FLAT_ATOMIC <0x3a, "flat_atomic_or", VGPR_32>; -defm FLAT_ATOMIC_XOR : FLAT_ATOMIC <0x3b, "flat_atomic_xor", VGPR_32>; -defm FLAT_ATOMIC_INC : FLAT_ATOMIC <0x3c, "flat_atomic_inc", VGPR_32>; -defm FLAT_ATOMIC_DEC : FLAT_ATOMIC <0x3d, "flat_atomic_dec", VGPR_32>; -defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < - 0x3e, "flat_atomic_fcmpswap", VGPR_32, VReg_64 + flat<0x31, 0x41>, "flat_atomic_cmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_ADD : FLAT_ATOMIC < + flat<0x32, 0x42>, "flat_atomic_add", VGPR_32 +>; +defm FLAT_ATOMIC_SUB : FLAT_ATOMIC < + flat<0x33, 0x43>, "flat_atomic_sub", VGPR_32 +>; +defm FLAT_ATOMIC_SMIN : FLAT_ATOMIC < + flat<0x35, 0x44>, "flat_atomic_smin", VGPR_32 +>; +defm FLAT_ATOMIC_UMIN : FLAT_ATOMIC < + flat<0x36, 0x45>, "flat_atomic_umin", VGPR_32 +>; +defm 
FLAT_ATOMIC_SMAX : FLAT_ATOMIC < + flat<0x37, 0x46>, "flat_atomic_smax", VGPR_32 +>; +defm FLAT_ATOMIC_UMAX : FLAT_ATOMIC < + flat<0x38, 0x47>, "flat_atomic_umax", VGPR_32 +>; +defm FLAT_ATOMIC_AND : FLAT_ATOMIC < + flat<0x39, 0x48>, "flat_atomic_and", VGPR_32 +>; +defm FLAT_ATOMIC_OR : FLAT_ATOMIC < + flat<0x3a, 0x49>, "flat_atomic_or", VGPR_32 +>; +defm FLAT_ATOMIC_XOR : FLAT_ATOMIC < + flat<0x3b, 0x4a>, "flat_atomic_xor", VGPR_32 +>; +defm FLAT_ATOMIC_INC : FLAT_ATOMIC < + flat<0x3c, 0x4b>, "flat_atomic_inc", VGPR_32 +>; +defm FLAT_ATOMIC_DEC : FLAT_ATOMIC < + flat<0x3d, 0x4c>, "flat_atomic_dec", VGPR_32 +>; +defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC < + flat<0x50, 0x60>, "flat_atomic_swap_x2", VReg_64 >; -defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC <0x3f, "flat_atomic_fmin", VGPR_32>; -defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC <0x40, "flat_atomic_fmax", VGPR_32>; -defm FLAT_ATOMIC_SWAP_X2 : FLAT_ATOMIC <0x50, "flat_atomic_swap_x2", VReg_64>; defm FLAT_ATOMIC_CMPSWAP_X2 : FLAT_ATOMIC < - 0x51, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 ->; -defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC <0x52, "flat_atomic_add_x2", VReg_64>; -defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC <0x53, "flat_atomic_sub_x2", VReg_64>; -defm FLAT_ATOMIC_RSUB_X2 : FLAT_ATOMIC <0x54, "flat_atomic_rsub_x2", VReg_64>; -defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC <0x55, "flat_atomic_smin_x2", VReg_64>; -defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC <0x56, "flat_atomic_umin_x2", VReg_64>; -defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC <0x57, "flat_atomic_smax_x2", VReg_64>; -defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC <0x58, "flat_atomic_umax_x2", VReg_64>; -defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC <0x59, "flat_atomic_and_x2", VReg_64>; -defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC <0x5a, "flat_atomic_or_x2", VReg_64>; -defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC <0x5b, "flat_atomic_xor_x2", VReg_64>; -defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC <0x5c, "flat_atomic_inc_x2", VReg_64>; -defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC <0x5d, "flat_atomic_dec_x2", VReg_64>; -defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < - 0x5e, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 + flat<0x51, 0x61>, "flat_atomic_cmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_ADD_X2 : FLAT_ATOMIC < + flat<0x52, 0x62>, "flat_atomic_add_x2", VReg_64 +>; +defm FLAT_ATOMIC_SUB_X2 : FLAT_ATOMIC < + flat<0x53, 0x63>, "flat_atomic_sub_x2", VReg_64 +>; +defm FLAT_ATOMIC_SMIN_X2 : FLAT_ATOMIC < + flat<0x55, 0x64>, "flat_atomic_smin_x2", VReg_64 +>; +defm FLAT_ATOMIC_UMIN_X2 : FLAT_ATOMIC < + flat<0x56, 0x65>, "flat_atomic_umin_x2", VReg_64 +>; +defm FLAT_ATOMIC_SMAX_X2 : FLAT_ATOMIC < + flat<0x57, 0x66>, "flat_atomic_smax_x2", VReg_64 +>; +defm FLAT_ATOMIC_UMAX_X2 : FLAT_ATOMIC < + flat<0x58, 0x67>, "flat_atomic_umax_x2", VReg_64 +>; +defm FLAT_ATOMIC_AND_X2 : FLAT_ATOMIC < + flat<0x59, 0x68>, "flat_atomic_and_x2", VReg_64 +>; +defm FLAT_ATOMIC_OR_X2 : FLAT_ATOMIC < + flat<0x5a, 0x69>, "flat_atomic_or_x2", VReg_64 +>; +defm FLAT_ATOMIC_XOR_X2 : FLAT_ATOMIC < + flat<0x5b, 0x6a>, "flat_atomic_xor_x2", VReg_64 +>; +defm FLAT_ATOMIC_INC_X2 : FLAT_ATOMIC < + flat<0x5c, 0x6b>, "flat_atomic_inc_x2", VReg_64 +>; +defm FLAT_ATOMIC_DEC_X2 : FLAT_ATOMIC < + flat<0x5d, 0x6c>, "flat_atomic_dec_x2", VReg_64 >; -defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC <0x5f, "flat_atomic_fmin_x2", VReg_64>; -defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC <0x60, "flat_atomic_fmax_x2", VReg_64>; } // End SubtargetPredicate = isCIVI +// CI Only flat instructions + +let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst in { + +defm FLAT_ATOMIC_FCMPSWAP : FLAT_ATOMIC < + 
flat<0x3e>, "flat_atomic_fcmpswap", VGPR_32, VReg_64 +>; +defm FLAT_ATOMIC_FMIN : FLAT_ATOMIC < + flat<0x3f>, "flat_atomic_fmin", VGPR_32 +>; +defm FLAT_ATOMIC_FMAX : FLAT_ATOMIC < + flat<0x40>, "flat_atomic_fmax", VGPR_32 +>; +defm FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_ATOMIC < + flat<0x5e>, "flat_atomic_fcmpswap_x2", VReg_64, VReg_128 +>; +defm FLAT_ATOMIC_FMIN_X2 : FLAT_ATOMIC < + flat<0x5f>, "flat_atomic_fmin_x2", VReg_64 +>; +defm FLAT_ATOMIC_FMAX_X2 : FLAT_ATOMIC < + flat<0x60>, "flat_atomic_fmax_x2", VReg_64 +>; + +} // End let SubtargetPredicate = isCI, VIAssemblerPredicate = DisableInst + //===----------------------------------------------------------------------===// // Flat Patterns //===----------------------------------------------------------------------===// @@ -147,3 +300,80 @@ def : FLATStore_Pattern <FLAT_STORE_DWORDX4, v4i32, flat_store>; } // End HasFlatAddressSpace predicate +let Predicates = [isCI] in { + +// Convert (x - floor(x)) to fract(x) +def : Pat < + (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), + (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), + (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +// Convert (x + (-floor(x))) to fract(x) +def : Pat < + (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), + (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), + (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) +>; + +} // End Predicates = [isCI] + + +//===----------------------------------------------------------------------===// +// Patterns to generate flat for global +//===----------------------------------------------------------------------===// + +def useFlatForGlobal : Predicate < + "Subtarget->useFlatForGlobal() || " + "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">; + +let Predicates = [useFlatForGlobal] in { + +// 1. 
Offset as 20bit DWORD immediate +def : Pat < + (SIload_constant v4i32:$sbase, IMM20bit:$offset), + (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) +>; + +// Patterns for global loads with no offset +class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr)), + (inst $addr, 0, 0, 0) +>; + +def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>; +def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>; +def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>; +def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>; +def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>; + +class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (node vt:$data, i64:$addr), + (inst $data, $addr, 0, 0, 0) +>; + +def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>; +def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>; +def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>; +def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>; +def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>; + +class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < + (vt (node i64:$addr, vt:$data)), + (inst $addr, $data, 0, 0) +>; + +def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; +def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>; + +} // End Predicates = [useFlatForGlobal] diff --git a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td index ba4df82..a6c3785 100644 --- a/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/CaymanInstructions.td @@ -82,6 +82,10 @@ def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>; def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>; def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>; +def RAT_STORE_TYPED_cm: CF_MEM_RAT_STORE_TYPED<0> { + let eop = 0; // This bit is not used on Cayman. 
+} + class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern> : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> { diff --git a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td index 7adcd46..779a14e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -40,6 +40,15 @@ class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name, : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins, "MEM_RAT "#name, pattern>; +class CF_MEM_RAT_STORE_TYPED<bits<1> has_eop> + : CF_MEM_RAT <0x1, ?, (ins R600_Reg128:$rw_gpr, R600_Reg128:$index_gpr, + i32imm:$rat_id, InstFlag:$eop), + "STORE_TYPED RAT($rat_id) $rw_gpr, $index_gpr" + #!if(has_eop, ", $eop", ""), + [(int_r600_rat_store_typed R600_Reg128:$rw_gpr, + R600_Reg128:$index_gpr, + (i32 imm:$rat_id))]>; + def RAT_MSKOR : CF_MEM_RAT <0x11, 0, (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr), "MSKOR $rw_gpr.XW, $index_gpr", @@ -105,6 +114,8 @@ def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf, [(global_store v4i32:$rw_gpr, i32:$index_gpr)] >; +def RAT_STORE_TYPED_eg: CF_MEM_RAT_STORE_TYPED<1>; + } // End usesCustomInserter = 1 class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index e811d5c..a187de8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -283,8 +284,13 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, raw_ostream &O) { O << "4.0"; else if (Imm == DoubleToBits(-4.0)) O << "-4.0"; - else - llvm_unreachable("64-bit literal constants not supported"); + else { + assert(isUInt<32>(Imm)); + + // In rare situations, we will have a 32-bit literal in a 64-bit + // operand. This is technically allowed for the encoding of s_mov_b64. 
+ O << formatHex(static_cast<uint64_t>(Imm)); + } } void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, @@ -592,11 +598,11 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo, } else { unsigned Stream = (SImm16 >> 8) & 0x3; if (Op == 1) - O << "cut"; + O << "cut"; else if (Op == 2) - O << "emit"; + O << "emit"; else if (Op == 3) - O << "emit-cut"; + O << "emit-cut"; O << " stream " << Stream; } O << "), [m0] "; diff --git a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index 14fb511..90541d8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/contrib/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -13,9 +13,7 @@ #ifndef LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H #define LLVM_LIB_TARGET_R600_INSTPRINTER_AMDGPUINSTPRINTER_H -#include "llvm/ADT/StringRef.h" #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" namespace llvm { diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 4434d9b..60e8c8f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -99,14 +99,22 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, case AMDGPU::fixup_si_rodata: { uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - *Dst = Value; - break; - } - - case AMDGPU::fixup_si_end_of_text: { - uint32_t *Dst = (uint32_t*)(Data + Fixup.getOffset()); - // The value points to the last instruction in the text section, so we - // need to add 4 bytes to get to the start of the constants. + // We emit constant data at the end of the text section and generate its + // address using the following code sequence: + // s_getpc_b64 s[0:1] + // s_add_u32 s0, s0, $symbol + // s_addc_u32 s1, s1, 0 + // + // s_getpc_b64 returns the address of the s_add_u32 instruction and then + // the fixup replaces $symbol with a literal constant, which is a + // pc-relative offset from the encoding of the $symbol operand to the + // constant data. + // + // What we want here is an offset from the start of the s_add_u32 + // instruction to the constant data, but since the encoding of $symbol + // starts 4 bytes after the start of the add instruction, we end up + // with an offset that is 4 bytes too small. This requires us to + // add 4 to the fixup value before applying it. *Dst = Value + 4; break; } @@ -136,8 +144,7 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo( const static MCFixupKindInfo Infos[AMDGPU::NumTargetFixupKinds] = { // name offset bits flags { "fixup_si_sopp_br", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, - { "fixup_si_rodata", 0, 32, 0 }, - { "fixup_si_end_of_text", 0, 32, MCFixupKindInfo::FKF_IsPCRel } + { "fixup_si_rodata", 0, 32, MCFixupKindInfo::FKF_IsPCRel } }; if (Kind < FirstTargetFixupKind) diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp new file mode 100644 index 0000000..9ff9fe7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.cpp @@ -0,0 +1,26 @@ +//===-------- AMDGPUELFStreamer.cpp - ELF Object Output -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUELFStreamer.h" +#include "Utils/AMDGPUBaseInfo.h" + +using namespace llvm; + +void AMDGPUELFStreamer::InitSections(bool NoExecStack) { + // Start with the .hsatext section by default. + SwitchSection(AMDGPU::getHSATextSection(getContext())); +} + +MCELFStreamer *llvm::createAMDGPUELFStreamer(MCContext &Context, + MCAsmBackend &MAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, + bool RelaxAll) { + return new AMDGPUELFStreamer(Context, MAB, OS, Emitter); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h new file mode 100644 index 0000000..488d7e7 --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFStreamer.h @@ -0,0 +1,40 @@ +//===-------- AMDGPUELFStreamer.h - ELF Object Output ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is a custom MCELFStreamer which allows us to insert some hooks before +// emitting data into an actual object file. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H +#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUELFSTREAMER_H + +#include "llvm/MC/MCELFStreamer.h" + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCSubtargetInfo; + +class AMDGPUELFStreamer : public MCELFStreamer { +public: + AMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter) + : MCELFStreamer(Context, MAB, OS, Emitter) { } + + virtual void InitSections(bool NoExecStac) override; +}; + +MCELFStreamer *createAMDGPUELFStreamer(MCContext &Context, MCAsmBackend &MAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll); +} // namespace llvm. 
+ +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h index 01021d6..59a9178 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUFixupKinds.h @@ -21,9 +21,6 @@ enum Fixups { /// fixup for global addresses with constant initializers fixup_si_rodata, - /// fixup for offset from instruction to end of text section - fixup_si_end_of_text, - // Marker LastTargetFixupKind, NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 028a86d..68b1d1a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -22,13 +22,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { InlineAsmEnd = ";#ASMEND"; //===--- Data Emission Directives -------------------------------------===// - ZeroDirective = ".zero"; - AsciiDirective = ".ascii\t"; - AscizDirective = ".asciz\t"; - Data8bitsDirective = ".byte\t"; - Data16bitsDirective = ".short\t"; - Data32bitsDirective = ".long\t"; - Data64bitsDirective = ".quad\t"; SunStyleELFSectionSwitchSyntax = true; UsesELFSectionDirectiveForBSS = true; @@ -41,3 +34,10 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT) : MCAsmInfoELF() { //===--- Dwarf Emission Directives -----------------------------------===// SupportsDebugInformation = true; } + +bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const { + return SectionName == ".hsatext" || SectionName == ".hsadata_global_agent" || + SectionName == ".hsadata_global_program" || + SectionName == ".hsarodata_readonly_agent" || + MCAsmInfo::shouldOmitSectionDirective(SectionName); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h index a5bac51..a546961 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -21,12 +21,13 @@ class Triple; // If you need to create another MCAsmInfo class, which inherits from MCAsmInfo, // you will need to make sure your new class sets PrivateGlobalPrefix to -// a prefix that won't appeary in a fuction name. The default value +// a prefix that won't appear in a function name. The default value // for PrivateGlobalPrefix is 'L', so it will consider any function starting // with 'L' as a local symbol. 
class AMDGPUMCAsmInfo : public MCAsmInfoELF { public: explicit AMDGPUMCAsmInfo(const Triple &TT); + bool shouldOmitSectionDirective(StringRef SectionName) const override; }; } // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index c709741..f704094 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUMCTargetDesc.h" +#include "AMDGPUELFStreamer.h" #include "AMDGPUMCAsmInfo.h" #include "AMDGPUTargetStreamer.h" #include "InstPrinter/AMDGPUInstPrinter.h" @@ -85,6 +86,15 @@ static MCTargetStreamer * createAMDGPUObjectTargetStreamer( return new AMDGPUTargetELFStreamer(S); } +static MCStreamer *createMCStreamer(const Triple &T, MCContext &Context, + MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll) { + if (T.getOS() == Triple::AMDHSA) + return createAMDGPUELFStreamer(Context, MAB, OS, Emitter, RelaxAll); + + return createELFStreamer(Context, MAB, OS, Emitter, RelaxAll); +} + extern "C" void LLVMInitializeAMDGPUTargetMC() { for (Target *T : {&TheAMDGPUTarget, &TheGCNTarget}) { RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T); @@ -95,6 +105,7 @@ extern "C" void LLVMInitializeAMDGPUTargetMC() { TargetRegistry::RegisterMCSubtargetInfo(*T, createAMDGPUMCSubtargetInfo); TargetRegistry::RegisterMCInstPrinter(*T, createAMDGPUMCInstPrinter); TargetRegistry::RegisterMCAsmBackend(*T, createAMDGPUAsmBackend); + TargetRegistry::RegisterELFStreamer(*T, createMCStreamer); } // R600 specific registration diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 09e6cb1..b91134d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -13,6 +13,7 @@ #include "AMDGPUTargetStreamer.h" #include "SIDefines.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/Twine.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" @@ -220,6 +221,26 @@ AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { } +void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, + unsigned Type) { + switch (Type) { + default: llvm_unreachable("Invalid AMDGPU symbol type"); + case ELF::STT_AMDGPU_HSA_KERNEL: + OS << "\t.amdgpu_hsa_kernel " << SymbolName << '\n' ; + break; + } +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_module_global " << GlobalName << '\n'; +} + +void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n'; +} + //===----------------------------------------------------------------------===// // AMDGPUTargetELFStreamer //===----------------------------------------------------------------------===// @@ -291,7 +312,35 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { MCStreamer &OS = getStreamer(); OS.PushSection(); - OS.SwitchSection(OS.getContext().getObjectFileInfo()->getTextSection()); + // The MCObjectFileInfo that is available to the assembler is a generic + // implementation and not AMDGPUHSATargetObjectFile, so we 
can't use + // MCObjectFileInfo::getTextSection() here for fetching the HSATextSection. + OS.SwitchSection(AMDGPU::getHSATextSection(OS.getContext())); OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header))); OS.PopSection(); } + +void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, + unsigned Type) { + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(SymbolName)); + Symbol->setType(ELF::STT_AMDGPU_HSA_KERNEL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaModuleScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_LOCAL); +} + +void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal( + StringRef GlobalName) { + + MCSymbolELF *Symbol = cast<MCSymbolELF>( + getStreamer().getContext().getOrCreateSymbol(GlobalName)); + Symbol->setType(ELF::STT_OBJECT); + Symbol->setBinding(ELF::STB_GLOBAL); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index d37677c..83bb728 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -7,6 +7,9 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H +#define LLVM_LIB_TARGET_R600_MCTARGETDESC_AMDGPUTARGETSTREAMER_H + #include "AMDKernelCodeT.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" @@ -27,6 +30,12 @@ public: StringRef ArchName) = 0; virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0; + + virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0; + + virtual void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) = 0; + + virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0; }; class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer { @@ -41,6 +50,12 @@ public: StringRef ArchName) override; void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; + + void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer { @@ -72,6 +87,12 @@ public: void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; + void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; + + void EmitAMDGPUHsaModuleScopeGlobal(StringRef GlobalName) override; + + void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override; }; } +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp index e683498..3c1142d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp @@ -37,7 +37,6 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { const MCRegisterInfo &MRI; public: - R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri) : MCII(mcii), MRI(mri) { } @@ -50,8 +49,8 @@ public: uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const override; -private: 
+private: void EmitByte(unsigned int byte, raw_ostream &OS) const; void Emit(uint32_t value, raw_ostream &OS) const; @@ -59,7 +58,6 @@ private: unsigned getHWRegChan(unsigned reg) const; unsigned getHWReg(unsigned regNo) const; - }; } // End anonymous namespace @@ -83,7 +81,7 @@ enum FCInstr { MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - MCContext &Ctx) { + MCContext &Ctx) { return new R600MCCodeEmitter(MCII, MRI); } diff --git a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index 65a0eeb..9eb3dad 100644 --- a/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -36,7 +36,6 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { void operator=(const SIMCCodeEmitter &) = delete; const MCInstrInfo &MCII; const MCRegisterInfo &MRI; - MCContext &Ctx; /// \brief Can this operand also contain immediate values? bool isSrcOperand(const MCInstrDesc &Desc, unsigned OpNo) const; @@ -47,7 +46,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { public: SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, MCContext &ctx) - : MCII(mcii), MRI(mri), Ctx(ctx) { } + : MCII(mcii), MRI(mri) { } ~SIMCCodeEmitter() override {} @@ -250,17 +249,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, if (MO.isExpr()) { const MCSymbolRefExpr *Expr = cast<MCSymbolRefExpr>(MO.getExpr()); - MCFixupKind Kind; - const MCSymbol *Sym = - Ctx.getOrCreateSymbol(StringRef(END_OF_TEXT_LABEL_NAME)); - - if (&Expr->getSymbol() == Sym) { - // Add the offset to the beginning of the constant values. - Kind = (MCFixupKind)AMDGPU::fixup_si_end_of_text; - } else { - // This is used for constant data stored in .rodata. 
- Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; - } + MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_rodata; Fixups.push_back(MCFixup::create(4, Expr, Kind, MI.getLoc())); } diff --git a/contrib/llvm/lib/Target/AMDGPU/Processors.td b/contrib/llvm/lib/Target/AMDGPU/Processors.td index d9a0723..a1584a2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Processors.td +++ b/contrib/llvm/lib/Target/AMDGPU/Processors.td @@ -142,3 +142,7 @@ def : ProcessorModel<"carrizo", SIQuarterSpeedModel, def : ProcessorModel<"fiji", SIQuarterSpeedModel, [FeatureVolcanicIslands, FeatureISAVersion8_0_1] >; + +def : ProcessorModel<"stoney", SIQuarterSpeedModel, + [FeatureVolcanicIslands, FeatureISAVersion8_0_1] +>; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index c8f37f6..bd80bb2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -405,8 +405,8 @@ private: if (MO.isReg() && MO.isInternalRead()) MO.setIsInternalRead(false); } - getLiteral(BI, Literals); - ClauseContent.push_back(BI); + getLiteral(&*BI, Literals); + ClauseContent.push_back(&*BI); } I = BI; DeleteMI->eraseFromParent(); diff --git a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 4e4d554..124a9c6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -190,6 +190,10 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM, setSchedulingPreference(Sched::Source); } +static inline bool isEOP(MachineBasicBlock::iterator I) { + return std::next(I)->getOpcode() == AMDGPU::RETURN; +} + MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { MachineFunction * MF = BB->getParent(); @@ -276,12 +280,18 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( case AMDGPU::RAT_WRITE_CACHELESS_32_eg: case AMDGPU::RAT_WRITE_CACHELESS_64_eg: case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { - unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; - BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) .addOperand(MI->getOperand(0)) .addOperand(MI->getOperand(1)) - .addImm(EOP); // Set End of program bit + .addImm(isEOP(I)); // Set End of program bit + break; + } + case AMDGPU::RAT_STORE_TYPED_eg: { + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(2)) + .addImm(isEOP(I)); // Set End of program bit break; } @@ -539,7 +549,7 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( } } } - bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0; + bool EOP = isEOP(I); if (!EOP && !isLastInstructionOfItsType) return BB; unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40; @@ -946,6 +956,8 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); SDLoc DL(Op); + + // TODO: Should this propagate fast-math-flags? 
SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, DAG.getNode(ISD::FADD, DL, VT, DAG.getNode(ISD::FMUL, DL, VT, Arg, @@ -1936,6 +1948,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N, Arg->getOperand(0).getOperand(Element)); } } + break; } case ISD::SELECT_CC: { diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index 855fa9f..8b6eea1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -922,7 +922,7 @@ bool R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const{ + BranchProbability Probability) const{ return true; } @@ -933,14 +933,14 @@ R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, MachineBasicBlock &FMBB, unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const { + BranchProbability Probability) const { return true; } bool R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) + BranchProbability Probability) const { return true; } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h index dee4c2b..e7251c3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600InstrInfo.h @@ -174,18 +174,18 @@ namespace llvm { bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override ; + BranchProbability Probability) const override ; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB, unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td index 7beed09..33ef6a4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -1655,7 +1655,7 @@ def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>; // ISel Patterns //===----------------------------------------------------------------------===// -// CND*_INT Pattterns for f32 True / False values +// CND*_INT Patterns for f32 True / False values class CND_INT_f32 <InstR600 cnd, CondCode cc> : Pat < (selectcc i32:$src0, 0, f32:$src1, f32:$src2, cc), diff --git a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 0c06ccc..5efb3b9 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -318,7 +318,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) { MRI = &(Fn.getRegInfo()); for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); MBB != MBBe; ++MBB) { - MachineBasicBlock *MB = MBB; + MachineBasicBlock *MB = &*MBB; PreviousRegSeq.clear(); PreviousRegSeqByReg.clear(); PreviousRegSeqByUndefCount.clear(); diff --git 
a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index deee5bc..2126961 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -81,11 +81,11 @@ private: int LastDstChan = -1; do { bool isTrans = false; - int BISlot = getSlot(BI); + int BISlot = getSlot(&*BI); if (LastDstChan >= BISlot) isTrans = true; LastDstChan = BISlot; - if (TII->isPredicated(BI)) + if (TII->isPredicated(&*BI)) continue; int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write); if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0) @@ -95,7 +95,7 @@ private: continue; } unsigned Dst = BI->getOperand(DstIdx).getReg(); - if (isTrans || TII->isTransOnly(BI)) { + if (isTrans || TII->isTransOnly(&*BI)) { Result[Dst] = AMDGPU::PS; continue; } @@ -149,7 +149,7 @@ private: public: // Ctor. R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI) - : VLIWPacketizerList(MF, MLI, true), + : VLIWPacketizerList(MF, MLI, nullptr), TII(static_cast<const R600InstrInfo *>( MF.getSubtarget().getInstrInfo())), TRI(TII->getRegisterInfo()) { @@ -162,14 +162,14 @@ public: } // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) override { + bool ignorePseudoInstruction(const MachineInstr *MI, + const MachineBasicBlock *MBB) override { return false; } // isSoloInstruction - return true if instruction MI can not be packetized // with any other instruction, which means that MI itself is a packet. - bool isSoloInstruction(MachineInstr *MI) override { + bool isSoloInstruction(const MachineInstr *MI) override { if (TII->isVector(*MI)) return true; if (!TII->isALUInstr(MI->getOpcode())) @@ -375,7 +375,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { // instruction stream until we find the nearest boundary. 
MachineBasicBlock::iterator I = RegionEnd; for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) + if (TII->isSchedulingBoundary(&*std::prev(I), &*MBB, Fn)) break; } I = MBB->begin(); @@ -392,7 +392,7 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { continue; } - Packetizer.PacketizeMIs(MBB, I, RegionEnd); + Packetizer.PacketizeMIs(&*MBB, &*I, RegionEnd); RegionEnd = I; } } diff --git a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h index 9713e60..4f8a129 100644 --- a/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/R600RegisterInfo.h @@ -35,7 +35,7 @@ struct R600RegisterInfo : public AMDGPURegisterInfo { /// \brief get the register class of the specified type to use in the /// CFGStructurizer - const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; + const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const; const RegClassWeight & getRegClassWeight(const TargetRegisterClass *RC) const override; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index ccfbf1b..fa4d24a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -312,11 +312,10 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { if (std::find(Latches.begin(), Latches.end(), *PI) == Latches.end()) Preds.push_back(*PI); } - BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", nullptr, DT, - LI, false); + BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false); } - CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); + CallInst::Create(EndCf, popSaved(), "", &*BB->getFirstInsertionPt()); } /// \brief Annotate the control flow with intrinsics so the backend can diff --git a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h index 4c32639..7f79dd3 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIDefines.h @@ -37,7 +37,8 @@ enum { MIMG = 1 << 18, FLAT = 1 << 19, WQM = 1 << 20, - VGPRSpill = 1 << 21 + VGPRSpill = 1 << 21, + VOPAsmPrefer32Bit = 1 << 22 }; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp index 5fe8d19..636750d 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixControlFlowLiveIntervals.cpp @@ -16,15 +16,9 @@ #include "AMDGPU.h" #include "SIInstrInfo.h" -#include "SIRegisterInfo.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 23502b4..96e37c5 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -82,22 +82,10 @@ using namespace llvm; namespace { class SIFixSGPRCopies : public MachineFunctionPass { - -private: +public: static char ID; - const TargetRegisterClass 
*inferRegClassFromUses(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const; - bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const; -public: - SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { } + SIFixSGPRCopies() : MachineFunctionPass(ID) { } bool runOnMachineFunction(MachineFunction &MF) override; @@ -105,14 +93,23 @@ public: return "SI Fix SGPR copies"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace +INITIALIZE_PASS(SIFixSGPRCopies, DEBUG_TYPE, + "SI Fix SGPR copies", false, false) + char SIFixSGPRCopies::ID = 0; -FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) { - return new SIFixSGPRCopies(tm); +char &llvm::SIFixSGPRCopiesID = SIFixSGPRCopies::ID; + +FunctionPass *llvm::createSIFixSGPRCopiesPass() { + return new SIFixSGPRCopies(); } static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { @@ -128,77 +125,115 @@ static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) { return false; } -/// This functions walks the use list of Reg until it finds an Instruction -/// that isn't a COPY returns the register class of that instruction. -/// \return The register defined by the first non-COPY instruction. -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - - const TargetRegisterClass *RC - = TargetRegisterInfo::isVirtualRegister(Reg) ? - MRI.getRegClass(Reg) : - TRI->getPhysRegClass(Reg); - - RC = TRI->getSubRegClass(RC, SubReg); - for (MachineRegisterInfo::use_instr_iterator - I = MRI.use_instr_begin(Reg), E = MRI.use_instr_end(); I != E; ++I) { - switch (I->getOpcode()) { - case AMDGPU::COPY: - RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI, - I->getOperand(0).getReg(), - I->getOperand(0).getSubReg())); - break; - } - } +static std::pair<const TargetRegisterClass *, const TargetRegisterClass *> +getCopyRegClasses(const MachineInstr &Copy, + const SIRegisterInfo &TRI, + const MachineRegisterInfo &MRI) { + unsigned DstReg = Copy.getOperand(0).getReg(); + unsigned SrcReg = Copy.getOperand(1).getReg(); + + const TargetRegisterClass *SrcRC = + TargetRegisterInfo::isVirtualRegister(SrcReg) ? + MRI.getRegClass(SrcReg) : + TRI.getPhysRegClass(SrcReg); - return RC; + // We don't really care about the subregister here. + // SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg()); + + const TargetRegisterClass *DstRC = + TargetRegisterInfo::isVirtualRegister(DstReg) ? 
+ MRI.getRegClass(DstReg) : + TRI.getPhysRegClass(DstReg); + + return std::make_pair(SrcRC, DstRC); } -const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef( - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI, - unsigned Reg, - unsigned SubReg) const { - if (!TargetRegisterInfo::isVirtualRegister(Reg)) { - const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg); - return TRI->getSubRegClass(RC, SubReg); - } - MachineInstr *Def = MRI.getVRegDef(Reg); - if (Def->getOpcode() != AMDGPU::COPY) { - return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg); - } +static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC, + const TargetRegisterClass *DstRC, + const SIRegisterInfo &TRI) { + return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC); +} - return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(), - Def->getOperand(1).getSubReg()); +static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC, + const TargetRegisterClass *DstRC, + const SIRegisterInfo &TRI) { + return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC); } -bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy, - const SIRegisterInfo *TRI, - const MachineRegisterInfo &MRI) const { +// Distribute an SGPR->VGPR copy of a REG_SEQUENCE into a VGPR REG_SEQUENCE. +// +// SGPRx = ... +// SGPRy = REG_SEQUENCE SGPRx, sub0 ... +// VGPRz = COPY SGPRy +// +// ==> +// +// VGPRx = COPY SGPRx +// VGPRz = REG_SEQUENCE VGPRx, sub0 +// +// This exposes immediate folding opportunities when materializing 64-bit +// immediates. +static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI, + const SIRegisterInfo *TRI, + const SIInstrInfo *TII, + MachineRegisterInfo &MRI) { + assert(MI.isRegSequence()); + + unsigned DstReg = MI.getOperand(0).getReg(); + if (!TRI->isSGPRClass(MRI.getRegClass(DstReg))) + return false; - unsigned DstReg = Copy.getOperand(0).getReg(); - unsigned SrcReg = Copy.getOperand(1).getReg(); - unsigned SrcSubReg = Copy.getOperand(1).getSubReg(); + if (!MRI.hasOneUse(DstReg)) + return false; - if (!TargetRegisterInfo::isVirtualRegister(DstReg)) { - // If the destination register is a physical register there isn't really - // much we can do to fix this. + MachineInstr &CopyUse = *MRI.use_instr_begin(DstReg); + if (!CopyUse.isCopy()) return false; - } - const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg); + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI); - const TargetRegisterClass *SrcRC; + if (!isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) + return false; - if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || - MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass) + // TODO: Could have multiple extracts? + unsigned SubReg = CopyUse.getOperand(1).getSubReg(); + if (SubReg != AMDGPU::NoSubRegister) return false; - SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg); - return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC); + MRI.setRegClass(DstReg, DstRC); + + // SGPRx = ... + // SGPRy = REG_SEQUENCE SGPRx, sub0 ... 
+ // VGPRz = COPY SGPRy + + // => + // VGPRx = COPY SGPRx + // VGPRz = REG_SEQUENCE VGPRx, sub0 + + MI.getOperand(0).setReg(CopyUse.getOperand(0).getReg()); + + for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) { + unsigned SrcReg = MI.getOperand(I).getReg(); + unsigned SrcSubReg = MI.getOperand(I).getReg(); + + const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); + assert(TRI->isSGPRClass(SrcRC) && + "Expected SGPR REG_SEQUENCE to only have SGPR inputs"); + + SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg); + const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC); + + unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC); + + BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), TmpReg) + .addOperand(MI.getOperand(I)); + + MI.getOperand(I).setReg(TmpReg); + } + + CopyUse.eraseFromParent(); + return true; } bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { @@ -207,40 +242,38 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + + SmallVector<MachineInstr *, 16> Worklist; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { MachineBasicBlock &MBB = *BI; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { + I != E; ++I) { MachineInstr &MI = *I; - if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) { - DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n"); - DEBUG(MI.print(dbgs())); - TII->moveToVALU(MI); - - } switch (MI.getOpcode()) { - default: continue; - case AMDGPU::PHI: { - DEBUG(dbgs() << "Fixing PHI: " << MI); - - for (unsigned i = 1; i < MI.getNumOperands(); i += 2) { - const MachineOperand &Op = MI.getOperand(i); - unsigned Reg = Op.getReg(); - const TargetRegisterClass *RC - = inferRegClassFromDef(TRI, MRI, Reg, Op.getSubReg()); + default: + continue; + case AMDGPU::COPY: { + // If the destination register is a physical register there isn't really + // much we can do to fix this. 
+ if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) + continue; - MRI.constrainRegClass(Op.getReg(), RC); - } - unsigned Reg = MI.getOperand(0).getReg(); - const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg, - MI.getOperand(0).getSubReg()); - if (TRI->getCommonSubClass(RC, &AMDGPU::VGPR_32RegClass)) { - MRI.constrainRegClass(Reg, &AMDGPU::VGPR_32RegClass); + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); + if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { + DEBUG(dbgs() << "Fixing VGPR -> SGPR copy: " << MI); + TII->moveToVALU(MI); } + break; + } + case AMDGPU::PHI: { + DEBUG(dbgs() << "Fixing PHI: " << MI); + unsigned Reg = MI.getOperand(0).getReg(); if (!TRI->isSGPRClass(MRI.getRegClass(Reg))) break; @@ -310,8 +343,10 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { } case AMDGPU::REG_SEQUENCE: { if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) || - !hasVGPROperands(MI, TRI)) + !hasVGPROperands(MI, TRI)) { + foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI); continue; + } DEBUG(dbgs() << "Fixing REG_SEQUENCE: " << MI); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp index 0c54446..8bda283 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFixSGPRLiveRanges.cpp @@ -7,9 +7,8 @@ // //===----------------------------------------------------------------------===// // -/// \file -/// SALU instructions ignore control flow, so we need to modify the live ranges -/// of the registers they define in some cases. +/// \file SALU instructions ignore the execution mask, so we need to modify the +/// live ranges of the registers they define in some cases. /// /// The main case we need to handle is when a def is used in one side of a /// branch and not another. For example: @@ -42,13 +41,15 @@ /// ENDIF /// %use /// -/// Adding this use will make the def live thoughout the IF branch, which is +/// Adding this use will make the def live throughout the IF branch, which is /// what we want. 
#include "AMDGPU.h" #include "SIInstrInfo.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachinePostDominators.h" @@ -79,9 +80,13 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<LiveIntervals>(); + AU.addRequired<LiveVariables>(); + AU.addPreserved<LiveVariables>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addPreserved<MachinePostDominatorTree>(); AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -90,7 +95,7 @@ public: INITIALIZE_PASS_BEGIN(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(LiveVariables) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_END(SIFixSGPRLiveRanges, DEBUG_TYPE, "SI Fix SGPR Live Ranges", false, false) @@ -108,40 +113,48 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>( MF.getSubtarget().getRegisterInfo()); - LiveIntervals *LIS = &getAnalysis<LiveIntervals>(); - MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); - std::vector<std::pair<unsigned, LiveRange *>> SGPRLiveRanges; + bool MadeChange = false; + + MachinePostDominatorTree *PDT = &getAnalysis<MachinePostDominatorTree>(); + SmallVector<unsigned, 16> SGPRLiveRanges; + + LiveVariables *LV = &getAnalysis<LiveVariables>(); + MachineBasicBlock *Entry = &MF.front(); - // First pass, collect all live intervals for SGPRs - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { + // Use a depth first order so that in SSA, we encounter all defs before + // uses. Once the defs of the block have been found, attempt to insert + // SGPR_USE instructions in successor blocks if required. + for (MachineBasicBlock *MBB : depth_first(Entry)) { + for (const MachineInstr &MI : *MBB) { for (const MachineOperand &MO : MI.defs()) { - if (MO.isImplicit()) - continue; + // We should never see a live out def of a physical register, so we also + // do not need to worry about implicit_defs(). unsigned Def = MO.getReg(); if (TargetRegisterInfo::isVirtualRegister(Def)) { - if (TRI->isSGPRClass(MRI.getRegClass(Def))) - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getInterval(Def))); - } else if (TRI->isSGPRClass(TRI->getPhysRegClass(Def))) { - SGPRLiveRanges.push_back( - std::make_pair(Def, &LIS->getRegUnit(Def))); + if (TRI->isSGPRClass(MRI.getRegClass(Def))) { + // Only consider defs that are live outs. We don't care about def / + // use within the same block. + + // LiveVariables does not consider registers that are only used in a + // phi in a sucessor block as live out, unlike LiveIntervals. + // + // This is OK because SIFixSGPRCopies replaced any SGPR phis with + // VGPRs. + if (LV->isLiveOut(Def, *MBB)) + SGPRLiveRanges.push_back(Def); + } } } } - } - // Second pass fix the intervals - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - MachineBasicBlock &MBB = *BI; - if (MBB.succ_size() < 2) + if (MBB->succ_size() < 2) continue; - // We have structured control flow, so number of succesors should be two. 
- assert(MBB.succ_size() == 2); - MachineBasicBlock *SuccA = *MBB.succ_begin(); - MachineBasicBlock *SuccB = *(++MBB.succ_begin()); + // We have structured control flow, so the number of successors should be + // two. + assert(MBB->succ_size() == 2); + MachineBasicBlock *SuccA = *MBB->succ_begin(); + MachineBasicBlock *SuccB = *(++MBB->succ_begin()); MachineBasicBlock *NCD = PDT->findNearestCommonDominator(SuccA, SuccB); if (!NCD) @@ -156,37 +169,51 @@ bool SIFixSGPRLiveRanges::runOnMachineFunction(MachineFunction &MF) { NCD = PDT->findNearestCommonDominator(*NCD->succ_begin(), *(++NCD->succ_begin())); } - assert(SuccA && SuccB); - for (std::pair<unsigned, LiveRange*> RegLR : SGPRLiveRanges) { - unsigned Reg = RegLR.first; - LiveRange *LR = RegLR.second; - - // FIXME: We could be smarter here. If the register is Live-In to - // one block, but the other doesn't have any SGPR defs, then there - // won't be a conflict. Also, if the branch decision is based on - // a value in an SGPR, then there will be no conflict. - bool LiveInToA = LIS->isLiveInToMBB(*LR, SuccA); - bool LiveInToB = LIS->isLiveInToMBB(*LR, SuccB); - - if ((!LiveInToA && !LiveInToB) || - (LiveInToA && LiveInToB)) + + for (unsigned Reg : SGPRLiveRanges) { + // FIXME: We could be smarter here. If the register is Live-In to one + // block, but the other doesn't have any SGPR defs, then there won't be a + // conflict. Also, if the branch condition is uniform then there will be + // no conflict. + bool LiveInToA = LV->isLiveIn(Reg, *SuccA); + bool LiveInToB = LV->isLiveIn(Reg, *SuccB); + + if (!LiveInToA && !LiveInToB) { + DEBUG(dbgs() << PrintReg(Reg, TRI, 0) + << " is live into neither successor\n"); continue; + } + + if (LiveInToA && LiveInToB) { + DEBUG(dbgs() << PrintReg(Reg, TRI, 0) + << " is live into both successors\n"); + continue; + } // This interval is live in to one successor, but not the other, so // we need to update its range so it is live in to both. - DEBUG(dbgs() << "Possible SGPR conflict detected " << " in " << *LR << - " BB#" << SuccA->getNumber() << ", BB#" << - SuccB->getNumber() << - " with NCD = " << NCD->getNumber() << '\n'); + DEBUG(dbgs() << "Possible SGPR conflict detected for " + << PrintReg(Reg, TRI, 0) + << " BB#" << SuccA->getNumber() + << ", BB#" << SuccB->getNumber() + << " with NCD = BB#" << NCD->getNumber() << '\n'); + + assert(TargetRegisterInfo::isVirtualRegister(Reg) && + "Not expecting to extend live range of physreg"); // FIXME: Need to figure out how to update LiveRange here so this pass // will be able to preserve LiveInterval analysis. 
- BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), - TII->get(AMDGPU::SGPR_USE)) - .addReg(Reg, RegState::Implicit); - DEBUG(NCD->getFirstNonPHI()->dump()); + MachineInstr *NCDSGPRUse = + BuildMI(*NCD, NCD->getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::SGPR_USE)) + .addReg(Reg, RegState::Implicit); + + MadeChange = true; + LV->HandleVirtRegUse(Reg, NCD, NCDSGPRUse); + + DEBUG(NCDSGPRUse->dump()); } } - return false; + return MadeChange; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index c288725..02a3930 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -45,6 +45,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -164,8 +165,8 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, // Operand is not legal, so try to commute the instruction to // see if this makes it possible to fold. - unsigned CommuteIdx0; - unsigned CommuteIdx1; + unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex; + unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex; bool CanCommute = TII->findCommutedOpIndices(MI, CommuteIdx0, CommuteIdx1); if (CanCommute) { @@ -175,7 +176,16 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, OpNo = CommuteIdx0; } - if (!CanCommute || !TII->commuteInstruction(MI)) + // One of operands might be an Imm operand, and OpNo may refer to it after + // the call of commuteInstruction() below. Such situations are avoided + // here explicitly as OpNo must be a register operand to be a candidate + // for memory folding. + if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() || + !MI->getOperand(CommuteIdx1).isReg())) + return false; + + if (!CanCommute || + !TII->commuteInstruction(MI, false, CommuteIdx0, CommuteIdx1)) return false; if (!TII->isOperandLegal(MI, OpNo, OpToFold)) @@ -186,6 +196,110 @@ static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList, return true; } +static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, + unsigned UseOpIdx, + std::vector<FoldCandidate> &FoldList, + SmallVectorImpl<MachineInstr *> &CopiesToReplace, + const SIInstrInfo *TII, const SIRegisterInfo &TRI, + MachineRegisterInfo &MRI) { + const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx); + + // FIXME: Fold operands with subregs. + if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || + UseOp.isImplicit())) { + return; + } + + bool FoldingImm = OpToFold.isImm(); + APInt Imm; + + if (FoldingImm) { + unsigned UseReg = UseOp.getReg(); + const TargetRegisterClass *UseRC + = TargetRegisterInfo::isVirtualRegister(UseReg) ? + MRI.getRegClass(UseReg) : + TRI.getPhysRegClass(UseReg); + + Imm = APInt(64, OpToFold.getImm()); + + const MCInstrDesc &FoldDesc = TII->get(OpToFold.getParent()->getOpcode()); + const TargetRegisterClass *FoldRC = + TRI.getRegClass(FoldDesc.OpInfo[0].RegClass); + + // Split 64-bit constants into 32-bits for folding. + if (FoldRC->getSize() == 8 && UseOp.getSubReg()) { + if (UseRC->getSize() != 8) + return; + + if (UseOp.getSubReg() == AMDGPU::sub0) { + Imm = Imm.getLoBits(32); + } else { + assert(UseOp.getSubReg() == AMDGPU::sub1); + Imm = Imm.getHiBits(32); + } + } + + // In order to fold immediates into copies, we need to change the + // copy to a MOV. 
+ if (UseMI->getOpcode() == AMDGPU::COPY) { + unsigned DestReg = UseMI->getOperand(0).getReg(); + const TargetRegisterClass *DestRC + = TargetRegisterInfo::isVirtualRegister(DestReg) ? + MRI.getRegClass(DestReg) : + TRI.getPhysRegClass(DestReg); + + unsigned MovOp = TII->getMovOpcode(DestRC); + if (MovOp == AMDGPU::COPY) + return; + + UseMI->setDesc(TII->get(MovOp)); + CopiesToReplace.push_back(UseMI); + } + } + + // Special case for REG_SEQUENCE: We can't fold literals into + // REG_SEQUENCE instructions, so we have to fold them into the + // uses of REG_SEQUENCE. + if (UseMI->getOpcode() == AMDGPU::REG_SEQUENCE) { + unsigned RegSeqDstReg = UseMI->getOperand(0).getReg(); + unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm(); + + for (MachineRegisterInfo::use_iterator + RSUse = MRI.use_begin(RegSeqDstReg), + RSE = MRI.use_end(); RSUse != RSE; ++RSUse) { + + MachineInstr *RSUseMI = RSUse->getParent(); + if (RSUse->getSubReg() != RegSeqDstSubReg) + continue; + + foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList, + CopiesToReplace, TII, TRI, MRI); + } + return; + } + + const MCInstrDesc &UseDesc = UseMI->getDesc(); + + // Don't fold into target independent nodes. Target independent opcodes + // don't have defined register classes. + if (UseDesc.isVariadic() || + UseDesc.OpInfo[UseOpIdx].RegClass == -1) + return; + + if (FoldingImm) { + MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII); + return; + } + + tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII); + + // FIXME: We could try to change the instruction from 64-bit to 32-bit + // to enable more folding opportunites. The shrink operands pass + // already does this. + return; +} + bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const SIInstrInfo *TII = @@ -226,88 +340,36 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { OpToFold.getSubReg())) continue; + + // We need mutate the operands of new mov instructions to add implicit + // uses of EXEC, but adding them invalidates the use_iterator, so defer + // this. + SmallVector<MachineInstr *, 4> CopiesToReplace; + std::vector<FoldCandidate> FoldList; for (MachineRegisterInfo::use_iterator Use = MRI.use_begin(MI.getOperand(0).getReg()), E = MRI.use_end(); Use != E; ++Use) { MachineInstr *UseMI = Use->getParent(); - const MachineOperand &UseOp = UseMI->getOperand(Use.getOperandNo()); - // FIXME: Fold operands with subregs. - if (UseOp.isReg() && ((UseOp.getSubReg() && OpToFold.isReg()) || - UseOp.isImplicit())) { - continue; - } - - APInt Imm; - - if (FoldingImm) { - unsigned UseReg = UseOp.getReg(); - const TargetRegisterClass *UseRC - = TargetRegisterInfo::isVirtualRegister(UseReg) ? - MRI.getRegClass(UseReg) : - TRI.getPhysRegClass(UseReg); - - Imm = APInt(64, OpToFold.getImm()); - - // Split 64-bit constants into 32-bits for folding. - if (UseOp.getSubReg()) { - if (UseRC->getSize() != 8) - continue; - - if (UseOp.getSubReg() == AMDGPU::sub0) { - Imm = Imm.getLoBits(32); - } else { - assert(UseOp.getSubReg() == AMDGPU::sub1); - Imm = Imm.getHiBits(32); - } - } - - // In order to fold immediates into copies, we need to change the - // copy to a MOV. - if (UseMI->getOpcode() == AMDGPU::COPY) { - unsigned DestReg = UseMI->getOperand(0).getReg(); - const TargetRegisterClass *DestRC - = TargetRegisterInfo::isVirtualRegister(DestReg) ? 
- MRI.getRegClass(DestReg) : - TRI.getPhysRegClass(DestReg); - - unsigned MovOp = TII->getMovOpcode(DestRC); - if (MovOp == AMDGPU::COPY) - continue; - - UseMI->setDesc(TII->get(MovOp)); - } - } - - const MCInstrDesc &UseDesc = UseMI->getDesc(); - - // Don't fold into target independent nodes. Target independent opcodes - // don't have defined register classes. - if (UseDesc.isVariadic() || - UseDesc.OpInfo[Use.getOperandNo()].RegClass == -1) - continue; - - if (FoldingImm) { - MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue()); - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &ImmOp, TII); - continue; - } - - tryAddToFoldList(FoldList, UseMI, Use.getOperandNo(), &OpToFold, TII); - - // FIXME: We could try to change the instruction from 64-bit to 32-bit - // to enable more folding opportunites. The shrink operands pass - // already does this. + foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, + CopiesToReplace, TII, TRI, MRI); } + // Make sure we add EXEC uses to any new v_mov instructions created. + for (MachineInstr *Copy : CopiesToReplace) + Copy->addImplicitDefUseOperands(MF); + for (FoldCandidate &Fold : FoldList) { if (updateOperand(Fold, TRI)) { // Clear kill flags. if (!Fold.isImm()) { assert(Fold.OpToFold && Fold.OpToFold->isReg()); - Fold.OpToFold->setIsKill(false); + // FIXME: Probably shouldn't bother trying to fold if not an + // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR + // copies. + MRI.clearKillFlags(Fold.OpToFold->getReg()); } DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " << Fold.UseOpNo << " of " << *Fold.UseMI << '\n'); diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp new file mode 100644 index 0000000..6b3c81c --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -0,0 +1,243 @@ +//===----------------------- SIFrameLowering.cpp --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// + +#include "SIFrameLowering.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/RegisterScavenging.h" + +using namespace llvm; + + +static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo, + const MachineFrameInfo *FrameInfo) { + if (!FuncInfo->hasSpilledSGPRs()) + return false; + + if (FuncInfo->hasSpilledVGPRs()) + return false; + + for (int I = FrameInfo->getObjectIndexBegin(), + E = FrameInfo->getObjectIndexEnd(); I != E; ++I) { + if (!FrameInfo->isSpillSlotObjectIndex(I)) + return false; + } + + return true; +} + +static ArrayRef<MCPhysReg> getAllSGPR128() { + return makeArrayRef(AMDGPU::SReg_128RegClass.begin(), + AMDGPU::SReg_128RegClass.getNumRegs()); +} + +static ArrayRef<MCPhysReg> getAllSGPRs() { + return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), + AMDGPU::SGPR_32RegClass.getNumRegs()); +} + +void SIFrameLowering::emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + if (!MF.getFrameInfo()->hasStackObjects()) + return; + + assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); + + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + // If we only have SGPR spills, we won't actually be using scratch memory + // since these spill to VGPRs. + // + // FIXME: We should be cleaning up these unused SGPR spill frame indices + // somewhere. + if (hasOnlySGPRSpills(MFI, MF.getFrameInfo())) + return; + + const SIInstrInfo *TII = + static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + // We need to insert initialization of the scratch resource descriptor. + unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); + assert(ScratchRsrcReg != AMDGPU::NoRegister); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + assert(ScratchWaveOffsetReg != AMDGPU::NoRegister); + + unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + + unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; + if (ST.isAmdHsaOS()) { + PreloadedPrivateBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + } + + // If we reserved the original input registers, we don't need to copy to the + // reserved registers. + if (ScratchRsrcReg == PreloadedPrivateBufferReg) { + // We should always reserve these 5 registers at the same time. + assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg && + "scratch wave offset and private segment buffer inconsistent"); + return; + } + + + // We added live-ins during argument lowering, but since they were not used + // they were deleted. We're adding the uses now, so add them back. + MachineRegisterInfo &MRI = MF.getRegInfo(); + MRI.addLiveIn(PreloadedScratchWaveOffsetReg); + MBB.addLiveIn(PreloadedScratchWaveOffsetReg); + + if (ST.isAmdHsaOS()) { + MRI.addLiveIn(PreloadedPrivateBufferReg); + MBB.addLiveIn(PreloadedPrivateBufferReg); + } + + // We reserved the last registers for this. Shift it down to the end of those + // which were actually used. + // + // FIXME: It might be safer to use a pseudoregister before replacement. + + // FIXME: We should be able to eliminate unused input registers. 
We only + // cannot do this for the resources required for scratch access. For now we + // skip over user SGPRs and may leave unused holes. + + // We find the resource first because it has an alignment requirement. + if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4; + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) { + // Pick the first unallocated one. Make sure we don't clobber the other + // reserved input we needed. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg)); + MRI.replaceRegWith(ScratchRsrcReg, Reg); + ScratchRsrcReg = Reg; + MFI->setScratchRSrcReg(ScratchRsrcReg); + break; + } + } + } + + if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + // Skip the last 2 elements because the last one is reserved for VCC, and + // this is the 2nd to last element already. + unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); + for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) { + // Pick the first unallocated SGPR. Be careful not to pick an alias of the + // scratch descriptor, since we haven’t added its uses yet. + if (!MRI.isPhysRegUsed(Reg)) { + assert(MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(ScratchRsrcReg, Reg)); + + MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); + ScratchWaveOffsetReg = Reg; + MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + break; + } + } + } + + + assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg)); + + const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL; + + if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) { + // Make sure we emit the copy for the offset first. We may have chosen to copy + // the buffer resource into a register that aliases the input offset register. + BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg) + .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill); + } + + if (ST.isAmdHsaOS()) { + // Insert copies from argument register. + assert( + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) && + !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg)); + + unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1); + unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3); + + unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1); + unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3); + + const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64); + + BuildMI(MBB, I, DL, SMovB64, Rsrc01) + .addReg(Lo, RegState::Kill); + BuildMI(MBB, I, DL, SMovB64, Rsrc23) + .addReg(Hi, RegState::Kill); + } else { + unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); + unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); + + // Use relocations to get the pointer, and setup the other bits manually. 
+ uint64_t Rsrc23 = TII->getScratchRsrcWords23(); + BuildMI(MBB, I, DL, SMovB32, Rsrc0) + .addExternalSymbol("SCRATCH_RSRC_DWORD0") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc1) + .addExternalSymbol("SCRATCH_RSRC_DWORD1") + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc2) + .addImm(Rsrc23 & 0xffffffff) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + + BuildMI(MBB, I, DL, SMovB32, Rsrc3) + .addImm(Rsrc23 >> 32) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } + + // Make the register selected live throughout the function. + for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB == &MBB) + continue; + + OtherBB.addLiveIn(ScratchRsrcReg); + OtherBB.addLiveIn(ScratchWaveOffsetReg); + } +} + +void SIFrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, + RegScavenger *RS) const { + MachineFrameInfo *MFI = MF.getFrameInfo(); + + if (!MFI->hasStackObjects()) + return; + + bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects(); + + assert((RS || !MayNeedScavengingEmergencySlot) && + "RegScavenger required if spilling"); + + if (MayNeedScavengingEmergencySlot) { + int ScavengeFI = MFI->CreateSpillStackObject( + AMDGPU::SGPR_32RegClass.getSize(), + AMDGPU::SGPR_32RegClass.getAlignment()); + RS->addScavengingFrameIndex(ScavengeFI); + } +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h new file mode 100644 index 0000000..a9152fd --- /dev/null +++ b/contrib/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -0,0 +1,34 @@ +//===--------------------- SIFrameLowering.h --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H +#define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H + +#include "AMDGPUFrameLowering.h" + +namespace llvm { + +class SIFrameLowering final : public AMDGPUFrameLowering { +public: + SIFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1) : + AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {} + ~SIFrameLowering() override {} + + void emitPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const override; + + void processFunctionBeforeFrameFinalized( + MachineFunction &MF, + RegScavenger *RS = nullptr) const override; +}; + +} + +#endif diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c2db9ff..0e043cb 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -20,6 +20,7 @@ #include "SIISelLowering.h" #include "AMDGPU.h" +#include "AMDGPUDiagnosticInfoUnsupported.h" #include "AMDGPUIntrinsicInfo.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" @@ -51,6 +52,9 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass); + addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); @@ -103,6 +107,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SETCC, MVT::v4i1, Expand); setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); @@ -155,13 +160,30 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); + + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand); + + setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand); + setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand); + setOperationAction(ISD::LOAD, MVT::i1, Custom); + setOperationAction(ISD::LOAD, MVT::v2i64, Promote); + AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::STORE, MVT::v2i64, Promote); + AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32); + + setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); @@ -173,9 +195,14 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::SELECT, MVT::i1, Promote); + setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); + + + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. 
- for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}) { + for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch(Op) { case ISD::LOAD: @@ -186,6 +213,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, case ISD::INSERT_VECTOR_ELT: case ISD::INSERT_SUBVECTOR: case ISD::EXTRACT_SUBVECTOR: + case ISD::SCALAR_TO_VECTOR: break; case ISD::CONCAT_VECTORS: setOperationAction(Op, VT, Custom); @@ -197,6 +225,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM, } } + // Most operations are naturally 32-bit vector operations. We only support + // load and store of i64 vectors, so promote v2i64 vector operations to v4i32. + for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); + } + if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); @@ -261,6 +305,41 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1); } +bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const { + // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and + // additionally can do r + r + i with addr64. 32-bit has more addressing + // mode options. Depending on the resource constant, it can also do + // (i64 r0) + (i32 r1) * (i14 i). + // + // Private arrays end up using a scratch buffer most of the time, so also + // assume those use MUBUF instructions. Scratch loads / stores are currently + // implemented as mubuf instructions with offen bit set, so slightly + // different than the normal addr64. + if (!isUInt<12>(AM.BaseOffs)) + return false; + + // FIXME: Since we can split immediate into soffset and immediate offset, + // would it make sense to allow any immediate? + + switch (AM.Scale) { + case 0: // r + i or just i, depending on HasBaseReg. + return true; + case 1: + return true; // We have r + r or r + i. + case 2: + if (AM.HasBaseReg) { + // Reject 2 * r + r. + return false; + } + + // Allow 2 * r as r + r + // Or 2 * r + i is allowed as r + r + i. + return true; + default: // Don't allow n * r + return false; + } +} + bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { @@ -269,7 +348,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, return false; switch (AS) { - case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::GLOBAL_ADDRESS: { if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { // Assume the we will use FLAT for all global memory accesses // on VI. @@ -282,51 +361,51 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // because it has never been validated. return isLegalFlatAddressingMode(AM); } - // fall-through - case AMDGPUAS::PRIVATE_ADDRESS: - case AMDGPUAS::CONSTANT_ADDRESS: // XXX - Should we assume SMRD instructions? 
- case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: { - // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and - // additionally can do r + r + i with addr64. 32-bit has more addressing - // mode options. Depending on the resource constant, it can also do - // (i64 r0) + (i32 r1) * (i14 i). - // - // SMRD instructions have an 8-bit, dword offset. - // - // Assume nonunifom access, since the address space isn't enough to know - // what instruction we will use, and since we don't know if this is a load - // or store and scalar stores are only available on VI. - // - // We also know if we are doing an extload, we can't do a scalar load. - // - // Private arrays end up using a scratch buffer most of the time, so also - // assume those use MUBUF instructions. Scratch loads / stores are currently - // implemented as mubuf instructions with offen bit set, so slightly - // different than the normal addr64. - if (!isUInt<12>(AM.BaseOffs)) - return false; - // FIXME: Since we can split immediate into soffset and immediate offset, - // would it make sense to allow any immediate? + return isLegalMUBUFAddressingMode(AM); + } + case AMDGPUAS::CONSTANT_ADDRESS: { + // If the offset isn't a multiple of 4, it probably isn't going to be + // correctly aligned. + if (AM.BaseOffs % 4 != 0) + return isLegalMUBUFAddressingMode(AM); + + // There are no SMRD extloads, so if we have to do a small type access we + // will use a MUBUF load. + // FIXME?: We also need to do this if unaligned, but we don't know the + // alignment here. + if (DL.getTypeStoreSize(Ty) < 4) + return isLegalMUBUFAddressingMode(AM); + + if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + // SMRD instructions have an 8-bit, dword offset on SI. + if (!isUInt<8>(AM.BaseOffs / 4)) + return false; + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { + // On CI+, this can also be a 32-bit literal constant offset. If it fits + // in 8-bits, it can use a smaller encoding. + if (!isUInt<32>(AM.BaseOffs / 4)) + return false; + } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // On VI, these use the SMEM format and the offset is 20-bit in bytes. + if (!isUInt<20>(AM.BaseOffs)) + return false; + } else + llvm_unreachable("unhandled generation"); - switch (AM.Scale) { - case 0: // r + i or just i, depending on HasBaseReg. + if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg. return true; - case 1: - return true; // We have r + r or r + i. - case 2: - if (AM.HasBaseReg) { - // Reject 2 * r + r. - return false; - } - // Allow 2 * r as r + r - // Or 2 * r + i is allowed as r + r + i. + if (AM.Scale == 1 && AM.HasBaseReg) return true; - default: // Don't allow n * r - return false; - } + + return false; } + + case AMDGPUAS::PRIVATE_ADDRESS: + case AMDGPUAS::UNKNOWN_ADDRESS_SPACE: + return isLegalMUBUFAddressingMode(AM); + case AMDGPUAS::LOCAL_ADDRESS: case AMDGPUAS::REGION_ADDRESS: { // Basic, single offset DS instructions allow a 16-bit unsigned immediate @@ -374,7 +453,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte // aligned, 8 byte access in a single operation using ds_read2/write2_b32 // with adjacent offsets. - return Align % 4 == 0; + bool AlignedBy4 = (Align % 4 == 0); + if (IsFast) + *IsFast = AlignedBy4; + return AlignedBy4; } // Smaller than dword value must be aligned. 
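The rewritten isLegalAddressingMode hunk above routes constant-address-space offsets through per-generation SMRD checks (8-bit dword offsets on SI, 32-bit literal dword offsets on CI, 20-bit byte offsets on VI) and otherwise falls back to isLegalMUBUFAddressingMode. Below is a minimal, self-contained C++ sketch of just that offset check; the Generation enum and isLegalSMRDImmOffset helper are illustrative names rather than identifiers from the patch, and the fallback to MUBUF addressing is reduced to returning false for brevity.

    #include <cstdint>

    enum class Generation { SouthernIslands, SeaIslands, VolcanicIslands };

    // Mirrors the per-generation branches in the hunk above: the offset must be
    // a multiple of 4, and the encodable range depends on the subtarget.
    static bool isLegalSMRDImmOffset(Generation Gen, int64_t ByteOffset) {
      if (ByteOffset < 0 || ByteOffset % 4 != 0)
        return false; // the patch falls back to MUBUF addressing in this case
      uint64_t DWordOffset = static_cast<uint64_t>(ByteOffset) / 4;
      switch (Gen) {
      case Generation::SouthernIslands:
        return DWordOffset < (1u << 8);       // 8-bit dword offset on SI
      case Generation::SeaIslands:
        return DWordOffset <= 0xFFFFFFFFull;  // 32-bit literal dword offset on CI
      case Generation::VolcanicIslands:
        return static_cast<uint64_t>(ByteOffset) < (1u << 20); // 20-bit byte offset on VI
      }
      return false;
    }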
@@ -411,6 +493,32 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, return MVT::Other; } +static bool isFlatGlobalAddrSpace(unsigned AS) { + return AS == AMDGPUAS::GLOBAL_ADDRESS || + AS == AMDGPUAS::FLAT_ADDRESS || + AS == AMDGPUAS::CONSTANT_ADDRESS; +} + +bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, + unsigned DestAS) const { + return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS); +} + + +bool SITargetLowering::isMemOpUniform(const SDNode *N) const { + const MemSDNode *MemNode = cast<MemSDNode>(N); + const Value *Ptr = MemNode->getMemOperand()->getValue(); + + // UndefValue means this is a load of a kernel input. These are uniform. + // Sometimes LDS instructions have constant pointers + if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) || + isa<GlobalValue>(Ptr)) + return true; + + const Instruction *I = dyn_cast_or_null<Instruction>(Ptr); + return I && I->getMetadata("amdgpu.uniform"); +} + TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(EVT VT) const { if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) @@ -426,12 +534,6 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return TII->isInlineConstant(Imm); } -static EVT toIntegerVT(EVT VT) { - if (VT.isVector()) - return VT.changeVectorElementTypeToInteger(); - return MVT::getIntegerVT(VT.getSizeInBits()); -} - SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc SL, SDValue Chain, unsigned Offset, bool Signed) const { @@ -439,7 +541,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, MachineFunction &MF = DAG.getMachineFunction(); const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); - unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); + unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); Type *Ty = VT.getTypeForEVT(*DAG.getContext()); @@ -455,30 +557,10 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, unsigned Align = DL.getABITypeAlignment(Ty); - if (VT != MemVT && VT.isFloatingPoint()) { - // Do an integer load and convert. - // FIXME: This is mostly because load legalization after type legalization - // doesn't handle FP extloads. - assert(VT.getScalarType() == MVT::f32 && - MemVT.getScalarType() == MVT::f16); - - EVT IVT = toIntegerVT(VT); - EVT MemIVT = toIntegerVT(MemVT); - SDValue Load = DAG.getLoad(ISD::UNINDEXED, ISD::ZEXTLOAD, - IVT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemIVT, - false, // isVolatile - true, // isNonTemporal - true, // isInvariant - Align); // Alignment - SDValue Ops[] = { - DAG.getNode(ISD::FP16_TO_FP, SL, VT, Load), - Load.getValue(1) - }; - - return DAG.getMergeValues(Ops, SL); - } - ISD::LoadExtType ExtTy = Signed ? 
ISD::SEXTLOAD : ISD::ZEXTLOAD; + if (MemVT.isFloatingPoint()) + ExtTy = ISD::EXTLOAD; + return DAG.getLoad(ISD::UNINDEXED, ExtTy, VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT, false, // isVolatile @@ -497,8 +579,16 @@ SDValue SITargetLowering::LowerFormalArguments( MachineFunction &MF = DAG.getMachineFunction(); FunctionType *FType = MF.getFunction()->getFunctionType(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { + const Function *Fn = MF.getFunction(); + DiagnosticInfoUnsupported NoGraphicsHSA(*Fn, "non-compute shaders with HSA"); + DAG.getContext()->diagnose(NoGraphicsHSA); + return SDValue(); + } - assert(CallConv == CallingConv::C); + // FIXME: We currently assume all calling conventions are kernels. SmallVector<ISD::InputArg, 16> Splits; BitVector Skipped(Ins.size()); @@ -513,7 +603,7 @@ SDValue SITargetLowering::LowerFormalArguments( assert((PSInputNum <= 15) && "Too many PS inputs!"); if (!Arg.Used) { - // We can savely skip PS inputs + // We can safely skip PS inputs Skipped.set(i); ++PSInputNum; continue; @@ -530,7 +620,7 @@ SDValue SITargetLowering::LowerFormalArguments( // We REALLY want the ORIGINAL number of vertex elements here, e.g. a // three or five element vertex only needs three or five registers, - // NOT four or eigth. + // NOT four or eight. Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); unsigned NumElements = ParamType->getVectorNumElements(); @@ -556,41 +646,30 @@ SDValue SITargetLowering::LowerFormalArguments( CCInfo.AllocateReg(AMDGPU::VGPR1); } - // The pointer to the list of arguments is stored in SGPR0, SGPR1 - // The pointer to the scratch buffer is stored in SGPR2, SGPR3 - if (Info->getShaderType() == ShaderType::COMPUTE) { - if (Subtarget->isAmdHsaOS()) - Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers. - else - Info->NumUserSGPRs = 4; - - unsigned InputPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR); - unsigned InputPtrRegLo = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned InputPtrRegHi = - TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1); - - unsigned ScratchPtrReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchPtrRegLo = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 0); - unsigned ScratchPtrRegHi = - TRI->getPhysRegSubReg(ScratchPtrReg, &AMDGPU::SReg_32RegClass, 1); - - CCInfo.AllocateReg(InputPtrRegLo); - CCInfo.AllocateReg(InputPtrRegHi); - CCInfo.AllocateReg(ScratchPtrRegLo); - CCInfo.AllocateReg(ScratchPtrRegHi); - MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); - MF.addLiveIn(ScratchPtrReg, &AMDGPU::SReg_64RegClass); - } - if (Info->getShaderType() == ShaderType::COMPUTE) { getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, Splits); } + // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
+ if (Info->hasPrivateSegmentBuffer()) { + unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI); + MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass); + CCInfo.AllocateReg(PrivateSegmentBufferReg); + } + + if (Info->hasDispatchPtr()) { + unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI); + MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(DispatchPtrReg); + } + + if (Info->hasKernargSegmentPtr()) { + unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI); + MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass); + CCInfo.AllocateReg(InputPtrReg); + } + AnalyzeFormalArguments(CCInfo, Splits); SmallVector<SDValue, 16> Chains; @@ -617,7 +696,7 @@ SDValue SITargetLowering::LowerFormalArguments( Offset, Ins[i].Flags.isSExt()); Chains.push_back(Arg.getValue(1)); - const PointerType *ParamTy = + auto *ParamTy = dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { @@ -678,10 +757,113 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (Info->getShaderType() != ShaderType::COMPUTE) { - unsigned ScratchIdx = CCInfo.getFirstUnallocated(ArrayRef<MCPhysReg>( - AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs())); - Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx); + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. + + // Start adding system SGPRs. + if (Info->hasWorkGroupIDX()) { + unsigned Reg = Info->addWorkGroupIDX(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("work group id x is always enabled"); + + if (Info->hasWorkGroupIDY()) { + unsigned Reg = Info->addWorkGroupIDY(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupIDZ()) { + unsigned Reg = Info->addWorkGroupIDZ(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkGroupInfo()) { + unsigned Reg = Info->addWorkGroupInfo(); + MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasPrivateSegmentWaveByteOffset()) { + // Scratch wave offset passed in system SGPR. + unsigned PrivateSegmentWaveByteOffsetReg + = Info->addPrivateSegmentWaveByteOffset(); + + MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg); + } + + // Now that we've figured out where the scratch register inputs are, see if + // should reserve the arguments and use them directly. + + bool HasStackObjects = MF.getFrameInfo()->hasStackObjects(); + + if (ST.isAmdHsaOS()) { + // TODO: Assume we will spill without optimizations. + if (HasStackObjects) { + // If we have stack objects, we unquestionably need the private buffer + // resource. For the HSA ABI, this will be the first 4 user SGPR + // inputs. We can reserve those and use them directly. 
+ + unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER); + Info->setScratchRSrcReg(PrivateSegmentBufferReg); + + unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg); + } else { + unsigned ReservedBufferReg + = TRI->reservedPrivateSegmentBufferReg(MF); + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + + // We tentatively reserve the last registers (skipping the last two + // which may contain VCC). After register allocation, we'll replace + // these with the ones immediately after those which were really + // allocated. In the prologue copies will be inserted from the argument + // to these reserved registers. + Info->setScratchRSrcReg(ReservedBufferReg); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } else { + unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF); + + // Without HSA, relocations are used for the scratch pointer and the + // buffer resource setup is always inserted in the prologue. Scratch wave + // offset is still in an input SGPR. + Info->setScratchRSrcReg(ReservedBufferReg); + + if (HasStackObjects) { + unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue( + MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); + Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg); + } else { + unsigned ReservedOffsetReg + = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); + Info->setScratchWaveOffsetReg(ReservedOffsetReg); + } + } + + if (Info->hasWorkItemIDX()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } else + llvm_unreachable("workitem id x should always be enabled"); + + if (Info->hasWorkItemIDY()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } + + if (Info->hasWorkItemIDZ()) { + unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z); + MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + CCInfo.AllocateReg(Reg); } if (Chains.empty()) @@ -693,27 +875,11 @@ SDValue SITargetLowering::LowerFormalArguments( MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( MachineInstr * MI, MachineBasicBlock * BB) const { - MachineBasicBlock::iterator I = *MI; - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - switch (MI->getOpcode()) { default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); case AMDGPU::BRANCH: return BB; - case AMDGPU::SI_RegisterStorePseudo: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - MachineInstrBuilder MIB = - BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore), - Reg); - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) - MIB.addOperand(MI->getOperand(i)); - - MI->eraseFromParent(); - break; - } } return BB; } @@ -944,20 +1110,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, const GlobalValue *GV = GSD->getGlobal(); MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace()); - SDValue Ptr = DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); - - SDValue PtrLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, 
Ptr, - DAG.getConstant(0, DL, MVT::i32)); - SDValue PtrHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Ptr, - DAG.getConstant(1, DL, MVT::i32)); - - SDValue Lo = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrLo, GA); - SDValue Hi = DAG.getNode(ISD::ADDE, DL, DAG.getVTList(MVT::i32, MVT::Glue), - PtrHi, DAG.getConstant(0, DL, MVT::i32), - SDValue(Lo.getNode(), 1)); - return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); + return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA); } SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, @@ -977,6 +1131,18 @@ SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, // a glue result. } +SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, + SDValue Op, + MVT VT, + unsigned Offset) const { + SDLoc SL(Op); + SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL, + DAG.getEntryNode(), Offset, false); + // The local size values will have the hi 16-bits as zero. + return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, + DAG.getValueType(VT)); +} + SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -988,7 +1154,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc DL(Op); unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + // TODO: Should this propagate fast-math-flags? + switch (IntrinsicID) { + case Intrinsic::amdgcn_dispatch_ptr: + return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, + TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT); + case Intrinsic::r600_read_ngroups_x: return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_X, false); @@ -1008,37 +1180,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_X, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Y, false); + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::LOCAL_SIZE_Z, false); - + return lowerImplicitZextParam(DAG, Op, MVT::i16, + SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::AMDGPU_read_workdim: - return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - getImplicitParameterOffset(MFI, GRID_DIM), false); - + // Really only 2 bits. 
+ return lowerImplicitZextParam(DAG, Op, MVT::i8, + getImplicitParameterOffset(MFI, GRID_DIM)); case Intrinsic::r600_read_tgid_x: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT); case Intrinsic::r600_read_tgid_y: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT); case Intrinsic::r600_read_tgid_z: return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TGID_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT); case Intrinsic::r600_read_tidig_x: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_X), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT); case Intrinsic::r600_read_tidig_y: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Y), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT); case Intrinsic::r600_read_tidig_z: return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass, - TRI->getPreloadedValue(MF, SIRegisterInfo::TIDIG_Z), VT); + TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT); case AMDGPUIntrinsic::SI_load_const: { SDValue Ops[] = { Op.getOperand(1), @@ -1077,6 +1248,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(2, DL, MVT::i32), // P0 Op.getOperand(1), Op.getOperand(2), Glue); } + case AMDGPUIntrinsic::SI_packf16: + if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) + return DAG.getUNDEF(MVT::i32); + return Op; case AMDGPUIntrinsic::SI_fs_interp: { SDValue IJ = Op.getOperand(4); SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, @@ -1092,6 +1267,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, Op.getOperand(1), Op.getOperand(2), Glue); } + case Intrinsic::amdgcn_interp_p1: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); + SDValue Glue = M0.getValue(1); + return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Glue); + } + case Intrinsic::amdgcn_interp_p2: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); + SDValue Glue = SDValue(M0.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), + Glue); + } default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); } @@ -1152,16 +1340,29 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { "Custom lowering for non-i32 vectors hasn't been implemented."); unsigned NumElements = Op.getValueType().getVectorNumElements(); assert(NumElements != 2 && "v2 loads are supported for all address spaces."); + switch (Load->getAddressSpace()) { default: break; + case AMDGPUAS::CONSTANT_ADDRESS: + if (isMemOpUniform(Load)) + break; + // Non-uniform loads will be selected to MUBUF instructions, so they + // have the same legalization requires ments as global and private + // loads. + // + // Fall-through case AMDGPUAS::GLOBAL_ADDRESS: case AMDGPUAS::PRIVATE_ADDRESS: + if (NumElements >= 8) + return SplitVectorLoad(Op, DAG); + // v4 loads are supported for private and global memory. 
if (NumElements <= 4) break; // fall-through case AMDGPUAS::LOCAL_ADDRESS: - return ScalarizeVectorLoad(Op, DAG); + // If properly aligned, if we split we might be able to use ds_read_b64. + return SplitVectorLoad(Op, DAG); } } @@ -1236,8 +1437,10 @@ SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const { if (Unsafe) { // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) + SDNodeFlags Flags; + Flags.setUnsafeAlgebra(true); SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags); } return SDValue(); @@ -1274,6 +1477,8 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One); + // TODO: Should this propagate fast-math-flags? + r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3); SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1); @@ -1379,7 +1584,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return Ret; if (VT.isVector() && VT.getVectorNumElements() >= 8) - return ScalarizeVectorStore(Op, DAG); + return SplitVectorStore(Op, DAG); if (VT == MVT::i1) return DAG.getTruncStore(Store->getChain(), DL, @@ -1393,6 +1598,7 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue Arg = Op.getOperand(0); + // TODO: Should this propagate fast-math-flags? SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT, DAG.getNode(ISD::FMUL, DL, VT, Arg, DAG.getConstantFP(0.5/M_PI, DL, @@ -2125,9 +2331,14 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); - TII->legalizeOperands(MI); - if (TII->isMIMG(MI->getOpcode())) { + if (TII->isVOP3(MI->getOpcode())) { + // Make sure constant bus requirements are respected. + TII->legalizeOperandsVOP3(MRI, MI); + return; + } + + if (TII->isMIMG(*MI)) { unsigned VReg = MI->getOperand(0).getReg(); unsigned Writemask = MI->getOperand(1).getImm(); unsigned BitsSet = 0; @@ -2169,53 +2380,38 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG, SDLoc DL, SDValue Ptr) const { const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); -#if 1 - // XXX - Workaround for moveToVALU not handling different register class - // inserts for REG_SEQUENCE. - - // Build the half of the subregister with the constants. - const SDValue Ops0[] = { - DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) - }; - - SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, - MVT::v2i32, Ops0), 0); - - // Combine the constants and the pointer. - const SDValue Ops1[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), - SubRegHi, - DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) - }; + static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); + + // Build the half of the subregister with the constants before building the + // full 128-bit register. 
If we are building multiple resource descriptors, + // this will allow CSEing of the 2-component register. + const SDValue Ops0[] = { + DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32), + buildSMovImm32(DAG, DL, 0), + DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), + DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32) + }; - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); -#else - const SDValue Ops[] = { - DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32), - Ptr, - DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32), - buildSMovImm32(DAG, DL, 0), - DAG.getTargetConstant(AMDGPU::sub2, MVT::i32), - buildSMovImm32(DAG, DL, TII->getDefaultRsrcFormat() >> 32), - DAG.getTargetConstant(AMDGPU::sub3, MVT::i32) - }; + SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, + MVT::v2i32, Ops0), 0); - return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); + // Combine the constants and the pointer. + const SDValue Ops1[] = { + DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32), + Ptr, + DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), + SubRegHi, + DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32) + }; -#endif + return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1); } /// \brief Return a resource descriptor with the 'Add TID' bit enabled -/// The TID (Thread ID) is multipled by the stride value (bits [61:48] -/// of the resource descriptor) to create an offset, which is added to the -/// resource ponter. +/// The TID (Thread ID) is multiplied by the stride value (bits [61:48] +/// of the resource descriptor) to create an offset, which is added to +/// the resource pointer. MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, SDLoc DL, SDValue Ptr, @@ -2248,15 +2444,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops); } -MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const { - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo()); - - return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23()); -} - SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, unsigned Reg, EVT VT) const { @@ -2274,13 +2461,41 @@ std::pair<unsigned, const TargetRegisterClass *> SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { - if (Constraint == "r") { - switch(VT.SimpleTy) { - default: llvm_unreachable("Unhandled type for 'r' inline asm constraint"); - case MVT::i64: - return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); - case MVT::i32: + + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 's': + case 'r': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: return std::make_pair(0U, &AMDGPU::SGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::SGPR_64RegClass); + case 128: + return std::make_pair(0U, &AMDGPU::SReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::SReg_256RegClass); + } + + case 'v': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: + return std::make_pair(0U, &AMDGPU::VGPR_32RegClass); + case 64: + return std::make_pair(0U, &AMDGPU::VReg_64RegClass); + case 96: + return std::make_pair(0U, &AMDGPU::VReg_96RegClass); + case 
128: + return std::make_pair(0U, &AMDGPU::VReg_128RegClass); + case 256: + return std::make_pair(0U, &AMDGPU::VReg_256RegClass); + case 512: + return std::make_pair(0U, &AMDGPU::VReg_512RegClass); + } } } @@ -2301,3 +2516,16 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } + +SITargetLowering::ConstraintType +SITargetLowering::getConstraintType(StringRef Constraint) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 's': + case 'v': + return C_RegisterClass; + } + } + return TargetLowering::getConstraintType(Constraint); +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h index d84c32e..e2f8cb1 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -28,6 +28,9 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const override; + SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, + MVT VT, unsigned Offset) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; @@ -57,6 +60,7 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const; bool isLegalFlatAddressingMode(const AddrMode &AM) const; + bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; public: SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI); @@ -76,6 +80,9 @@ public: bool MemcpyStrSrc, MachineFunction &MF) const override; + bool isMemOpUniform(const SDNode *N) const; + bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; + TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; @@ -112,13 +119,10 @@ public: SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const; - MachineSDNode *buildScratchRSRC(SelectionDAG &DAG, - SDLoc DL, - SDValue Ptr) const; - std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; + ConstraintType getConstraintType(StringRef Constraint) const override; SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL, SDValue V) const; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp index 90a37f1..821aada 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp @@ -91,7 +91,8 @@ private: bool isOpRelevant(MachineOperand &Op); /// \brief Get register interval an operand affects. 
- RegInterval getRegInterval(MachineOperand &Op); + RegInterval getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const; /// \brief Handle instructions async components void pushInstruction(MachineBasicBlock &MBB, @@ -121,9 +122,13 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; const char *getPassName() const override { - return "SI insert wait instructions"; + return "SI insert wait instructions"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace @@ -138,9 +143,8 @@ FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) { } Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { - - uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags; - Counters Result; + uint64_t TSFlags = MI.getDesc().TSFlags; + Counters Result = { { 0, 0, 0 } }; Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT); @@ -151,15 +155,22 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { // LGKM may uses larger values if (TSFlags & SIInstrFlags::LGKM_CNT) { - if (TII->isSMRD(MI.getOpcode())) { - - MachineOperand &Op = MI.getOperand(0); - assert(Op.isReg() && "First LGKM operand must be a register!"); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - Result.Named.LGKM = Size > 4 ? 2 : 1; - + if (TII->isSMRD(MI)) { + + if (MI.getNumOperands() != 0) { + assert(MI.getOperand(0).isReg() && + "First LGKM operand must be a register!"); + + // XXX - What if this is a write into a super register? + const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0); + unsigned Size = RC->getSize(); + Result.Named.LGKM = Size > 4 ? 2 : 1; + } else { + // s_dcache_inv etc. do not have a a destination register. Assume we + // want a wait on these. + // XXX - What is the right value? + Result.Named.LGKM = 1; + } } else { // DS Result.Named.LGKM = 1; @@ -173,9 +184,8 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) { } bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { - // Constants are always irrelevant - if (!Op.isReg()) + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) return false; // Defines are always relevant @@ -196,7 +206,7 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { // operand comes before the value operand and it may have // multiple data operands. 
- if (TII->isDS(MI.getOpcode())) { + if (TII->isDS(MI)) { MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::data); if (Data && Op.isIdenticalTo(*Data)) return true; @@ -224,18 +234,13 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) { return false; } -RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) { - - if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) - return std::make_pair(0, 0); - - unsigned Reg = Op.getReg(); - unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize(); - +RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC, + const MachineOperand &Reg) const { + unsigned Size = RC->getSize(); assert(Size >= 4); RegInterval Result; - Result.first = TRI->getEncodingValue(Reg); + Result.first = TRI->getEncodingValue(Reg.getReg()); Result.second = Result.first + Size / 4; return Result; @@ -246,10 +251,13 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, // Get the hardware counter increments and sum them up Counters Increment = getHwCounts(*I); + Counters Limit = ZeroCounts; unsigned Sum = 0; for (unsigned i = 0; i < 3; ++i) { LastIssued.Array[i] += Increment.Array[i]; + if (Increment.Array[i]) + Limit.Array[i] = LastIssued.Array[i]; Sum += Increment.Array[i]; } @@ -261,7 +269,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, if (MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { - // Any occurence of consecutive VMEM or SMEM instructions forms a VMEM + // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM // or SMEM clause, respectively. // // The temporary workaround is to break the clauses with S_NOP. @@ -270,7 +278,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, // and destination registers don't overlap, e.g. this is illegal: // r0 = load r2 // r2 = load r0 - if ((LastOpcodeType == SMEM && TII->isSMRD(I->getOpcode())) || + if ((LastOpcodeType == SMEM && TII->isSMRD(*I)) || (LastOpcodeType == VMEM && Increment.Named.VM)) { // Insert a NOP to break the clause. BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)) @@ -278,7 +286,7 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, LastInstWritesM0 = false; } - if (TII->isSMRD(I->getOpcode())) + if (TII->isSMRD(*I)) LastOpcodeType = SMEM; else if (Increment.Named.VM) LastOpcodeType = VMEM; @@ -290,21 +298,21 @@ void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB, } for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { - MachineOperand &Op = I->getOperand(i); if (!isOpRelevant(Op)) continue; - RegInterval Interval = getRegInterval(Op); + const TargetRegisterClass *RC = TII->getOpRegClass(*I, i); + RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { // Remember which registers we define if (Op.isDef()) - DefinedRegs[j] = LastIssued; + DefinedRegs[j] = Limit; // and which one we are using if (Op.isUse()) - UsedRegs[j] = LastIssued; + UsedRegs[j] = Limit; } } } @@ -390,12 +398,18 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { if (MI.getOpcode() == AMDGPU::S_SENDMSG) return LastIssued; - // For each register affected by this - // instruction increase the result sequence + // For each register affected by this instruction increase the result + // sequence. + // + // TODO: We could probably just look at explicit operands if we removed VCC / + // EXEC from SMRD dest reg classes. 
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &Op = MI.getOperand(i); - RegInterval Interval = getRegInterval(Op); + if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg())) + continue; + + const TargetRegisterClass *RC = TII->getOpRegClass(MI, i); + RegInterval Interval = getRegInterval(RC, Op); for (unsigned j = Interval.first; j < Interval.second; ++j) { if (Op.isDef()) { diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 211666a..0e883f6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -41,6 +41,10 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : field bits<1> WQM = 0; field bits<1> VGPRSpill = 0; + // This bit tells the assembler to use the 32-bit encoding in case it + // is unable to infer the encoding from the operands. + field bits<1> VOPAsmPrefer32Bit = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = VM_CNT; let TSFlags{1} = EXP_CNT; @@ -68,10 +72,8 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : let TSFlags{19} = FLAT; let TSFlags{20} = WQM; let TSFlags{21} = VGPRSpill; + let TSFlags{22} = VOPAsmPrefer32Bit; - // Most instructions require adjustments after selection to satisfy - // operand requirements. - let hasPostISelHook = 1; let SchedRW = [Write32Bit]; } @@ -86,7 +88,6 @@ class Enc64 { } class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">; -def VOPDstVCC : VOPDstOperand <VCCReg>; let Uses = [EXEC] in { @@ -101,11 +102,11 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> : } class VOPCCommon <dag ins, string asm, list<dag> pattern> : - VOPAnyCommon <(outs VOPDstVCC:$dst), ins, asm, pattern> { + VOPAnyCommon <(outs), ins, asm, pattern> { - let DisableEncoding = "$dst"; let VOPC = 1; let Size = 4; + let Defs = [VCC]; } class VOP1Common <dag outs, dag ins, string asm, list<dag> pattern> : @@ -138,6 +139,11 @@ class VOP3Common <dag outs, dag ins, string asm, list<dag> pattern> : let isCodeGenOnly = 0; int Size = 8; + + // Because SGPRs may be allowed if there are multiple operands, we + // need a post-isel hook to insert copies in order to avoid + // violating constant bus requirements. 
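The SIInstrFormats.td hunk above allocates TSFlags bit 22 for VOPAsmPrefer32Bit and repeats the note that the .td bit layout must stay in sync with the C++ SIInstrFlags enum. A hedged illustration of that pattern (the enum below only mirrors the layout shown in the hunk; the real definitions live elsewhere in the backend):

#include <cstdint>

namespace SketchFlags {
  enum : std::uint64_t {
    VM_CNT            = std::uint64_t(1) << 0,
    EXP_CNT           = std::uint64_t(1) << 1,
    // ... intervening bits elided ...
    VOPAsmPrefer32Bit = std::uint64_t(1) << 22   // matches let TSFlags{22} above
  };
}

// A consumer (for example, an assembler choosing between encodings) would
// simply test the bit on the instruction's TSFlags word:
bool prefers32BitEncoding(std::uint64_t TSFlags) {
  return (TSFlags & SketchFlags::VOPAsmPrefer32Bit) != 0;
}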
+ let hasPostISelHook = 1; } } // End Uses = [EXEC] @@ -222,6 +228,20 @@ class SMRDe <bits<5> op, bits<1> imm> : Enc32 { let Inst{31-27} = 0x18; //encoding } +class SMRD_IMMe_ci <bits<5> op> : Enc64 { + bits<7> sdst; + bits<7> sbase; + bits<32> offset; + + let Inst{7-0} = 0xff; + let Inst{8} = 0; + let Inst{14-9} = sbase{6-1}; + let Inst{21-15} = sdst; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding + let Inst{63-32} = offset; +} + let SchedRW = [WriteSALU] in { class SOP1 <dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern> { @@ -249,13 +269,13 @@ class SOP2 <dag outs, dag ins, string asm, list<dag> pattern> : class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : InstSI<outs, ins, asm, pattern>, SOPCe <op> { - let DisableEncoding = "$dst"; let mayLoad = 0; let mayStore = 0; let hasSideEffects = 0; let SALU = 1; let SOPC = 1; let isCodeGenOnly = 0; + let Defs = [SCC]; let UseNamedOperandTable = 1; } @@ -598,15 +618,13 @@ class VINTRPCommon <dag outs, dag ins, string asm, list<dag> pattern> : // Vector I/O operations //===----------------------------------------------------------------------===// -let Uses = [EXEC] in { - class DS <dag outs, dag ins, string asm, list<dag> pattern> : InstSI <outs, ins, asm, pattern> { let LGKM_CNT = 1; let DS = 1; let UseNamedOperandTable = 1; - let Uses = [M0]; + let Uses = [M0, EXEC]; // Most instruction load and store data, so set this as the default. let mayLoad = 1; @@ -623,6 +641,7 @@ class MUBUF <dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MUBUF = 1; + let Uses = [EXEC]; let hasSideEffects = 0; let UseNamedOperandTable = 1; @@ -636,6 +655,7 @@ class MTBUF <dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MTBUF = 1; + let Uses = [EXEC]; let hasSideEffects = 0; let UseNamedOperandTable = 1; @@ -665,9 +685,7 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : let VM_CNT = 1; let EXP_CNT = 1; let MIMG = 1; + let Uses = [EXEC]; let hasSideEffects = 0; // XXX ???? 
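The new SMRD_IMMe_ci class above describes a 64-bit encoding whose upper dword is a 32-bit literal offset. A standalone sketch that packs the same fields according to the Inst{...} assignments in the hunk (this is only an illustration of the bit layout, not the MC encoder):

#include <cstdint>

std::uint64_t encodeSMRDImmCISketch(unsigned op, unsigned sbase, unsigned sdst,
                                    std::uint32_t offset) {
  std::uint64_t Inst = 0;
  Inst |= 0xffull;                                    // Inst{7-0}   = 0xff
                                                      // Inst{8}     = 0
  Inst |= (std::uint64_t(sbase >> 1) & 0x3f) << 9;    // Inst{14-9}  = sbase{6-1}
  Inst |= (std::uint64_t(sdst) & 0x7f) << 15;         // Inst{21-15} = sdst
  Inst |= (std::uint64_t(op) & 0x1f) << 22;           // Inst{26-22} = op
  Inst |= 0x18ull << 27;                              // Inst{31-27} = 0x18
  Inst |= std::uint64_t(offset) << 32;                // Inst{63-32} = offset
  return Inst;
}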
} - - -} // End Uses = [EXEC] diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index cfd2c42..a08a5a8 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -82,6 +82,7 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, switch (MI->getOpcode()) { case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: + case AMDGPU::V_MOV_B64_PSEUDO: return true; default: return false; @@ -204,7 +205,8 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, unsigned &Offset, const TargetRegisterInfo *TRI) const { unsigned Opc = LdSt->getOpcode(); - if (isDS(Opc)) { + + if (isDS(*LdSt)) { const MachineOperand *OffsetImm = getNamedOperand(*LdSt, AMDGPU::OpName::offset); if (OffsetImm) { @@ -254,7 +256,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, return false; } - if (isMUBUF(Opc) || isMTBUF(Opc)) { + if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) { if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1) return false; @@ -270,7 +272,7 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, return true; } - if (isSMRD(Opc)) { + if (isSMRD(*LdSt)) { const MachineOperand *OffsetImm = getNamedOperand(*LdSt, AMDGPU::OpName::offset); if (!OffsetImm) @@ -289,20 +291,18 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, unsigned NumLoads) const { - unsigned Opc0 = FirstLdSt->getOpcode(); - unsigned Opc1 = SecondLdSt->getOpcode(); - // TODO: This needs finer tuning if (NumLoads > 4) return false; - if (isDS(Opc0) && isDS(Opc1)) + if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) return true; - if (isSMRD(Opc0) && isSMRD(Opc1)) + if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) return true; - if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) + if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) && + (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt))) return true; return false; @@ -323,28 +323,45 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11, - AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0 + AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, + }; + + static const int16_t Sub0_15_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, + AMDGPU::sub8_sub9, AMDGPU::sub10_sub11, + AMDGPU::sub12_sub13, AMDGPU::sub14_sub15, }; static const int16_t Sub0_7[] = { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, - AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0 + AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, + }; + + static const int16_t Sub0_7_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, + AMDGPU::sub4_sub5, AMDGPU::sub6_sub7, }; static const int16_t Sub0_3[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0 + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, + }; + + static const int16_t Sub0_3_64[] = { + AMDGPU::sub0_sub1, AMDGPU::sub2_sub3, }; static const int16_t Sub0_2[] = { - AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0 + AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, }; static const int16_t Sub0_1[] = { - AMDGPU::sub0, AMDGPU::sub1, 0 + AMDGPU::sub0, AMDGPU::sub1, }; unsigned Opcode; - 
const int16_t *SubIndices; + ArrayRef<int16_t> SubIndices; + bool Forward; if (AMDGPU::SReg_32RegClass.contains(DestReg)) { assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); @@ -360,7 +377,7 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else { // FIXME: Hack until VReg_1 removed. assert(AMDGPU::VGPR_32RegClass.contains(SrcReg)); - BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32), AMDGPU::VCC) + BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32)) .addImm(0) .addReg(SrcReg, getKillRegState(KillSrc)); } @@ -375,18 +392,18 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) { assert(AMDGPU::SReg_128RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_3; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_3_64; } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) { assert(AMDGPU::SReg_256RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_7; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_7_64; } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) { assert(AMDGPU::SReg_512RegClass.contains(SrcReg)); - Opcode = AMDGPU::S_MOV_B32; - SubIndices = Sub0_15; + Opcode = AMDGPU::S_MOV_B64; + SubIndices = Sub0_15_64; } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) { assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || @@ -428,13 +445,27 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, llvm_unreachable("Can't copy register!"); } - while (unsigned SubIdx = *SubIndices++) { + if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg)) + Forward = true; + else + Forward = false; + + for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { + unsigned SubIdx; + if (Forward) + SubIdx = SubIndices[Idx]; + else + SubIdx = SubIndices[SubIndices.size() - Idx - 1]; + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)); - Builder.addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc)); + Builder.addReg(RI.getSubReg(SrcReg, SubIdx)); + + if (Idx == SubIndices.size() - 1) + Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit); - if (*SubIndices) + if (Idx == 0) Builder.addReg(DestReg, RegState::Define | RegState::Implicit); } } @@ -471,6 +502,40 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { return AMDGPU::COPY; } +static unsigned getSGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_S32_SAVE; + case 8: + return AMDGPU::SI_SPILL_S64_SAVE; + case 16: + return AMDGPU::SI_SPILL_S128_SAVE; + case 32: + return AMDGPU::SI_SPILL_S256_SAVE; + case 64: + return AMDGPU::SI_SPILL_S512_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getVGPRSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_V32_SAVE; + case 8: + return AMDGPU::SI_SPILL_V64_SAVE; + case 16: + return AMDGPU::SI_SPILL_V128_SAVE; + case 32: + return AMDGPU::SI_SPILL_V256_SAVE; + case 64: + return AMDGPU::SI_SPILL_V512_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, @@ -481,47 +546,83 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; + + unsigned Size = FrameInfo->getObjectSize(FrameIndex); + unsigned Align = 
FrameInfo->getObjectAlignment(FrameIndex); + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, FrameIndex); + MachineMemOperand *MMO + = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, + Size, Align); if (RI.isSGPRClass(RC)) { + MFI->setHasSpilledSGPRs(); + // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling // SGPRs. - switch (RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - MFI->setHasSpilledVGPRs(); - - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; - } + unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize()); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) // src + .addFrameIndex(FrameIndex) // frame_idx + .addMemOperand(MMO); + + return; } - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); - BuildMI(MBB, MI, DL, get(Opcode)) - .addReg(SrcReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else { + if (!ST.isVGPRSpillingEnabled(MFI)) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" " spill register"); BuildMI(MBB, MI, DL, get(AMDGPU::KILL)) - .addReg(SrcReg); + .addReg(SrcReg); + + return; + } + + assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + + unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize()); + MFI->setHasSpilledVGPRs(); + BuildMI(MBB, MI, DL, get(Opcode)) + .addReg(SrcReg) // src + .addFrameIndex(FrameIndex) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addMemOperand(MMO); +} + +static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_S32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_S64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_S128_RESTORE; + case 32: + return AMDGPU::SI_SPILL_S256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_S512_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + +static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_V32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_V64_RESTORE; + case 16: + return AMDGPU::SI_SPILL_V128_RESTORE; + case 32: + return AMDGPU::SI_SPILL_V256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_V512_RESTORE; + default: + llvm_unreachable("unknown register size"); } } @@ -534,42 +635,43 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); - int Opcode = -1; - - if (RI.isSGPRClass(RC)){ - switch(RC->getSize() * 8) { - case 32: Opcode = 
AMDGPU::SI_SPILL_S32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; - } - } else if(RI.hasVGPRs(RC) && ST.isVGPRSpillingEnabled(MFI)) { - switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; - case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; - } - } + unsigned Align = FrameInfo->getObjectAlignment(FrameIndex); + unsigned Size = FrameInfo->getObjectSize(FrameIndex); + + MachinePointerInfo PtrInfo + = MachinePointerInfo::getFixedStack(*MF, FrameIndex); - if (Opcode != -1) { - FrameInfo->setObjectAlignment(FrameIndex, 4); + MachineMemOperand *MMO = MF->getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, Size, Align); + + if (RI.isSGPRClass(RC)) { + // FIXME: Maybe this should not include a memoperand because it will be + // lowered to non-memory instructions. + unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize()); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) - // Place-holder registers, these will be filled in by - // SIPrepareScratchRegs. - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); + .addFrameIndex(FrameIndex) // frame_idx + .addMemOperand(MMO); - } else { + return; + } + + if (!ST.isVGPRSpillingEnabled(MFI)) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" " restore register"); BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg); + + return; } + + assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected"); + + unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize()); + BuildMI(MBB, MI, DL, get(Opcode), DestReg) + .addFrameIndex(FrameIndex) // frame_idx + .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc + .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset + .addMemOperand(MMO); } /// \param @Offset Offset in bytes of the FrameIndex being spilled @@ -601,17 +703,21 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, if (MFI->getShaderType() == ShaderType::COMPUTE && WorkGroupSize > WavefrontSize) { - unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); - unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); - unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned TIDIGXReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X); + unsigned TIDIGYReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y); + unsigned TIDIGZReg + = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z); unsigned InputPtrReg = - TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); + TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR); for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) { if (!Entry.isLiveIn(Reg)) Entry.addLiveIn(Reg); } RS->enterBasicBlock(&Entry); + // FIXME: Can we scavenge an SReg_64 and access the subregs? 
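The spill rework above replaces the inline switch statements with helpers that pick the save/restore pseudo purely from the register class size in bytes. A condensed sketch of that selection, returning the opcode name as a string just for illustration (the real helpers return the AMDGPU opcode enumerators shown above):

#include <stdexcept>
#include <string>

std::string getSpillOpcodeSketch(unsigned SizeInBytes, bool IsSGPR, bool IsSave) {
  const char *Width;
  switch (SizeInBytes) {
  case 4:  Width = "32";  break;
  case 8:  Width = "64";  break;
  case 16: Width = "128"; break;
  case 32: Width = "256"; break;
  case 64: Width = "512"; break;
  default: throw std::logic_error("unknown register size");
  }
  return std::string("SI_SPILL_") + (IsSGPR ? "S" : "V") + Width +
         (IsSave ? "_SAVE" : "_RESTORE");
}
// e.g. a 16-byte VGPR class being saved maps to SI_SPILL_V128_SAVE.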
unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) @@ -667,8 +773,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, return TmpReg; } -void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, - int Count) const { +void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI, + int Count) const { while (Count > 0) { int Arg; if (Count >= 8) @@ -687,26 +793,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { switch (MI->getOpcode()) { default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); - case AMDGPU::SI_CONSTDATA_PTR: { - unsigned Reg = MI->getOperand(0).getReg(); - unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0); - unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1); - - BuildMI(MBB, MI, DL, get(AMDGPU::S_GETPC_B64), Reg); - - // Add 32-bit offset from this instruction to the start of the constant data. - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .addTargetIndex(AMDGPU::TI_CONSTDATA_START) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit); - BuildMI(MBB, MI, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi) - .addImm(0) - .addReg(AMDGPU::SCC, RegState::Define | RegState::Implicit) - .addReg(AMDGPU::SCC, RegState::Implicit); - MI->eraseFromParent(); - break; - } case AMDGPU::SGPR_USE: // This is just a placeholder for register allocation. MI->eraseFromParent(); @@ -760,49 +846,90 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MI->eraseFromParent(); break; } + + case AMDGPU::SI_CONSTDATA_PTR: { + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo *>(ST.getRegisterInfo()); + MachineFunction &MF = *MBB.getParent(); + unsigned Reg = MI->getOperand(0).getReg(); + unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0); + unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1); + + // Create a bundle so these instructions won't be re-ordered by the + // post-RA scheduler. + MIBundleBuilder Bundler(MBB, MI); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); + + // Add 32-bit offset from this instruction to the start of the + // constant data. + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) + .addReg(RegLo) + .addOperand(MI->getOperand(1))); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .addImm(0)); + + llvm::finalizeBundle(MBB, Bundler.begin()); + + MI->eraseFromParent(); + break; + } } return true; } -MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, - bool NewMI) const { - - if (MI->getNumOperands() < 3) - return nullptr; - +/// Commutes the operands in the given instruction. +/// The commutable operands are specified by their indices OpIdx0 and OpIdx1. +/// +/// Do not call this method for a non-commutable instruction or for +/// non-commutable pair of operand indices OpIdx0 and OpIdx1. +/// Even though the instruction is commutable, the method may still +/// fail to commute the operands, null pointer is returned in such cases. 
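The doc comment above warns that commuting may still fail and return null, and the commuteInstructionImpl body that follows (continuing in the next hunk) bails out unless the requested indices name exactly the src0/src1 pair, in either order. A standalone sketch of that index guard:

bool indicesAreSrc0Src1(unsigned OpIdx0, unsigned OpIdx1,
                        int Src0Idx, int Src1Idx) {
  if (Src0Idx < 0 || Src1Idx < 0)   // operand not present on this opcode
    return false;
  unsigned S0 = static_cast<unsigned>(Src0Idx);
  unsigned S1 = static_cast<unsigned>(Src1Idx);
  return (OpIdx0 == S0 && OpIdx1 == S1) || (OpIdx0 == S1 && OpIdx1 == S0);
}
// Callers must still be prepared for a null result even when this guard passes.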
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const { int CommutedOpcode = commuteOpcode(*MI); if (CommutedOpcode == -1) return nullptr; int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src0); - assert(Src0Idx != -1 && "Should always have src0 operand"); - MachineOperand &Src0 = MI->getOperand(Src0Idx); if (!Src0.isReg()) return nullptr; int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::src1); - if (Src1Idx == -1) + + if ((OpIdx0 != static_cast<unsigned>(Src0Idx) || + OpIdx1 != static_cast<unsigned>(Src1Idx)) && + (OpIdx0 != static_cast<unsigned>(Src1Idx) || + OpIdx1 != static_cast<unsigned>(Src0Idx))) return nullptr; MachineOperand &Src1 = MI->getOperand(Src1Idx); - // Make sure it's legal to commute operands for VOP2. - if (isVOP2(MI->getOpcode()) && - (!isOperandLegal(MI, Src0Idx, &Src1) || - !isOperandLegal(MI, Src1Idx, &Src0))) { - return nullptr; + + if (isVOP2(*MI)) { + const MCInstrDesc &InstrDesc = MI->getDesc(); + // For VOP2 instructions, any operand type is valid to use for src0. Make + // sure we can use the src1 as src0. + // + // We could be stricter here and only allow commuting if there is a reason + // to do so. i.e. if both operands are VGPRs there is no real benefit, + // although MachineCSE attempts to find matches by commuting. + const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) + return nullptr; } if (!Src1.isReg()) { // Allow commuting instructions with Imm operands. if (NewMI || !Src1.isImm() || - (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) { + (!isVOP2(*MI) && !isVOP3(*MI))) { return nullptr; } - // Be sure to copy the source modifiers to the right place. if (MachineOperand *Src0Mods = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { @@ -832,7 +959,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, Src1.ChangeToRegister(Reg, false); Src1.setSubReg(SubReg); } else { - MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1); } if (MI) @@ -845,8 +972,8 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI, // between the true commutable operands, and the base // TargetInstrInfo::commuteInstruction uses it. bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, - unsigned &SrcOpIdx1, - unsigned &SrcOpIdx2) const { + unsigned &SrcOpIdx0, + unsigned &SrcOpIdx1) const { const MCInstrDesc &MCID = MI->getDesc(); if (!MCID.isCommutable()) return false; @@ -857,7 +984,8 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, return false; // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on - // immediate. + // immediate. Also, immediate src0 operand is not handled in + // SIInstrInfo::commuteInstruction(); if (!MI->getOperand(Src0Idx).isReg()) return false; @@ -865,18 +993,22 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI, if (Src1Idx == -1) return false; - if (!MI->getOperand(Src1Idx).isReg()) - return false; - - // If any source modifiers are set, the generic instruction commuting won't - // understand how to copy the source modifiers. 
- if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || - hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + MachineOperand &Src1 = MI->getOperand(Src1Idx); + if (Src1.isImm()) { + // SIInstrInfo::commuteInstruction() does support commuting the immediate + // operand src1 in 2 and 3 operand instructions. + if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode())) + return false; + } else if (Src1.isReg()) { + // If any source modifiers are set, the generic instruction commuting won't + // understand how to copy the source modifiers. + if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) || + hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers)) + return false; + } else return false; - SrcOpIdx1 = Src0Idx; - SrcOpIdx2 = Src1Idx; - return true; + return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx); } MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB, @@ -898,11 +1030,6 @@ bool SIInstrInfo::isMov(unsigned Opcode) const { } } -bool -SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { - return RC != &AMDGPU::EXECRegRegClass; -} - static void removeModOperands(MachineInstr &MI) { unsigned Opc = MI.getOpcode(); int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, @@ -984,9 +1111,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); } - UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, - AMDGPU::OpName::src2)); - // ChangingToImmediate adds Src2 back to the instruction. Src2->ChangeToImmediate(Imm); removeModOperands(*UseMI); @@ -1045,18 +1169,6 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, return false; } -bool -SIInstrInfo::isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA) const { - switch(MI->getOpcode()) { - default: return AMDGPUInstrInfo::isTriviallyReMaterializable(MI, AA); - case AMDGPU::S_MOV_B32: - case AMDGPU::S_MOV_B64: - case AMDGPU::V_MOV_B32_e32: - return MI->getOperand(1).isImm(); - } -} - static bool offsetsDoNotOverlap(int WidthA, int OffsetA, int WidthB, int OffsetB) { int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; @@ -1088,9 +1200,6 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa, bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, AliasAnalysis *AA) const { - unsigned Opc0 = MIa->getOpcode(); - unsigned Opc1 = MIb->getOpcode(); - assert(MIa && (MIa->mayLoad() || MIa->mayStore()) && "MIa must load from or modify a memory location"); assert(MIb && (MIb->mayLoad() || MIb->mayStore()) && @@ -1105,32 +1214,32 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, // TODO: Should we check the address space from the MachineMemOperand? That // would allow us to distinguish objects we know don't alias based on the - // underlying addres space, even if it was lowered to a different one, + // underlying address space, even if it was lowered to a different one, // e.g. private accesses lowered to use MUBUF instructions on a scratch // buffer. 
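The disjointness checks in areMemAccessesTriviallyDisjoint ultimately reduce to the one-dimensional interval test performed by offsetsDoNotOverlap, whose opening appears above: two accesses cannot alias if the one with the lower offset ends at or before the other begins. A standalone sketch of that test:

bool offsetsDoNotOverlapSketch(int WidthA, int OffsetA,
                               int WidthB, int OffsetB) {
  int LowOffset  = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth   = OffsetA < OffsetB ? WidthA  : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}
// e.g. a 4-byte access at offset 0 and an 8-byte access at offset 4 do not
// overlap; move the second to offset 2 and they would.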
- if (isDS(Opc0)) { - if (isDS(Opc1)) + if (isDS(*MIa)) { + if (isDS(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1); + return !isFLAT(*MIb); } - if (isMUBUF(Opc0) || isMTBUF(Opc0)) { - if (isMUBUF(Opc1) || isMTBUF(Opc1)) + if (isMUBUF(*MIa) || isMTBUF(*MIa)) { + if (isMUBUF(*MIb) || isMTBUF(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1) && !isSMRD(Opc1); + return !isFLAT(*MIb) && !isSMRD(*MIb); } - if (isSMRD(Opc0)) { - if (isSMRD(Opc1)) + if (isSMRD(*MIa)) { + if (isSMRD(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(Opc1) && !isMUBUF(Opc0) && !isMTBUF(Opc0); + return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa); } - if (isFLAT(Opc0)) { - if (isFLAT(Opc1)) + if (isFLAT(*MIa)) { + if (isFLAT(*MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); return false; @@ -1319,6 +1428,26 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, return false; } +static unsigned findImplicitSGPRRead(const MachineInstr &MI) { + for (const MachineOperand &MO : MI.implicit_operands()) { + // We only care about reads. + if (MO.isDef()) + continue; + + switch (MO.getReg()) { + case AMDGPU::VCC: + case AMDGPU::M0: + case AMDGPU::FLAT_SCR: + return MO.getReg(); + + default: + break; + } + } + + return AMDGPU::NoRegister; +} + bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, StringRef &ErrInfo) const { uint16_t Opcode = MI->getOpcode(); @@ -1335,7 +1464,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, return false; } - // Make sure the register classes are correct + // Make sure the register classes are correct. for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { if (MI->getOperand(i).isFPImm()) { ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast " @@ -1392,14 +1521,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, // Verify VOP* - if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) { + if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) { // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx }; unsigned ConstantBusCount = 0; - unsigned SGPRUsed = AMDGPU::NoRegister; + unsigned SGPRUsed = findImplicitSGPRRead(*MI); + if (SGPRUsed != AMDGPU::NoRegister) + ++ConstantBusCount; + for (int OpIdx : OpIndices) { if (OpIdx == -1) break; @@ -1435,6 +1567,16 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI, } } + // Make sure we aren't losing exec uses in the td files. This mostly requires + // being careful when using let Uses to try to add other use registers. 
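The verifier change above seeds the constant-bus count with any implicit SGPR read (VCC, M0, FLAT_SCR) before walking the explicit sources, enforcing the VALU rule that the constant bus may be used at most once per instruction, with a repeated SGPR counted only once. A simplified model of that bookkeeping, with plain integers standing in for register numbers (0 meaning "no register") and a flag for literal sources:

#include <vector>

struct SrcOp { unsigned SGPR; bool IsLiteral; };

bool satisfiesConstantBusLimit(unsigned ImplicitSGPR,
                               const std::vector<SrcOp> &Srcs) {
  unsigned Count = 0;
  unsigned SGPRUsed = ImplicitSGPR;        // e.g. the VCC read of v_addc_u32
  if (ImplicitSGPR != 0)
    ++Count;
  for (const SrcOp &S : Srcs) {
    if (S.SGPR != 0) {
      if (S.SGPR != SGPRUsed) {            // the same SGPR twice counts once
        SGPRUsed = S.SGPR;
        ++Count;
      }
    } else if (S.IsLiteral) {
      ++Count;                             // a literal also occupies the bus
    }
  }
  return Count <= 1;
}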
+ if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) { + const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC); + if (!Exec || !Exec->isImplicit()) { + ErrInfo = "VALU instruction does not implicitly read exec mask"; + return false; + } + } + return true; } @@ -1483,11 +1625,17 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) { case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32; case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32; case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; + case AMDGPU::S_LOAD_DWORD_SGPR: + case AMDGPU::S_LOAD_DWORD_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORD_ADDR64; case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; + case AMDGPU::S_LOAD_DWORDX2_SGPR: + case AMDGPU::S_LOAD_DWORDX2_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64; case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; + case AMDGPU::S_LOAD_DWORDX4_SGPR: + case AMDGPU::S_LOAD_DWORDX4_IMM_ci: + return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64; case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64; case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32; case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32; @@ -1562,17 +1710,21 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, unsigned SubIdx, const TargetRegisterClass *SubRC) const { - assert(SuperReg.isReg()); - - unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); + MachineBasicBlock *MBB = MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); unsigned SubReg = MRI.createVirtualRegister(SubRC); + if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) { + BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg) + .addReg(SuperReg.getReg(), 0, SubIdx); + return SubReg; + } + // Just in case the super register is itself a sub-register, copy it to a new // value so we don't need to worry about merging its subreg index with the // SubIdx passed to this function. The register coalescer should be able to // eliminate this extra copy. 
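The buildExtractSubReg change above takes a fast path when the source operand carries no subregister index (a single COPY with the desired index) and otherwise copies the whole super register first so two subregister indices never need to be composed by hand. A rough control-flow sketch, with strings standing in for the emitted machine instructions:

#include <string>

std::string extractSubRegSketch(const std::string &SuperReg,
                                bool SuperHasSubRegIdx,
                                const std::string &SubIdx) {
  if (!SuperHasSubRegIdx)
    return "COPY dst, " + SuperReg + ":" + SubIdx;            // fast path
  // Slow path: copy the super register whole, then extract from the copy.
  // The register coalescer is expected to remove the extra COPY later.
  return "COPY tmp, " + SuperReg + "; COPY dst, tmp:" + SubIdx;
}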
- MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); + unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC); BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg) .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg()); @@ -1605,36 +1757,6 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm( return MachineOperand::CreateReg(SubReg, false); } -unsigned SIInstrInfo::split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, - MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineOperand &Op) const { - MachineBasicBlock *MBB = MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned LoDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned HiDst = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned Dst = MRI.createVirtualRegister(RC); - - MachineInstr *Lo = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - LoDst) - .addImm(Op.getImm() & 0xFFFFFFFF); - MachineInstr *Hi = BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), - HiDst) - .addImm(Op.getImm() >> 32); - - BuildMI(*MBB, MI, DL, get(TargetOpcode::REG_SEQUENCE), Dst) - .addReg(LoDst) - .addImm(AMDGPU::sub0) - .addReg(HiDst) - .addImm(AMDGPU::sub1); - - Worklist.push_back(Lo); - Worklist.push_back(Hi); - - return Dst; -} - // Change the order of operands from (0, 1, 2) to (0, 2, 1) void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { assert(Inst->getNumExplicitOperands() == 3); @@ -1643,6 +1765,41 @@ void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const { Inst->addOperand(Op1); } +bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (!MO.isReg()) + return false; + + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RC = + TargetRegisterInfo::isVirtualRegister(Reg) ? + MRI.getRegClass(Reg) : + RI.getPhysRegClass(Reg); + + // In order to be legal, the common sub-class must be equal to the + // class of the current operand. For example: + // + // v_mov_b32 s0 ; Operand defined as vsrc_32 + // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL + // + // s_sendmsg 0, s0 ; Operand defined as m0reg + // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL + + return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; +} + +bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const { + if (MO.isReg()) + return isLegalRegOperand(MRI, OpInfo, MO); + + // Handle non-register types that are treated like immediates. + assert(MO.isImm() || MO.isTargetIndex() || MO.isFI()); + return true; +} + bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO) const { const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); @@ -1653,7 +1810,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (!MO) MO = &MI->getOperand(OpIdx); - if (isVALU(InstDesc.Opcode) && + if (isVALU(*MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) { unsigned SGPRUsed = MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister; @@ -1670,21 +1827,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, if (MO->isReg()) { assert(DefinedRC); - const TargetRegisterClass *RC = - TargetRegisterInfo::isVirtualRegister(MO->getReg()) ? 
- MRI.getRegClass(MO->getReg()) : - RI.getPhysRegClass(MO->getReg()); - - // In order to be legal, the common sub-class must be equal to the - // class of the current operand. For example: - // - // v_mov_b32 s0 ; Operand defined as vsrc_32 - // ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL - // - // s_sendmsg 0, s0 ; Operand defined as m0reg - // ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL - - return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC; + return isLegalRegOperand(MRI, OpInfo, *MO); } @@ -1699,81 +1842,143 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx, return isImmOperandLegal(MI, OpIdx, *MO); } -void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { - MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); +void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + const MCInstrDesc &InstrDesc = get(Opc); - int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src0); - int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src1); - int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), - AMDGPU::OpName::src2); + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + MachineOperand &Src1 = MI->getOperand(Src1Idx); - // Legalize VOP2 - if (isVOP2(MI->getOpcode()) && Src1Idx != -1) { - // Legalize src0 - if (!isOperandLegal(MI, Src0Idx)) + // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32 + // we need to only have one constant bus use. + // + // Note we do not need to worry about literal constants here. They are + // disabled for the operand type for instructions because they will always + // violate the one constant bus use rule. + bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister; + if (HasImplicitSGPR) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + + if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) legalizeOpWithMove(MI, Src0Idx); + } - // Legalize src1 - if (isOperandLegal(MI, Src1Idx)) - return; + // VOP2 src0 instructions support all operand types, so we don't need to check + // their legality. If src1 is already legal, we don't need to do anything. + if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1)) + return; - // Usually src0 of VOP2 instructions allow more types of inputs - // than src1, so try to commute the instruction to decrease our - // chances of having to insert a MOV instruction to legalize src1. - if (MI->isCommutable()) { - if (commuteInstruction(MI)) - // If we are successful in commuting, then we know MI is legal, so - // we are done. - return; - } + // We do not use commuteInstruction here because it is too aggressive and will + // commute if it is possible. We only want to commute here if it improves + // legality. This can be called a fairly large number of times so don't waste + // compile time pointlessly swapping and checking legality again. + if (HasImplicitSGPR || !MI->isCommutable()) { + legalizeOpWithMove(MI, Src1Idx); + return; + } + + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + MachineOperand &Src0 = MI->getOperand(Src0Idx); + // If src0 can be used as src1, commuting will make the operands legal. + // Otherwise we have to give up and insert a move. + // + // TODO: Other immediate-like operand kinds could be commuted if there was a + // MachineOperand::ChangeTo* for them. 
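Because moving an operand into a VGPR costs an extra v_mov, the VOP2 legalization above only commutes when that actually fixes legality. A condensed decision sketch (it omits the src0 fix-up for the implicit-SGPR case and uses booleans in place of the register-class and operand-kind queries made by the real code):

enum class VOP2Fix { None, MoveSrc1ToVGPR, CommuteOperands };

VOP2Fix chooseVOP2Fix(bool HasImplicitSGPR, bool Src1IsLegal,
                      bool IsCommutable, bool Src0LegalAsSrc1,
                      bool CommutedOpcodeExists) {
  if (Src1IsLegal)
    return VOP2Fix::None;
  // An implicit SGPR read (e.g. VCC in v_addc_u32) already occupies the single
  // constant-bus slot, so commuting cannot help; fall back to a copy.
  if (HasImplicitSGPR || !IsCommutable)
    return VOP2Fix::MoveSrc1ToVGPR;
  if (!Src0LegalAsSrc1 || !CommutedOpcodeExists)
    return VOP2Fix::MoveSrc1ToVGPR;
  return VOP2Fix::CommuteOperands;
}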
+ if ((!Src1.isImm() && !Src1.isReg()) || + !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) { legalizeOpWithMove(MI, Src1Idx); return; } - // XXX - Do any VOP3 instructions read VCC? - // Legalize VOP3 - if (isVOP3(MI->getOpcode())) { - int VOP3Idx[3] = { Src0Idx, Src1Idx, Src2Idx }; + int CommutedOpc = commuteOpcode(*MI); + if (CommutedOpc == -1) { + legalizeOpWithMove(MI, Src1Idx); + return; + } - // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + MI->setDesc(get(CommutedOpc)); - for (unsigned i = 0; i < 3; ++i) { - int Idx = VOP3Idx[i]; - if (Idx == -1) - break; - MachineOperand &MO = MI->getOperand(Idx); + unsigned Src0Reg = Src0.getReg(); + unsigned Src0SubReg = Src0.getSubReg(); + bool Src0Kill = Src0.isKill(); - if (MO.isReg()) { - if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - continue; // VGPRs are legal + if (Src1.isImm()) + Src0.ChangeToImmediate(Src1.getImm()); + else if (Src1.isReg()) { + Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill()); + Src0.setSubReg(Src1.getSubReg()); + } else + llvm_unreachable("Should only have register or immediate operands"); - assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction"); + Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill); + Src1.setSubReg(Src0SubReg); +} - if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { - SGPRReg = MO.getReg(); - // We can use one SGPR in each VOP3 instruction. - continue; - } - } else if (!isLiteralConstant(MO, getOpSize(MI->getOpcode(), Idx))) { - // If it is not a register and not a literal constant, then it must be - // an inline constant which is always legal. - continue; - } - // If we make it this far, then the operand is not legal and we must - // legalize it. - legalizeOpWithMove(MI, Idx); +// Legalize VOP3 operands. Because all operand types are supported for any +// operand, and since literal constants are not allowed and should never be +// seen, we only need to worry about inserting copies if we use multiple SGPR +// operands. +void SIInstrInfo::legalizeOperandsVOP3( + MachineRegisterInfo &MRI, + MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + + int VOP3Idx[3] = { + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2) + }; + + // Find the one SGPR operand we are allowed to use. + unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx); + + for (unsigned i = 0; i < 3; ++i) { + int Idx = VOP3Idx[i]; + if (Idx == -1) + break; + MachineOperand &MO = MI->getOperand(Idx); + + // We should never see a VOP3 instruction with an illegal immediate operand. + if (!MO.isReg()) + continue; + + if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) + continue; // VGPRs are legal + + if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) { + SGPRReg = MO.getReg(); + // We can use one SGPR in each VOP3 instruction. + continue; } + + // If we make it this far, then the operand is not legal and we must + // legalize it. 
+ legalizeOpWithMove(MI, Idx); + } +} + +void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { + MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + + // Legalize VOP2 + if (isVOP2(*MI)) { + legalizeOperandsVOP2(MRI, MI); + return; + } + + // Legalize VOP3 + if (isVOP3(*MI)) { + legalizeOperandsVOP3(MRI, MI); + return; } // Legalize REG_SEQUENCE and PHI // The register class of the operands much be the same type as the register // class of the output. - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE || - MI->getOpcode() == AMDGPU::PHI) { + if (MI->getOpcode() == AMDGPU::PHI) { const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr; for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { if (!MI->getOperand(i).isReg() || @@ -1802,26 +2007,53 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } // Update all the operands so they have the same type. - for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) { - if (!MI->getOperand(i).isReg() || - !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg())) + for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI->getOperand(I); + if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; unsigned DstReg = MRI.createVirtualRegister(RC); - MachineBasicBlock *InsertBB; - MachineBasicBlock::iterator Insert; - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { - InsertBB = MI->getParent(); - Insert = MI; - } else { - // MI is a PHI instruction. - InsertBB = MI->getOperand(i + 1).getMBB(); - Insert = InsertBB->getFirstTerminator(); + + // MI is a PHI instruction. + MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB(); + MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); + + BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); + Op.setReg(DstReg); + } + } + + // REG_SEQUENCE doesn't really require operand legalization, but if one has a + // VGPR dest type and SGPR sources, insert copies so all operands are + // VGPRs. This seems to help operand folding / the register coalescer. + if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { + MachineBasicBlock *MBB = MI->getParent(); + const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0); + if (RI.hasVGPRs(DstRC)) { + // Update all the operands so they are VGPR register classes. These may + // not be the same register class because REG_SEQUENCE supports mixing + // subregister index types e.g. sub0_sub1 + sub2 + sub3 + for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI->getOperand(I); + if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) + continue; + + const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg()); + const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC); + if (VRC == OpRC) + continue; + + unsigned DstReg = MRI.createVirtualRegister(VRC); + + BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); + + Op.setReg(DstReg); + Op.setIsKill(); } - BuildMI(*InsertBB, Insert, MI->getDebugLoc(), - get(AMDGPU::COPY), DstReg) - .addOperand(MI->getOperand(i)); - MI->getOperand(i).setReg(DstReg); } + + return; } // Legalize INSERT_SUBREG @@ -1858,15 +2090,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } MachineBasicBlock &MBB = *MI->getParent(); - // Extract the ptr from the resource descriptor. 
- - // SRsrcPtrLo = srsrc:sub0 - unsigned SRsrcPtrLo = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub0, &AMDGPU::VGPR_32RegClass); - // SRsrcPtrHi = srsrc:sub1 - unsigned SRsrcPtrHi = buildExtractSubReg(MI, MRI, *SRsrc, - &AMDGPU::VReg_128RegClass, AMDGPU::sub1, &AMDGPU::VGPR_32RegClass); + // Extract the ptr from the resource descriptor. + unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc, + &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass); // Create an empty resource descriptor unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); @@ -1891,80 +2118,112 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { .addImm(RsrcDataFormat >> 32); // NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewSRsrc) - .addReg(Zero64) - .addImm(AMDGPU::sub0_sub1) - .addReg(SRsrcFormatLo) - .addImm(AMDGPU::sub2) - .addReg(SRsrcFormatHi) - .addImm(AMDGPU::sub3); + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) + .addReg(Zero64) + .addImm(AMDGPU::sub0_sub1) + .addReg(SRsrcFormatLo) + .addImm(AMDGPU::sub2) + .addReg(SRsrcFormatHi) + .addImm(AMDGPU::sub3); MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); - unsigned NewVAddrLo; - unsigned NewVAddrHi; if (VAddr) { // This is already an ADDR64 instruction so we need to add the pointer // extracted from the resource descriptor to the current value of VAddr. - NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - // NewVaddrLo = SRsrcPtrLo + VAddr:sub0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADD_I32_e32), - NewVAddrLo) - .addReg(SRsrcPtrLo) - .addReg(VAddr->getReg(), 0, AMDGPU::sub0) - .addReg(AMDGPU::VCC, RegState::ImplicitDefine); - - // NewVaddrHi = SRsrcPtrHi + VAddr:sub1 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_ADDC_U32_e32), - NewVAddrHi) - .addReg(SRsrcPtrHi) - .addReg(VAddr->getReg(), 0, AMDGPU::sub1) - .addReg(AMDGPU::VCC, RegState::ImplicitDefine) - .addReg(AMDGPU::VCC, RegState::Implicit); - + unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0 + DebugLoc DL = MI->getDebugLoc(); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) + .addReg(SRsrcPtr, 0, AMDGPU::sub0) + .addReg(VAddr->getReg(), 0, AMDGPU::sub0); + + // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1 + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi) + .addReg(SRsrcPtr, 0, AMDGPU::sub1) + .addReg(VAddr->getReg(), 0, AMDGPU::sub1); + + // NewVaddr = {NewVaddrHi, NewVaddrLo} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) + .addReg(NewVAddrLo) + .addImm(AMDGPU::sub0) + .addReg(NewVAddrHi) + .addImm(AMDGPU::sub1); } else { // This instructions is the _OFFSET variant, so we need to convert it to // ADDR64. + assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() + < AMDGPUSubtarget::VOLCANIC_ISLANDS && + "FIXME: Need to emit flat atomics here"); + MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); - - // Create the new instruction. 
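The ADDR64 legalization above adds the 64-bit pointer extracted from the resource descriptor to vaddr with a v_add_i32 / v_addc_u32 pair: the low halves are added first and the carry (held in VCC) feeds the high-half add. A scalar model of that arithmetic:

#include <cstdint>

void add64ViaHalves(std::uint32_t ALo, std::uint32_t AHi,
                    std::uint32_t BLo, std::uint32_t BHi,
                    std::uint32_t &SumLo, std::uint32_t &SumHi) {
  SumLo = ALo + BLo;                        // v_add_i32  -> produces the carry
  std::uint32_t Carry = SumLo < ALo ? 1 : 0;
  SumHi = AHi + BHi + Carry;                // v_addc_u32 -> consumes the carry
}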
unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); - MachineInstr *Addr64 = - BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) - .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. - // This will be replaced later - // with the new value of vaddr. - .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0); // tfe + + // Atomics rith return have have an additional tied operand and are + // missing some of the special bits. + MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in); + MachineInstr *Addr64; + + if (!VDataIn) { + // Regular buffer load / store. + MachineInstrBuilder MIB + = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. + .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset); + + // Atomics do not have this operand. + if (const MachineOperand *GLC + = getNamedOperand(*MI, AMDGPU::OpName::glc)) { + MIB.addImm(GLC->getImm()); + } + + MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)); + + if (const MachineOperand *TFE + = getNamedOperand(*MI, AMDGPU::OpName::tfe)) { + MIB.addImm(TFE->getImm()); + } + + MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + Addr64 = MIB; + } else { + // Atomics with return. + Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addOperand(*VDataIn) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. + .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset) + .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + } MI->removeFromParent(); MI = Addr64; - NewVAddrLo = SRsrcPtrLo; - NewVAddrHi = SRsrcPtrHi; + // NewVaddr = {NewVaddrHi, NewVaddrLo} + BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) + .addReg(SRsrcPtr, 0, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(SRsrcPtr, 0, AMDGPU::sub1) + .addImm(AMDGPU::sub1); + VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); } - // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), - NewVAddr) - .addReg(NewVAddrLo) - .addImm(AMDGPU::sub0) - .addReg(NewVAddrHi) - .addImm(AMDGPU::sub1); - - // Update the instruction to use NewVaddr VAddr->setReg(NewVAddr); // Update the instruction to use NewSRsrc @@ -2028,53 +2287,64 @@ void SIInstrInfo::splitSMRD(MachineInstr *MI, .addOperand(*SOff); unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) - .addOperand(*SOff) - .addImm(HalfSize); - Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp)) + .addReg(SOff->getReg(), 0, SOff->getSubReg()) + .addImm(HalfSize); + Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) .addReg(SBase->getReg(), getKillRegState(IsKill), SBase->getSubReg()) .addReg(OffsetSGPR); } unsigned SubLo, SubHi; + const TargetRegisterClass *NewDstRC; switch (HalfSize) { case 4: SubLo = AMDGPU::sub0; SubHi = AMDGPU::sub1; + NewDstRC = &AMDGPU::VReg_64RegClass; break; case 8: SubLo = AMDGPU::sub0_sub1; SubHi = AMDGPU::sub2_sub3; + NewDstRC = &AMDGPU::VReg_128RegClass; break; case 16: SubLo = AMDGPU::sub0_sub1_sub2_sub3; SubHi = AMDGPU::sub4_sub5_sub6_sub7; + NewDstRC = 
&AMDGPU::VReg_256RegClass; break; case 32: SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; + NewDstRC = &AMDGPU::VReg_512RegClass; break; default: llvm_unreachable("Unhandled HalfSize"); } - BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE)) - .addOperand(MI->getOperand(0)) - .addReg(RegLo) - .addImm(SubLo) - .addReg(RegHi) - .addImm(SubHi); + unsigned OldDst = MI->getOperand(0).getReg(); + unsigned NewDst = MRI.createVirtualRegister(NewDstRC); + + MRI.replaceRegWith(OldDst, NewDst); + + BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst) + .addReg(RegLo) + .addImm(SubLo) + .addReg(RegHi) + .addImm(SubHi); } -void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const { +void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, + MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const { MachineBasicBlock *MBB = MI->getParent(); - switch (MI->getOpcode()) { - case AMDGPU::S_LOAD_DWORD_IMM: - case AMDGPU::S_LOAD_DWORD_SGPR: - case AMDGPU::S_LOAD_DWORDX2_IMM: - case AMDGPU::S_LOAD_DWORDX2_SGPR: - case AMDGPU::S_LOAD_DWORDX4_IMM: - case AMDGPU::S_LOAD_DWORDX4_SGPR: { + int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); + assert(DstIdx != -1); + unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass; + switch(RI.getRegClass(DstRCID)->getSize()) { + case 4: + case 8: + case 16: { unsigned NewOpcode = getVALUOp(*MI); unsigned RegOffset; unsigned ImmOffset; @@ -2118,53 +2388,55 @@ void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) con BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) .addImm(RsrcDataFormat >> 32); BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) - .addReg(DWord0) - .addImm(AMDGPU::sub0) - .addReg(DWord1) - .addImm(AMDGPU::sub1) - .addReg(DWord2) - .addImm(AMDGPU::sub2) - .addReg(DWord3) - .addImm(AMDGPU::sub3); - MI->setDesc(get(NewOpcode)); - if (MI->getOperand(2).isReg()) { - MI->getOperand(2).setReg(SRsrc); - } else { - MI->getOperand(2).ChangeToRegister(SRsrc, false); - } - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset)); - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // glc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // slc - MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(0)); // tfe - - const TargetRegisterClass *NewDstRC = - RI.getRegClass(get(NewOpcode).OpInfo[0].RegClass); - - unsigned DstReg = MI->getOperand(0).getReg(); + .addReg(DWord0) + .addImm(AMDGPU::sub0) + .addReg(DWord1) + .addImm(AMDGPU::sub1) + .addReg(DWord2) + .addImm(AMDGPU::sub2) + .addReg(DWord3) + .addImm(AMDGPU::sub3); + + const MCInstrDesc &NewInstDesc = get(NewOpcode); + const TargetRegisterClass *NewDstRC + = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass); unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); + unsigned DstReg = MI->getOperand(0).getReg(); MRI.replaceRegWith(DstReg, NewDstReg); + + MachineInstr *NewInst = + BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg) + .addOperand(MI->getOperand(1)) // sbase + .addReg(SRsrc) + .addImm(0) + .addImm(ImmOffset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MI->eraseFromParent(); + + legalizeOperands(NewInst); + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); break; } - case 
AMDGPU::S_LOAD_DWORDX8_IMM: - case AMDGPU::S_LOAD_DWORDX8_SGPR: { + case 32: { MachineInstr *Lo, *Hi; splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); + moveSMRDToVALU(Lo, MRI, Worklist); + moveSMRDToVALU(Hi, MRI, Worklist); break; } - case AMDGPU::S_LOAD_DWORDX16_IMM: - case AMDGPU::S_LOAD_DWORDX16_SGPR: { + case 64: { MachineInstr *Lo, *Hi; splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI); - moveSMRDToVALU(Hi, MRI); + moveSMRDToVALU(Lo, MRI, Worklist); + moveSMRDToVALU(Hi, MRI, Worklist); break; } } @@ -2185,51 +2457,28 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // Handle some special cases switch (Opcode) { default: - if (isSMRD(Inst->getOpcode())) { - moveSMRDToVALU(Inst, MRI); + if (isSMRD(*Inst)) { + moveSMRDToVALU(Inst, MRI, Worklist); + continue; } break; - case AMDGPU::S_MOV_B64: { - DebugLoc DL = Inst->getDebugLoc(); - - // If the source operand is a register we can replace this with a - // copy. - if (Inst->getOperand(1).isReg()) { - MachineInstr *Copy = BuildMI(*MBB, Inst, DL, get(TargetOpcode::COPY)) - .addOperand(Inst->getOperand(0)) - .addOperand(Inst->getOperand(1)); - Worklist.push_back(Copy); - } else { - // Otherwise, we need to split this into two movs, because there is - // no 64-bit VALU move instruction. - unsigned Reg = Inst->getOperand(0).getReg(); - unsigned Dst = split64BitImm(Worklist, - Inst, - MRI, - MRI.getRegClass(Reg), - Inst->getOperand(1)); - MRI.replaceRegWith(Reg, Dst); - } - Inst->eraseFromParent(); - continue; - } case AMDGPU::S_AND_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_OR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_XOR_B64: - splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32); + splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); Inst->eraseFromParent(); continue; case AMDGPU::S_NOT_B64: - splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32); + splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); Inst->eraseFromParent(); continue; @@ -2281,6 +2530,11 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } break; + case AMDGPU::S_ABS_I32: + lowerScalarAbs(Worklist, Inst); + Inst->eraseFromParent(); + continue; + case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); @@ -2319,7 +2573,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { Inst->addOperand(MachineOperand::CreateImm(0)); } - addDescImplicitUseDef(NewDesc, Inst); + Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent()); if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { const MachineOperand &OffsetWidthOp = Inst->getOperand(2); @@ -2337,27 +2591,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } // Update the destination register class. 
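
The rewritten moveToVALU above is worklist-driven: every lowering helper queues the users of the value it just moved into VGPRs, so the SALU-to-VALU conversion propagates along the use chains. A minimal sketch of that driver pattern, assuming the LLVM CodeGen headers and the SIInstrInfo API added in this patch (the free-standing function and its simplifications are illustrative, not the in-tree code):

#include "SIInstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

// Illustrative driver: drain a worklist, dispatching SMRDs to moveSMRDToVALU
// and letting each helper push the users of its new VGPR-defined result.
static void moveToVALUSketch(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
                             MachineInstr &TopInst) {
  SmallVector<MachineInstr *, 128> Worklist;
  Worklist.push_back(&TopInst);

  while (!Worklist.empty()) {
    MachineInstr *Inst = Worklist.pop_back_val();

    if (SIInstrInfo::isSMRD(*Inst)) {
      // Rewritten into a MUBUF access; wide loads are split and re-queued.
      TII.moveSMRDToVALU(Inst, MRI, Worklist);
      continue;
    }

    // Opcode-specific lowering (splitScalar64BitBinaryOp, lowerScalarAbs, ...)
    // or the generic path: replace the destination with a VGPR-class vreg,
    // legalize the operands, then queue the users of the new destination via
    // addUsersToMoveToVALUWorklist.
  }
}
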
- - const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0); - - switch (Opcode) { - // For target instructions, getOpRegClass just returns the virtual - // register class associated with the operand, so we need to find an - // equivalent VGPR register class in order to move the instruction to the - // VALU. - case AMDGPU::COPY: - case AMDGPU::PHI: - case AMDGPU::REG_SEQUENCE: - case AMDGPU::INSERT_SUBREG: - if (RI.hasVGPRs(NewDstRC)) - continue; - NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); - if (!NewDstRC) - continue; - break; - default: - break; - } + const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); + if (!NewDstRC) + continue; unsigned DstReg = Inst->getOperand(0).getReg(); unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); @@ -2366,13 +2602,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // Legalize the operands legalizeOperands(Inst); - for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg), - E = MRI.use_end(); I != E; ++I) { - MachineInstr &UseMI = *I->getParent(); - if (!canReadVGPR(UseMI, I.getOperandNo())) { - Worklist.push_back(&UseMI); - } - } + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } } @@ -2390,6 +2620,30 @@ const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { return &AMDGPU::VGPR_32RegClass; } +void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const { + MachineBasicBlock &MBB = *Inst->getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst->getDebugLoc(); + + MachineOperand &Dest = Inst->getOperand(0); + MachineOperand &Src = Inst->getOperand(1); + unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg) + .addImm(0) + .addReg(Src.getReg()); + + BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg) + .addReg(Src.getReg()) + .addReg(TmpReg); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + void SIInstrInfo::splitScalar64BitUnaryOp( SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst, @@ -2414,20 +2668,21 @@ void SIInstrInfo::splitScalar64BitUnaryOp( AMDGPU::sub0, Src0SubRC); const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); + const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(DestRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) + unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); + BuildMI(MBB, MII, DL, InstDesc, DestSub0) .addOperand(SrcReg0Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) + unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); + BuildMI(MBB, MII, DL, InstDesc, DestSub1) .addOperand(SrcReg0Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) 
.addImm(AMDGPU::sub0) @@ -2436,10 +2691,11 @@ void SIInstrInfo::splitScalar64BitUnaryOp( MRI.replaceRegWith(Dest.getReg(), FullDestReg); - // Try to legalize the operands in case we need to swap the order to keep it - // valid. - Worklist.push_back(LoHalf); - Worklist.push_back(HiHalf); + // We don't need to legalizeOperands here because for a single operand, src0 + // will support any kind of input. + + // Move all users of this moved value. + addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBinaryOp( @@ -2474,9 +2730,10 @@ void SIInstrInfo::splitScalar64BitBinaryOp( AMDGPU::sub0, Src1SubRC); const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg()); - const TargetRegisterClass *DestSubRC = RI.getSubRegClass(DestRC, AMDGPU::sub0); + const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC); + const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); - unsigned DestSub0 = MRI.createVirtualRegister(DestRC); + unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) .addOperand(SrcReg0Sub0) .addOperand(SrcReg1Sub0); @@ -2486,12 +2743,12 @@ void SIInstrInfo::splitScalar64BitBinaryOp( MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); - unsigned DestSub1 = MRI.createVirtualRegister(DestSubRC); + unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) .addOperand(SrcReg0Sub1) .addOperand(SrcReg1Sub1); - unsigned FullDestReg = MRI.createVirtualRegister(DestRC); + unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) .addReg(DestSub0) .addImm(AMDGPU::sub0) @@ -2502,8 +2759,11 @@ void SIInstrInfo::splitScalar64BitBinaryOp( // Try to legalize the operands in case we need to swap the order to keep it // valid. - Worklist.push_back(LoHalf); - Worklist.push_back(HiHalf); + legalizeOperands(LoHalf); + legalizeOperands(HiHalf); + + // Move all users of this moved vlaue. + addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, @@ -2532,18 +2792,19 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC); - MachineInstr *First = BuildMI(MBB, MII, DL, InstDesc, MidReg) + BuildMI(MBB, MII, DL, InstDesc, MidReg) .addOperand(SrcRegSub0) .addImm(0); - MachineInstr *Second = BuildMI(MBB, MII, DL, InstDesc, ResultReg) + BuildMI(MBB, MII, DL, InstDesc, ResultReg) .addOperand(SrcRegSub1) .addReg(MidReg); MRI.replaceRegWith(Dest.getReg(), ResultReg); - Worklist.push_back(First); - Worklist.push_back(Second); + // We don't need to legalize operands here. src0 for etiher instruction can be + // an SGPR, and the second input is unused or determined here. 
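
For orientation, a sketch (MIR-like pseudo notation, register names invented) of what splitScalar64BitBinaryOp now produces when moveToVALU routes a 64-bit scalar op such as S_AND_B64 through V_AND_B32_e64:

// %lo  = V_AND_B32_e64 %src0.sub0, %src1.sub0
// %hi  = V_AND_B32_e64 %src0.sub1, %src1.sub1
// %dst = REG_SEQUENCE %lo, sub0, %hi, sub1   ; %dst now lives in a VReg_64 class
//
// Both halves are then run through legalizeOperands(), and every user of the
// old 64-bit SGPR result is queued via addUsersToMoveToVALUWorklist() so it is
// moved to the VALU in turn.
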
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, @@ -2587,6 +2848,7 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); return; } @@ -2605,33 +2867,53 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, .addImm(AMDGPU::sub1); MRI.replaceRegWith(Dest.getReg(), ResultReg); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc, - MachineInstr *Inst) const { - // Add the implict and explicit register definitions. - if (NewDesc.ImplicitUses) { - for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) { - unsigned Reg = NewDesc.ImplicitUses[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, false, true)); +void SIInstrInfo::addUsersToMoveToVALUWorklist( + unsigned DstReg, + MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const { + for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), + E = MRI.use_end(); I != E; ++I) { + MachineInstr &UseMI = *I->getParent(); + if (!canReadVGPR(UseMI, I.getOperandNo())) { + Worklist.push_back(&UseMI); } } +} - if (NewDesc.ImplicitDefs) { - for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) { - unsigned Reg = NewDesc.ImplicitDefs[i]; - Inst->addOperand(MachineOperand::CreateReg(Reg, true, true)); - } +const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( + const MachineInstr &Inst) const { + const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); + + switch (Inst.getOpcode()) { + // For target instructions, getOpRegClass just returns the virtual register + // class associated with the operand, so we need to find an equivalent VGPR + // register class in order to move the instruction to the VALU. + case AMDGPU::COPY: + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + if (RI.hasVGPRs(NewDstRC)) + return nullptr; + + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + if (!NewDstRC) + return nullptr; + return NewDstRC; + default: + return NewDstRC; } } +// Find the one SGPR operand we are allowed to use. unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const { - const MCInstrDesc &Desc = get(MI->getOpcode()); + const MCInstrDesc &Desc = MI->getDesc(); // Find the one SGPR operand we are allowed to use. - unsigned SGPRReg = AMDGPU::NoRegister; - + // // First we need to consider the instruction's operand requirements before // legalizing. Some operands are required to be SGPRs, such as implicit uses // of VCC, but we are still bound by the constant bus requirement to only use @@ -2639,17 +2921,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, // // If the operand's class is an SGPR, we can never move it. - for (const MachineOperand &MO : MI->implicit_operands()) { - // We only care about reads. 
- if (MO.isDef()) - continue; - - if (MO.getReg() == AMDGPU::VCC) - return AMDGPU::VCC; - - if (MO.getReg() == AMDGPU::FLAT_SCR) - return AMDGPU::FLAT_SCR; - } + unsigned SGPRReg = findImplicitSGPRRead(*MI); + if (SGPRReg != AMDGPU::NoRegister) + return SGPRReg; unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); @@ -2660,15 +2934,22 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, break; const MachineOperand &MO = MI->getOperand(Idx); - if (RI.isSGPRClassID(Desc.OpInfo[Idx].RegClass)) - SGPRReg = MO.getReg(); + if (!MO.isReg()) + continue; - if (MO.isReg() && RI.isSGPRClass(MRI.getRegClass(MO.getReg()))) - UsedSGPRs[i] = MO.getReg(); - } + // Is this operand statically required to be an SGPR based on the operand + // constraints? + const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass); + bool IsRequiredSGPR = RI.isSGPRClass(OpRC); + if (IsRequiredSGPR) + return MO.getReg(); - if (SGPRReg != AMDGPU::NoRegister) - return SGPRReg; + // If this could be a VGPR or an SGPR, Check the dynamic register class. + unsigned Reg = MO.getReg(); + const TargetRegisterClass *RegRC = MRI.getRegClass(Reg); + if (RI.isSGPRClass(RegRC)) + UsedSGPRs[i] = Reg; + } // We don't have a required SGPR operand, so we have a bit more freedom in // selecting operands to move. @@ -2680,6 +2961,9 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, // V_FMA_F32 v0, s0, s0, s0 -> No moves // V_FMA_F32 v0, s0, s1, s0 -> Move s1 + // TODO: If some of the operands are 64-bit SGPRs and some 32, we should + // prefer those. + if (UsedSGPRs[0] != AMDGPU::NoRegister) { if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2]) SGPRReg = UsedSGPRs[0]; @@ -2720,7 +3004,7 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead( unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( getIndirectIndexBegin(*MBB->getParent())); - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC)) + return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1)) .addOperand(I->getOperand(0)) .addOperand(I->getOperand(1)) .addReg(IndirectBaseReg) diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 5053786..307ef67 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -39,14 +39,11 @@ private: unsigned SubIdx, const TargetRegisterClass *SubRC) const; - unsigned split64BitImm(SmallVectorImpl<MachineInstr *> &Worklist, - MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - const TargetRegisterClass *RC, - const MachineOperand &Op) const; - void swapOperands(MachineBasicBlock::iterator Inst) const; + void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + MachineInstr *Inst) const; + void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst, unsigned Opcode) const; @@ -58,13 +55,24 @@ private: void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr *Inst) const; - void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const; + void addUsersToMoveToVALUWorklist( + unsigned Reg, MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const; + + const TargetRegisterClass * + getDestEquivalentVGPRClass(const MachineInstr &Inst) const; bool checkInstOffsetsDoNotOverlap(MachineInstr *MIa, MachineInstr *MIb) const; unsigned findUsedSGPR(const MachineInstr *MI, int OpIndices[3]) const; +protected: + MachineInstr 
*commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx0, + unsigned OpIdx1) const override; + public: explicit SIInstrInfo(const AMDGPUSubtarget &st); @@ -117,17 +125,14 @@ public: // register. If there is no hardware instruction that can store to \p // DstRC, then AMDGPU::COPY is returned. unsigned getMovOpcode(const TargetRegisterClass *DstRC) const; + + LLVM_READONLY int commuteOpcode(const MachineInstr &MI) const; - MachineInstr *commuteInstruction(MachineInstr *MI, - bool NewMI = false) const override; bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; - bool isTriviallyReMaterializable(const MachineInstr *MI, - AliasAnalysis *AA = nullptr) const; - bool areMemAccessesTriviallyDisjoint( MachineInstr *MIa, MachineInstr *MIb, AliasAnalysis *AA = nullptr) const override; @@ -137,8 +142,6 @@ public: unsigned DstReg, unsigned SrcReg) const override; bool isMov(unsigned Opcode) const override; - bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; - bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI, unsigned Reg, MachineRegisterInfo *MRI) const final; @@ -148,78 +151,154 @@ public: MachineBasicBlock::iterator &MI, LiveVariables *LV) const override; + static bool isSALU(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SALU; + } + bool isSALU(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SALU; } + static bool isVALU(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VALU; + } + bool isVALU(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VALU; } + static bool isSOP1(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOP1; + } + bool isSOP1(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP1; } + static bool isSOP2(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOP2; + } + bool isSOP2(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOP2; } + static bool isSOPC(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPC; + } + bool isSOPC(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPC; } + static bool isSOPK(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPK; + } + bool isSOPK(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPK; } + static bool isSOPP(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SOPP; + } + bool isSOPP(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SOPP; } + static bool isVOP1(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP1; + } + bool isVOP1(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP1; } + static bool isVOP2(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP2; + } + bool isVOP2(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP2; } + static bool isVOP3(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOP3; + } + bool isVOP3(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOP3; } + static bool isVOPC(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VOPC; + } + bool isVOPC(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VOPC; } + static bool isMUBUF(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MUBUF; + } + bool isMUBUF(uint16_t Opcode) const { return 
get(Opcode).TSFlags & SIInstrFlags::MUBUF; } + static bool isMTBUF(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MTBUF; + } + bool isMTBUF(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MTBUF; } + static bool isSMRD(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::SMRD; + } + bool isSMRD(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::SMRD; } + static bool isDS(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::DS; + } + bool isDS(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::DS; } + static bool isMIMG(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::MIMG; + } + bool isMIMG(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::MIMG; } + static bool isFLAT(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::FLAT; + } + bool isFLAT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FLAT; } + static bool isWQM(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::WQM; + } + bool isWQM(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::WQM; } + static bool isVGPRSpill(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::VGPRSpill; + } + bool isVGPRSpill(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::VGPRSpill; } @@ -302,6 +381,26 @@ public: bool isOperandLegal(const MachineInstr *MI, unsigned OpIdx, const MachineOperand *MO = nullptr) const; + /// \brief Check if \p MO would be a valid operand for the given operand + /// definition \p OpInfo. Note this does not attempt to validate constant bus + /// restrictions (e.g. literal constant usage). + bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Check if \p MO (a register operand) is a legal register for the + /// given operand description. + bool isLegalRegOperand(const MachineRegisterInfo &MRI, + const MCOperandInfo &OpInfo, + const MachineOperand &MO) const; + + /// \brief Legalize operands in \p MI by either commuting it or inserting a + /// copy of src1. + void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr *MI) const; + + /// \brief Fix operands in \p MI to satisfy constant bus requirements. + void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const; + /// \brief Legalize all operands in this instruction. This function may /// create new instruction and insert them before \p MI. void legalizeOperands(MachineInstr *MI) const; @@ -312,7 +411,8 @@ public: unsigned HalfImmOp, unsigned HalfSGPROp, MachineInstr *&Lo, MachineInstr *&Hi) const; - void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const; + void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI, + SmallVectorImpl<MachineInstr *> &Worklist) const; /// \brief Replace this instruction's opcode with the equivalent VALU /// opcode. This function will also move the users of \p MI to the @@ -341,29 +441,49 @@ public: void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I, unsigned SavReg, unsigned IndexReg) const; - void insertNOPs(MachineBasicBlock::iterator MI, int Count) const; + void insertWaitStates(MachineBasicBlock::iterator MI, int Count) const; /// \brief Returns the operand named \p Op. If \p MI does not have an /// operand named \c Op, this function returns nullptr. 
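
The static forms of these predicates test MCInstrDesc::TSFlags directly, so a pass can classify a MachineInstr without holding an SIInstrInfo pointer. A small usage sketch (the helper names below are hypothetical):

#include "SIInstrInfo.h"

using namespace llvm;

// Hypothetical classification helpers built on the new static queries.
static bool isScalarMemOp(const MachineInstr &MI) {
  return SIInstrInfo::isSMRD(MI); // TSFlags & SIInstrFlags::SMRD
}

static bool isVectorMemOp(const MachineInstr &MI) {
  return SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isMTBUF(MI) ||
         SIInstrInfo::isMIMG(MI) || SIInstrInfo::isFLAT(MI);
}
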
+ LLVM_READONLY MachineOperand *getNamedOperand(MachineInstr &MI, unsigned OperandName) const; + LLVM_READONLY const MachineOperand *getNamedOperand(const MachineInstr &MI, unsigned OpName) const { return getNamedOperand(const_cast<MachineInstr &>(MI), OpName); } + /// Get required immediate operand + int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const { + int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName); + return MI.getOperand(Idx).getImm(); + } + uint64_t getDefaultRsrcDataFormat() const; uint64_t getScratchRsrcWords23() const; }; namespace AMDGPU { - + LLVM_READONLY int getVOPe64(uint16_t Opcode); + + LLVM_READONLY int getVOPe32(uint16_t Opcode); + + LLVM_READONLY int getCommuteRev(uint16_t Opcode); + + LLVM_READONLY int getCommuteOrig(uint16_t Opcode); + + LLVM_READONLY int getAddr64Inst(uint16_t Opcode); + + LLVM_READONLY int getAtomicRetOp(uint16_t Opcode); + + LLVM_READONLY int getAtomicNoRetOp(uint16_t Opcode); const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 8d8110b..10f2adde 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -8,9 +8,9 @@ //===----------------------------------------------------------------------===// def isCI : Predicate<"Subtarget->getGeneration() " ">= AMDGPUSubtarget::SEA_ISLANDS">; -def isVI : Predicate < - "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">, - AssemblerPredicate<"FeatureGCN3Encoding">; +def isCIOnly : Predicate<"Subtarget->getGeneration() ==" + "AMDGPUSubtarget::SEA_ISLANDS">, + AssemblerPredicate <"FeatureSeaIslands">; def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">; @@ -69,6 +69,15 @@ class sopk <bits<5> si, bits<5> vi = si> { field bits<5> VI = vi; } +// Specify an SMRD opcode for SI and SMEM opcode for VI + +// FIXME: This should really be bits<5> si, Tablegen crashes if +// parameter default value is other parameter with different bit size +class smrd<bits<8> si, bits<8> vi = si> { + field bits<5> SI = si{4-0}; + field bits<8> VI = vi; +} + // Execpt for the NONE field, this must be kept in sync with the SISubtarget enum // in AMDGPUInstrInfo.cpp def SISubtarget { @@ -121,9 +130,20 @@ def SIsampled : SDSample<"AMDGPUISD::SAMPLED">; def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">; def SIconstdata_ptr : SDNode< - "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 0, [SDTCisVT<0, i64>]> + "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, i64>, + SDTCisVT<0, i64>]> >; +def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(cast<LoadSDNode>(N)) || + isConstantLoad(cast<LoadSDNode>(N), -1); +}]>; + +def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(cast<LoadSDNode>(N), -1) && + static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N); +}]>; + //===----------------------------------------------------------------------===// // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1 // to be glued to the memory instructions. 
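
The getNamedImmOperand helper and the LLVM_READONLY table lookups above are what the MUBUF legalization hunk earlier in this file leans on. A simplified sketch of that usage, assuming the generated AMDGPU opcode and operand-name tables (error handling and the instruction rebuild are omitted):

#include "SIInstrInfo.h"

using namespace llvm;

// Sketch: look up the _ADDR64 form of a buffer opcode, then read a required
// immediate operand by name instead of by fixed index.
static void queryAddr64Form(const SIInstrInfo &TII, const MachineInstr &MI) {
  int Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
  if (Addr64Opcode == -1)
    return; // this opcode has no ADDR64 variant

  int64_t SLC = TII.getNamedImmOperand(MI, AMDGPU::OpName::slc);
  (void)SLC; // would be re-added on the rebuilt ADDR64 instruction
}
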
@@ -328,9 +348,9 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{ static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end(); U != E; ++U) { - if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) { + const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo()); + if (RC && SIRI->isSGPRClass(RC)) return true; - } } return false; }]>; @@ -354,6 +374,8 @@ def sopp_brtarget : Operand<OtherVT> { let ParserMatchClass = SoppBrTarget; } +def const_ga : Operand<iPTR>; + include "SIInstrFormats.td" include "VIInstrFormats.td" @@ -393,7 +415,7 @@ def GDS01MatchClass : GDSBaseMatchClass <"parseDSOff01OptionalOps">; class GLCBaseMatchClass <string parser> : AsmOperandClass { let Name = "GLC"#parser; let PredicateMethod = "isImm"; - let ParserMethod = parser; + let ParserMethod = parser; let RenderMethod = "addImmOperands"; } @@ -436,6 +458,17 @@ def ClampMatchClass : AsmOperandClass { let RenderMethod = "addImmOperands"; } +class SMRDOffsetBaseMatchClass <string predicate> : AsmOperandClass { + let Name = "SMRDOffset"#predicate; + let PredicateMethod = predicate; + let RenderMethod = "addImmOperands"; +} + +def SMRDOffsetMatchClass : SMRDOffsetBaseMatchClass <"isSMRDOffset">; +def SMRDLiteralOffsetMatchClass : SMRDOffsetBaseMatchClass < + "isSMRDLiteralOffset" +>; + let OperandType = "OPERAND_IMMEDIATE" in { def offen : Operand<i1> { @@ -510,6 +543,16 @@ def ClampMod : Operand <i1> { let ParserMatchClass = ClampMatchClass; } +def smrd_offset : Operand <i32> { + let PrintMethod = "printU32ImmOperand"; + let ParserMatchClass = SMRDOffsetMatchClass; +} + +def smrd_literal_offset : Operand <i32> { + let PrintMethod = "printU32ImmOperand"; + let ParserMatchClass = SMRDLiteralOffsetMatchClass; +} + } // End OperandType = "OPERAND_IMMEDIATE" def VOPDstS64 : VOPDstOperand <SReg_64>; @@ -528,6 +571,13 @@ def MUBUFScratch : ComplexPattern<i64, 4, "SelectMUBUFScratch">; def MUBUFOffset : ComplexPattern<i64, 6, "SelectMUBUFOffset">; def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">; +def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; +def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; +def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">; +def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; +def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; +def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">; + def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">; def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">; @@ -717,19 +767,6 @@ class SOP2_Real_vi<sop2 op, string opName, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass SOP2_SELECT_32 <sop2 op, string opName, list<dag> pattern> { - def "" : SOP2_Pseudo <opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), pattern>; - - def _si : SOP2_Real_si <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), - opName#" $dst, $src0, $src1 [$scc]">; - - def _vi : SOP2_Real_vi <op, opName, (outs SReg_32:$dst), - (ins SSrc_32:$src0, SSrc_32:$src1, SCCReg:$scc), - opName#" $dst, $src0, $src1 [$scc]">; -} - multiclass SOP2_m <sop2 op, string opName, dag outs, dag ins, string asm, list<dag> pattern> { @@ -758,8 +795,10 @@ multiclass SOP2_64_32 <sop2 op, string opName, list<dag> pattern> : SOP2_m < class SOPC_Helper <bits<7> op, 
RegisterOperand rc, ValueType vt, string opName, PatLeaf cond> : SOPC < - op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1), - opName#" $src0, $src1", []>; + op, (outs), (ins rc:$src0, rc:$src1), + opName#" $src0, $src1", []> { + let Defs = [SCC]; +} class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL> : SOPC_Helper<op, SSrc_32, i32, opName, cond>; @@ -812,15 +851,20 @@ multiclass SOPK_32 <sopk op, string opName, list<dag> pattern> { } multiclass SOPK_SCC <sopk op, string opName, list<dag> pattern> { - def "" : SOPK_Pseudo <opName, (outs SCCReg:$dst), - (ins SReg_32:$src0, u16imm:$src1), pattern>; + def "" : SOPK_Pseudo <opName, (outs), + (ins SReg_32:$src0, u16imm:$src1), pattern> { + let Defs = [SCC]; + } + - let DisableEncoding = "$dst" in { - def _si : SOPK_Real_si <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; + def _si : SOPK_Real_si <op, opName, (outs), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { + let Defs = [SCC]; + } - def _vi : SOPK_Real_vi <op, opName, (outs SCCReg:$dst), - (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16">; + def _vi : SOPK_Real_vi <op, opName, (outs), + (ins SReg_32:$sdst, u16imm:$simm16), opName#" $sdst, $simm16"> { + let Defs = [SCC]; } } @@ -868,35 +912,68 @@ class SMRD_Real_si <bits<5> op, string opName, bit imm, dag outs, dag ins, } class SMRD_Real_vi <bits<8> op, string opName, bit imm, dag outs, dag ins, - string asm> : - SMRD <outs, ins, asm, []>, + string asm, list<dag> pattern = []> : + SMRD <outs, ins, asm, pattern>, SMEMe_vi <op, imm>, SIMCInstr<opName, SISubtarget.VI> { let AssemblerPredicates = [isVI]; } -multiclass SMRD_m <bits<5> op, string opName, bit imm, dag outs, dag ins, +multiclass SMRD_m <smrd op, string opName, bit imm, dag outs, dag ins, string asm, list<dag> pattern> { def "" : SMRD_Pseudo <opName, outs, ins, pattern>; - def _si : SMRD_Real_si <op, opName, imm, outs, ins, asm>; + def _si : SMRD_Real_si <op.SI, opName, imm, outs, ins, asm>; // glc is only applicable to scalar stores, which are not yet // implemented. 
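
A practical consequence of the SCC change above: with let Defs = [SCC] instead of an explicit SCCReg:$dst operand, C++ passes should query SCC as an implicit register definition. A hypothetical check, assuming the usual MachineInstr API:

#include "SIInstrInfo.h"

using namespace llvm;

// Hypothetical helper: SCC now appears among the implicit defs of SOPC/SOPK
// compares rather than as an explicit destination operand.
static bool clobbersSCC(const MachineInstr &MI) {
  return MI.definesRegister(AMDGPU::SCC);
}
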
let glc = 0 in { - def _vi : SMRD_Real_vi <{0, 0, 0, op}, opName, imm, outs, ins, asm>; + def _vi : SMRD_Real_vi <op.VI, opName, imm, outs, ins, asm>; } } -multiclass SMRD_Helper <bits<5> op, string opName, RegisterClass baseClass, +multiclass SMRD_Inval <smrd op, string opName, + SDPatternOperator node> { + let hasSideEffects = 1, mayStore = 1 in { + def "" : SMRD_Pseudo <opName, (outs), (ins), [(node)]>; + + let sbase = 0, offset = 0 in { + let sdst = 0 in { + def _si : SMRD_Real_si <op.SI, opName, 0, (outs), (ins), opName>; + } + + let glc = 0, sdata = 0 in { + def _vi : SMRD_Real_vi <op.VI, opName, 0, (outs), (ins), opName>; + } + } + } +} + +class SMEM_Inval <bits<8> op, string opName, SDPatternOperator node> : + SMRD_Real_vi<op, opName, 0, (outs), (ins), opName, [(node)]> { + let hasSideEffects = 1; + let mayStore = 1; + let sbase = 0; + let sdata = 0; + let glc = 0; + let offset = 0; +} + +multiclass SMRD_Helper <smrd op, string opName, RegisterClass baseClass, RegisterClass dstClass> { defm _IMM : SMRD_m < op, opName#"_IMM", 1, (outs dstClass:$dst), - (ins baseClass:$sbase, u32imm:$offset), + (ins baseClass:$sbase, smrd_offset:$offset), opName#" $dst, $sbase, $offset", [] >; + def _IMM_ci : SMRD < + (outs dstClass:$dst), (ins baseClass:$sbase, smrd_literal_offset:$offset), + opName#" $dst, $sbase, $offset", []>, SMRD_IMMe_ci <op.SI> { + let AssemblerPredicates = [isCIOnly]; + } + defm _SGPR : SMRD_m < op, opName#"_SGPR", 0, (outs dstClass:$dst), (ins baseClass:$sbase, SReg_32:$soff), @@ -922,11 +999,12 @@ def InputModsNoDefault : Operand <i32> { let ParserMatchClass = InputModsMatchClass; } -class getNumSrcArgs<ValueType Src1, ValueType Src2> { +class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> { int ret = - !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 + !if (!eq(Src0.Value, untyped.Value), 0, + !if (!eq(Src1.Value, untyped.Value), 1, // VOP1 !if (!eq(Src2.Value, untyped.Value), 2, // VOP2 - 3)); // VOP3 + 3))); // VOP3 } // Returns the register class to use for the destination of VOP[123C] @@ -934,28 +1012,37 @@ class getNumSrcArgs<ValueType Src1, ValueType Src2> { class getVALUDstForVT<ValueType VT> { RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>, !if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>, - VOPDstOperand<SReg_64>)); // else VT == i1 + !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>, + VOPDstOperand<SReg_64>))); // else VT == i1 } // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT<ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 32), VSrc_32, VSrc_64); + RegisterOperand ret = !if(!eq(VT.Size, 64), VSrc_64, VSrc_32); } // Returns the register class to use for source 1 of VOP[12C] for the // given VT. class getVOPSrc1ForVT<ValueType VT> { - RegisterClass ret = !if(!eq(VT.Size, 32), VGPR_32, VReg_64); + RegisterClass ret = !if(!eq(VT.Size, 64), VReg_64, VGPR_32); } // Returns the register class to use for sources of VOP3 instructions for the // given VT. class getVOP3SrcForVT<ValueType VT> { - RegisterOperand ret = !if(!eq(VT.Size, 32), VCSrc_32, VCSrc_64); + RegisterOperand ret = + !if(!eq(VT.Size, 64), + VCSrc_64, + !if(!eq(VT.Value, i1.Value), + SCSrc_64, + VCSrc_32 + ) + ); } // Returns 1 if the source arguments have modifiers, 0 if they do not. +// XXX - do f16 instructions? 
class hasModifiers<ValueType SrcVT> { bit ret = !if(!eq(SrcVT.Value, f32.Value), 1, !if(!eq(SrcVT.Value, f64.Value), 1, 0)); @@ -1009,17 +1096,20 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC, // Returns the assembly string for the inputs and outputs of a VOP[12C] // instruction. This does not add the _e32 suffix, so it can be reused // by getAsm64. -class getAsm32 <int NumSrcArgs> { +class getAsm32 <bit HasDst, int NumSrcArgs> { + string dst = "$dst"; + string src0 = ", $src0"; string src1 = ", $src1"; string src2 = ", $src2"; - string ret = "$dst, $src0"# - !if(!eq(NumSrcArgs, 1), "", src1)# - !if(!eq(NumSrcArgs, 3), src2, ""); + string ret = !if(HasDst, dst, "") # + !if(!eq(NumSrcArgs, 1), src0, "") # + !if(!eq(NumSrcArgs, 2), src0#src1, "") # + !if(!eq(NumSrcArgs, 3), src0#src1#src2, ""); } // Returns the assembly string for the inputs and outputs of a VOP3 // instruction. -class getAsm64 <int NumSrcArgs, bit HasModifiers> { +class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers> { string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); string src1 = !if(!eq(NumSrcArgs, 1), "", !if(!eq(NumSrcArgs, 2), " $src1_modifiers", @@ -1027,11 +1117,10 @@ class getAsm64 <int NumSrcArgs, bit HasModifiers> { string src2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); string ret = !if(!eq(HasModifiers, 0), - getAsm32<NumSrcArgs>.ret, + getAsm32<HasDst, NumSrcArgs>.ret, "$dst, "#src0#src1#src2#"$clamp"#"$omod"); } - class VOPProfile <list<ValueType> _ArgVT> { field list<ValueType> ArgVT = _ArgVT; @@ -1047,29 +1136,38 @@ class VOPProfile <list<ValueType> _ArgVT> { field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret; field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret; - field int NumSrcArgs = getNumSrcArgs<Src1VT, Src2VT>.ret; + field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); + field bit HasDst32 = HasDst; + field int NumSrcArgs = getNumSrcArgs<Src0VT, Src1VT, Src2VT>.ret; field bit HasModifiers = hasModifiers<Src0VT>.ret; - field dag Outs = (outs DstRC:$dst); + field dag Outs = !if(HasDst,(outs DstRC:$dst),(outs)); + + // VOP3b instructions are a special case with a second explicit + // output. This is manually overridden for them. + field dag Outs32 = Outs; + field dag Outs64 = Outs; field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret; field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs, HasModifiers>.ret; - field string Asm32 = getAsm32<NumSrcArgs>.ret; - field string Asm64 = getAsm64<NumSrcArgs, HasModifiers>.ret; + field string Asm32 = getAsm32<HasDst, NumSrcArgs>.ret; + field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers>.ret; } // FIXME: I think these F16/I16 profiles will need to use f16/i16 types in order // for the instruction patterns to work. 
-def VOP_F16_F16 : VOPProfile <[f32, f32, untyped, untyped]>; -def VOP_F16_I16 : VOPProfile <[f32, i32, untyped, untyped]>; -def VOP_I16_F16 : VOPProfile <[i32, f32, untyped, untyped]>; +def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; +def VOP_F16_I16 : VOPProfile <[f16, i32, untyped, untyped]>; +def VOP_I16_F16 : VOPProfile <[i32, f16, untyped, untyped]>; -def VOP_F16_F16_F16 : VOPProfile <[f32, f32, f32, untyped]>; -def VOP_F16_F16_I16 : VOPProfile <[f32, f32, i32, untyped]>; +def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; +def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i32, untyped]>; def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>; +def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; + def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; def VOP_F32_F64 : VOPProfile <[f32, f64, untyped, untyped]>; def VOP_F32_I32 : VOPProfile <[f32, i32, untyped, untyped]>; @@ -1087,25 +1185,76 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>; def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>; def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>; def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>; -def VOP_I32_I32_I32_VCC : VOPProfile <[i32, i32, i32, untyped]> { + +// Write out to vcc or arbitrary SGPR. +def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { + let Asm32 = "$dst, vcc, $src0, $src1"; + let Asm64 = "$dst, $sdst, $src0, $src1"; + let Outs32 = (outs DstRC:$dst); + let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); +} + +// Write out to vcc or arbitrary SGPR and read in from vcc or +// arbitrary SGPR. +def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { + // We use VCSrc_32 to exclude literal constants, even though the + // encoding normally allows them since the implicit VCC use means + // using one would always violate the constant bus + // restriction. SGPRs are still allowed because it should + // technically be possible to use VCC again as src0. let Src0RC32 = VCSrc_32; + let Asm32 = "$dst, vcc, $src0, $src1, vcc"; + let Asm64 = "$dst, $sdst, $src0, $src1, $src2"; + let Outs32 = (outs DstRC:$dst); + let Outs64 = (outs DstRC:$dst, SReg_64:$sdst); + + // Suppress src2 implied by type since the 32-bit encoding uses an + // implicit VCC use. + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); } -def VOP_I1_F32_I32 : VOPProfile <[i1, f32, i32, untyped]> { - let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); - let Asm64 = "$dst, $src0_modifiers, $src1"; +class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { + let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); + let Asm64 = "$vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod"; +} + +def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VGPR_32>; } -def VOP_I1_F64_I32 : VOPProfile <[i1, f64, i32, untyped]> { +def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> { + // FIXME: Hack to stop printing _e64 + let DstRC = RegisterOperand<VReg_64>; +} + +// VOPC instructions are a special case because for the 32-bit +// encoding, we want to display the implicit vcc write as if it were +// an explicit $dst. +class VOPC_Profile<ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[i1, vt0, vt1, untyped]> { + let Asm32 = "vcc, $src0, $src1"; + // The destination for 32-bit encoding is implicit. 
+ let HasDst32 = 0; +} + +class VOPC_Class_Profile<ValueType vt> : VOPC_Profile<vt, i32> { let Ins64 = (ins InputModsNoDefault:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1); let Asm64 = "$dst, $src0_modifiers, $src1"; } +def VOPC_I1_F32_F32 : VOPC_Profile<f32>; +def VOPC_I1_F64_F64 : VOPC_Profile<f64>; +def VOPC_I1_I32_I32 : VOPC_Profile<i32>; +def VOPC_I1_I64_I64 : VOPC_Profile<i64>; + +def VOPC_I1_F32_I32 : VOPC_Class_Profile<f32>; +def VOPC_I1_F64_I32 : VOPC_Class_Profile<f64>; + def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; def VOP_I64_I64_I64 : VOPProfile <[i64, i64, i64, untyped]>; def VOP_CNDMASK : VOPProfile <[i32, i32, i32, untyped]> { - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VCCReg:$src2); + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, SSrc_64:$src2); let Asm64 = "$dst, $src0, $src1, $src2"; } @@ -1119,13 +1268,60 @@ def VOP_MAC : VOPProfile <[f32, f32, f32, f32]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2); let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3, HasModifiers>.ret; - let Asm32 = getAsm32<2>.ret; - let Asm64 = getAsm64<2, HasModifiers>.ret; + let Asm32 = getAsm32<1, 2>.ret; + let Asm64 = getAsm64<1, 2, HasModifiers>.ret; } def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; +class SIInstAlias <string asm, Instruction inst, VOPProfile p> : + InstAlias <asm, (inst)>, PredicateControl { + + field bit isCompare; + field bit isCommutable; + + let ResultInst = + !if (p.HasDst32, + !if (!eq(p.NumSrcArgs, 0), + // 1 dst, 0 src + (inst p.DstRC:$dst), + !if (!eq(p.NumSrcArgs, 1), + // 1 dst, 1 src + (inst p.DstRC:$dst, p.Src0RC32:$src0), + !if (!eq(p.NumSrcArgs, 2), + // 1 dst, 2 src + (inst p.DstRC:$dst, p.Src0RC32:$src0, p.Src1RC32:$src1), + // else - unreachable + (inst)))), + // else + !if (!eq(p.NumSrcArgs, 2), + // 0 dst, 2 src + (inst p.Src0RC32:$src0, p.Src1RC32:$src1), + !if (!eq(p.NumSrcArgs, 1), + // 0 dst, 1 src + (inst p.Src0RC32:$src1), + // else + // 0 dst, 0 src + (inst)))); +} + +class SIInstAliasSI <string asm, string op_name, VOPProfile p> : + SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_si"), p> { + let AssemblerPredicate = SIAssemblerPredicate; +} + +class SIInstAliasVI <string asm, string op_name, VOPProfile p> : + SIInstAlias <asm, !cast<Instruction>(op_name#"_e32_vi"), p> { + let AssemblerPredicates = [isVI]; +} + +multiclass SIInstAliasBuilder <string asm, VOPProfile p> { + + def : SIInstAliasSI <asm, NAME, p>; + + def : SIInstAliasVI <asm, NAME, p>; +} class VOP <string opName> { string OpName = opName; @@ -1165,20 +1361,22 @@ class VOP1_Real_vi <string opName, vop1 op, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass VOP1_m <vop1 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName> { - def "" : VOP1_Pseudo <outs, ins, pattern, opName>; +multiclass VOP1_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, + string asm = opName#p.Asm32> { + def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; - def _si : VOP1_Real_si <opName, op, outs, ins, asm>; + def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; + + def _vi : VOP1_Real_vi <opName, op, p.Outs, p.Ins32, asm>; - def _vi : VOP1_Real_vi <opName, op, outs, ins, asm>; } -multiclass VOP1SI_m <vop1 op, dag outs, dag ins, string 
asm, list<dag> pattern, - string opName> { - def "" : VOP1_Pseudo <outs, ins, pattern, opName>; +multiclass VOP1SI_m <vop1 op, string opName, VOPProfile p, list<dag> pattern, + string asm = opName#p.Asm32> { + + def "" : VOP1_Pseudo <p.Outs, p.Ins32, pattern, opName>; - def _si : VOP1_Real_si <opName, op, outs, ins, asm>; + def _si : VOP1_Real_si <opName, op, p.Outs, p.Ins32, asm>; } class VOP2_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : @@ -1202,22 +1400,24 @@ class VOP2_Real_vi <string opName, vop2 op, dag outs, dag ins, string asm> : let AssemblerPredicates = [isVI]; } -multiclass VOP2SI_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo <outs, ins, pattern, opName>, +multiclass VOP2SI_m <vop2 op, string opName, VOPProfile p, list<dag> pattern, + string revOp> { + + def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2_Real_si <opName, op, outs, ins, asm>; + def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; } -multiclass VOP2_m <vop2 op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, string revOp> { - def "" : VOP2_Pseudo <outs, ins, pattern, opName>, +multiclass VOP2_m <vop2 op, string opName, VOPProfile p, list <dag> pattern, + string revOp> { + + def "" : VOP2_Pseudo <p.Outs32, p.Ins32, pattern, opName>, VOP2_REV<revOp#"_e32", !eq(revOp, opName)>; - def _si : VOP2_Real_si <opName, op, outs, ins, asm>; + def _si : VOP2_Real_si <opName, op, p.Outs32, p.Ins32, p.Asm32>; - def _vi : VOP2_Real_vi <opName, op, outs, ins, asm>; + def _vi : VOP2_Real_vi <opName, op, p.Outs32, p.Ins32, p.Asm32>; } @@ -1250,6 +1450,9 @@ class VOP3_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : MnemonicAlias<opName#"_e64", opName> { let isPseudo = 1; let isCodeGenOnly = 1; + + field bit vdst; + field bit src0; } class VOP3_Real_si <bits<9> op, dag outs, dag ins, string asm, string opName> : @@ -1295,22 +1498,6 @@ multiclass VOP3_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, HasMods>; } -// VOP3_m without source modifiers -multiclass VOP3_m_nomods <vop op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, int NumSrcArgs, bit HasMods = 1> { - - def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - - let src0_modifiers = 0, - src1_modifiers = 0, - src2_modifiers = 0, - clamp = 0, - omod = 0 in { - def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>; - def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>; - } -} - multiclass VOP3_1_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, bit HasMods = 1> { @@ -1335,7 +1522,7 @@ multiclass VOP3SI_1_m <vop op, dag outs, dag ins, string asm, multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { + bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; @@ -1349,7 +1536,7 @@ multiclass VOP3_2_m <vop op, dag outs, dag ins, string asm, multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { + bit HasMods = 1> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; @@ -1360,54 +1547,41 @@ multiclass VOP3SI_2_m <vop op, dag outs, dag ins, string asm, // No VI instruction. This class is for SI only. 
} -// XXX - Is v_div_scale_{f32|f64} only available in vop3b without -// option of implicit vcc use? -multiclass VOP3b_2_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { - def "" : VOP3_Pseudo <outs, ins, pattern, opName>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; - - // The VOP2 variant puts the carry out into VCC, the VOP3 variant - // can write it into any SGPR. We currently don't use the carry out, - // so for now hardcode it to VCC as well. - let sdst = SIOperand.VCC, Defs = [VCC] in { - def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 0, HasMods>; - - def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 0, HasMods>; - } // End sdst = SIOperand.VCC, Defs = [VCC] -} - -multiclass VOP3b_3_m <vop op, dag outs, dag ins, string asm, - list<dag> pattern, string opName, string revOp, - bit HasMods = 1, bit UseFullOp = 0> { +// Two operand VOP3b instruction that may have a 3rd SGPR bool operand +// instead of an implicit VCC as in the VOP2b format. +multiclass VOP3b_2_3_m <vop op, dag outs, dag ins, string asm, + list<dag> pattern, string opName, string revOp, + bit HasMods = 1, bit useSrc2Input = 0> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>; - def _si : VOP3b_Real_si <op.SI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 1, HasMods>; + VOP3DisableFields<1, useSrc2Input, HasMods>; def _vi : VOP3b_Real_vi <op.VI3, outs, ins, asm, opName>, - VOP3DisableFields<1, 1, HasMods>; + VOP3DisableFields<1, useSrc2Input, HasMods>; } multiclass VOP3_C_m <vop op, dag outs, dag ins, string asm, list<dag> pattern, string opName, - bit HasMods, bit defExec, string revOp> { + bit HasMods, bit defExec, + string revOp, list<SchedReadWrite> sched> { def "" : VOP3_Pseudo <outs, ins, pattern, opName>, - VOP2_REV<revOp#"_e64", !eq(revOp, opName)>; + VOP2_REV<revOp#"_e64", !eq(revOp, opName)> { + let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; + } def _si : VOP3_Real_si <op.SI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } def _vi : VOP3_Real_vi <op.VI3, outs, ins, asm, opName>, VOP3DisableFields<1, 0, HasMods> { let Defs = !if(defExec, [EXEC], []); + let SchedRW = sched; } } @@ -1432,32 +1606,28 @@ multiclass VOP2SI_3VI_m <vop3 op, string opName, dag outs, dag ins, } } -multiclass VOP1_Helper <vop1 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - bit HasMods> { +multiclass VOP1_Helper <vop1 op, string opName, VOPProfile p, list<dag> pat32, + list<dag> pat64> { - defm _e32 : VOP1_m <op, outs, ins32, opName#asm32, pat32, opName>; + defm _e32 : VOP1_m <op, opName, p, pat32>; - defm _e64 : VOP3_1_m <op, outs, ins64, opName#asm64, pat64, opName, HasMods>; + defm _e64 : VOP3_1_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + p.HasModifiers>; } multiclass VOP1Inst <vop1 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP1_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))], - [(set P.DstVT:$dst, (node P.Src0VT:$src0))]), - P.HasModifiers + [(set P.DstVT:$dst, (node P.Src0VT:$src0))]) >; multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> { - defm _e32 : VOP1SI_m 
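
Concretely, with the VOP2b profiles defined earlier in this file, the carry output is explicit in both encodings, and the useSGPRInput forms also read a carry-in (operands below are illustrative):

// v_add_i32_e32  v0, vcc, v1, v2       ; e32: carry-out is forced into VCC
// v_add_i32_e64  v0, s[0:1], v1, v2    ; e64: carry-out may target any SGPR pair
//
// For the carry-in variants (useSGPRInput = 1) the e32 encoding implicitly
// reads VCC as well, which is why the multiclass adds VCC to Uses there.
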
<op, P.Outs, P.Ins32, opName#P.Asm32, [], opName>; + defm _e32 : VOP1SI_m <op, opName, P, []>; defm _e64 : VOP3SI_1_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, @@ -1467,36 +1637,33 @@ multiclass VOP1InstSI <vop1 op, string opName, VOPProfile P, opName, P.HasModifiers>; } -multiclass VOP2_Helper <vop2 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; +multiclass VOP2_Helper <vop2 op, string opName, VOPProfile p, list<dag> pat32, + list<dag> pat64, string revOp> { - defm _e64 : VOP3_2_m <op, - outs, ins64, opName#asm64, pat64, opName, revOp, HasMods - >; + defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; + + defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + revOp, p.HasModifiers>; } multiclass VOP2Inst <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp >; multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> { - defm _e32 : VOP2SI_m <op, P.Outs, P.Ins32, P.Asm32, [], opName, revOp>; + + defm _e32 : VOP2SI_m <op, opName, P, [], revOp>; defm _e64 : VOP3SI_2_m <op, P.Outs, P.Ins64, opName#P.Asm64, !if(P.HasModifiers, @@ -1508,58 +1675,55 @@ multiclass VOP2InstSI <vop2 op, string opName, VOPProfile P, opName, revOp, P.HasModifiers>; } -multiclass VOP2b_Helper <vop2 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { +multiclass VOP2b_Helper <vop2 op, string opName, VOPProfile p, + list<dag> pat32, list<dag> pat64, + string revOp, bit useSGPRInput> { - defm _e32 : VOP2_m <op, outs, ins32, asm32, pat32, opName, revOp>; + let SchedRW = [Write32Bit, WriteSALU] in { + let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in { + defm _e32 : VOP2_m <op, opName, p, pat32, revOp>; + } - defm _e64 : VOP3b_2_m <op, - outs, ins64, opName#asm64, pat64, opName, revOp, HasMods - >; + defm _e64 : VOP3b_2_3_m <op, p.Outs64, p.Ins64, opName#p.Asm64, pat64, + opName, revOp, p.HasModifiers, useSGPRInput>; + } } multiclass VOP2bInst <vop2 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2b_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp, !eq(P.NumSrcArgs, 3) >; // A VOP2 instruction that is VOP3-only on VI. 
-multiclass VOP2_VI3_Helper <vop23 op, string opName, dag outs, - dag ins32, string asm32, list<dag> pat32, - dag ins64, string asm64, list<dag> pat64, - string revOp, bit HasMods> { - defm _e32 : VOP2SI_m <op, outs, ins32, asm32, pat32, opName, revOp>; +multiclass VOP2_VI3_Helper <vop23 op, string opName, VOPProfile p, + list<dag> pat32, list<dag> pat64, string revOp> { - defm _e64 : VOP3_2_m <op, outs, ins64, opName#asm64, pat64, opName, - revOp, HasMods>; + defm _e32 : VOP2SI_m <op, opName, p, pat32, revOp>; + + defm _e64 : VOP3_2_m <op, p.Outs, p.Ins64, opName#p.Asm64, pat64, opName, + revOp, p.HasModifiers>; } multiclass VOP2_VI3_Inst <vop23 op, string opName, VOPProfile P, SDPatternOperator node = null_frag, string revOp = opName> : VOP2_VI3_Helper < - op, opName, P.Outs, - P.Ins32, P.Asm32, [], - P.Ins64, P.Asm64, + op, opName, P, [], !if(P.HasModifiers, [(set P.DstVT:$dst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$dst, (node P.Src0VT:$src0, P.Src1VT:$src1))]), - revOp, P.HasModifiers + revOp >; multiclass VOP2MADK <vop2 op, string opName, list<dag> pattern = []> { @@ -1583,64 +1747,75 @@ let isCodeGenOnly = 0 in { } // End isCodeGenOnly = 0 } -class VOPC_Pseudo <dag outs, dag ins, list<dag> pattern, string opName> : +class VOPC_Pseudo <dag ins, list<dag> pattern, string opName> : VOPCCommon <ins, "", pattern>, VOP <opName>, - SIMCInstr<opName#"_e32", SISubtarget.NONE>, - MnemonicAlias<opName#"_e32", opName> { + SIMCInstr<opName#"_e32", SISubtarget.NONE> { let isPseudo = 1; let isCodeGenOnly = 1; } -multiclass VOPC_m <vopc op, dag outs, dag ins, string asm, list<dag> pattern, - string opName, bit DefExec, string revOpName = ""> { - def "" : VOPC_Pseudo <outs, ins, pattern, opName>; - - def _si : VOPC<op.SI, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.SI> { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - let AssemblerPredicates = [isSICI]; +multiclass VOPC_m <vopc op, dag ins, string op_asm, list<dag> pattern, + string opName, bit DefExec, VOPProfile p, + list<SchedReadWrite> sched, + string revOpName = "", string asm = opName#"_e32 "#op_asm, + string alias_asm = opName#" "#op_asm> { + def "" : VOPC_Pseudo <ins, pattern, opName> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = sched; } - def _vi : VOPC<op.VI, ins, asm, []>, - SIMCInstr <opName#"_e32", SISubtarget.VI> { - let Defs = !if(DefExec, [EXEC], []); - let hasSideEffects = DefExec; - let AssemblerPredicates = [isVI]; - } + let AssemblerPredicates = [isSICI] in { + def _si : VOPC<op.SI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.SI> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } + + } // End AssemblerPredicates = [isSICI] + + let AssemblerPredicates = [isVI] in { + def _vi : VOPC<op.VI, ins, asm, []>, + SIMCInstr <opName#"_e32", SISubtarget.VI> { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let hasSideEffects = DefExec; + let SchedRW = sched; + } + + } // End AssemblerPredicates = [isVI] + + defm : SIInstAliasBuilder<alias_asm, p>; } -multiclass VOPC_Helper <vopc op, string opName, - dag ins32, string asm32, list<dag> pat32, - dag out64, dag ins64, string asm64, list<dag> pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; +multiclass VOPC_Helper <vopc op, string opName, list<dag> pat32, + list<dag> 
pat64, bit DefExec, string revOp, + VOPProfile p, list<SchedReadWrite> sched> { + defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; - defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, - opName, HasMods, DefExec, revOp>; + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + opName, p.HasModifiers, DefExec, revOp, sched>; } // Special case for class instructions which only have modifiers on // the 1st source operand. -multiclass VOPC_Class_Helper <vopc op, string opName, - dag ins32, string asm32, list<dag> pat32, - dag out64, dag ins64, string asm64, list<dag> pat64, - bit HasMods, bit DefExec, string revOp> { - defm _e32 : VOPC_m <op, (outs), ins32, opName#asm32, pat32, opName, DefExec>; - - defm _e64 : VOP3_C_m <op, out64, ins64, opName#asm64, pat64, - opName, HasMods, DefExec, revOp>, +multiclass VOPC_Class_Helper <vopc op, string opName, list<dag> pat32, + list<dag> pat64, bit DefExec, string revOp, + VOPProfile p, list<SchedReadWrite> sched> { + defm _e32 : VOPC_m <op, p.Ins32, p.Asm32, pat32, opName, DefExec, p, sched>; + + defm _e64 : VOP3_C_m <op, (outs VOPDstS64:$dst), p.Ins64, opName#p.Asm64, pat64, + opName, p.HasModifiers, DefExec, revOp, sched>, VOP3DisableModFields<1, 0, 0>; } multiclass VOPCInst <vopc op, string opName, VOPProfile P, PatLeaf cond = COND_NULL, string revOp = opName, - bit DefExec = 0> : VOPC_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + bit DefExec = 0, + list<SchedReadWrite> sched = [Write32Bit]> : + VOPC_Helper < + op, opName, [], !if(P.HasModifiers, [(set i1:$dst, (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, @@ -1648,51 +1823,51 @@ multiclass VOPCInst <vopc op, string opName, (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), cond))], [(set i1:$dst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]), - P.HasModifiers, DefExec, revOp + DefExec, revOp, P, sched >; multiclass VOPCClassInst <vopc op, string opName, VOPProfile P, - bit DefExec = 0> : VOPC_Class_Helper < - op, opName, - P.Ins32, P.Asm32, [], - (outs VOPDstS64:$dst), P.Ins64, P.Asm64, + bit DefExec = 0, + list<SchedReadWrite> sched> : VOPC_Class_Helper < + op, opName, [], !if(P.HasModifiers, [(set i1:$dst, (AMDGPUfp_class (P.Src0VT (VOP3Mods0Clamp0OMod P.Src0VT:$src0, i32:$src0_modifiers)), P.Src1VT:$src1))], [(set i1:$dst, (AMDGPUfp_class P.Src0VT:$src0, P.Src1VT:$src1))]), - P.HasModifiers, DefExec, opName + DefExec, opName, P, sched >; multiclass VOPC_F32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_F32_F32_F32, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_F32_F32, cond, revOp>; multiclass VOPC_F64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_F64_F64_F64, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_F64_F64, cond, revOp, 0, [WriteDoubleAdd]>; multiclass VOPC_I32 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_I32_I32_I32, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_I32_I32, cond, revOp>; multiclass VOPC_I64 <vopc op, string opName, PatLeaf cond = COND_NULL, string revOp = opName> : - VOPCInst <op, opName, VOP_I64_I64_I64, cond, revOp>; + VOPCInst <op, opName, VOPC_I1_I64_I64, cond, revOp, 0, [Write64Bit]>; multiclass VOPCX <vopc op, string opName, VOPProfile P, PatLeaf cond = COND_NULL, + list<SchedReadWrite> sched, string revOp = ""> - : VOPCInst <op, opName, P, cond, revOp, 1>; + 
: VOPCInst <op, opName, P, cond, revOp, 1, sched>; multiclass VOPCX_F32 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_F32_F32_F32, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_F32_F32, COND_NULL, [Write32Bit], revOp>; multiclass VOPCX_F64 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_F64_F64_F64, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_F64_F64, COND_NULL, [WriteDoubleAdd], revOp>; multiclass VOPCX_I32 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_I32_I32_I32, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_I32_I32, COND_NULL, [Write32Bit], revOp>; multiclass VOPCX_I64 <vopc op, string opName, string revOp = opName> : - VOPCX <op, opName, VOP_I64_I64_I64, COND_NULL, revOp>; + VOPCX <op, opName, VOPC_I1_I64_I64, COND_NULL, [Write64Bit], revOp>; multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, list<dag> pat, int NumSrcArgs, bit HasMods> : VOP3_m < @@ -1700,16 +1875,16 @@ multiclass VOP3_Helper <vop3 op, string opName, dag outs, dag ins, string asm, >; multiclass VOPC_CLASS_F32 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F32_I32, 0>; + VOPCClassInst <op, opName, VOPC_I1_F32_I32, 0, [Write32Bit]>; multiclass VOPCX_CLASS_F32 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F32_I32, 1>; + VOPCClassInst <op, opName, VOPC_I1_F32_I32, 1, [Write32Bit]>; multiclass VOPC_CLASS_F64 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F64_I32, 0>; + VOPCClassInst <op, opName, VOPC_I1_F64_I32, 0, [WriteDoubleAdd]>; multiclass VOPCX_CLASS_F64 <vopc op, string opName> : - VOPCClassInst <op, opName, VOP_I1_F64_I32, 1>; + VOPCClassInst <op, opName, VOPC_I1_F64_I32, 1, [WriteDoubleAdd]>; multiclass VOP3Inst <vop3 op, string opName, VOPProfile P, SDPatternOperator node = null_frag> : VOP3_Helper < @@ -1761,25 +1936,13 @@ multiclass VOP3_VCC_Inst <vop3 op, string opName, 3, 1 >; -multiclass VOP3b_Helper <vop op, RegisterClass vrc, RegisterOperand arc, - string opName, list<dag> pattern> : - VOP3b_3_m < - op, (outs vrc:$vdst, SReg_64:$sdst), - (ins InputModsNoDefault:$src0_modifiers, arc:$src0, - InputModsNoDefault:$src1_modifiers, arc:$src1, - InputModsNoDefault:$src2_modifiers, arc:$src2, - ClampMod:$clamp, omod:$omod), - opName#" $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers"#"$clamp"#"$omod", pattern, - opName, opName, 1, 1 +multiclass VOP3bInst <vop op, string opName, VOPProfile P, list<dag> pattern = []> : + VOP3b_2_3_m < + op, P.Outs64, P.Ins64, + opName#" "#P.Asm64, pattern, + opName, "", 1, 1 >; -multiclass VOP3b_64 <vop3 op, string opName, list<dag> pattern> : - VOP3b_Helper <op, VReg_64, VSrc_64, opName, pattern>; - -multiclass VOP3b_32 <vop3 op, string opName, list<dag> pattern> : - VOP3b_Helper <op, VGPR_32, VSrc_32, opName, pattern>; - - class Vop3ModPat<Instruction Inst, VOPProfile P, SDPatternOperator node> : Pat< (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), @@ -1925,12 +2088,14 @@ multiclass DS_1A1D_RET <bits<8> op, string opName, RegisterClass rc, dag ins = (ins VGPR_32:$addr, rc:$data0, ds_offset:$offset, gds:$gds), string asm = opName#" $vdst, $addr, $data0"#"$offset$gds"> { - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<noRetOp, 1>; + let hasPostISelHook = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; - let data1 = 0 in { - def _si : DS_Off16_Real_si <op, 
opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + let data1 = 0 in { + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } } } @@ -1939,11 +2104,13 @@ multiclass DS_1A2D_RET_m <bits<8> op, string opName, RegisterClass rc, dag outs = (outs rc:$vdst), string asm = opName#" $vdst, $addr, $data0, $data1"#"$offset"#"$gds"> { - def "" : DS_Pseudo <opName, outs, ins, []>, - AtomicNoRet<noRetOp, 1>; + let hasPostISelHook = 1 in { + def "" : DS_Pseudo <opName, outs, ins, []>, + AtomicNoRet<noRetOp, 1>; - def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; - def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + def _si : DS_Off16_Real_si <op, opName, outs, ins, asm>; + def _vi : DS_Off16_Real_vi <op, opName, outs, ins, asm>; + } } multiclass DS_1A2D_RET <bits<8> op, string asm, RegisterClass rc, @@ -2214,7 +2381,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, defm _ADDR64 : MUBUFAtomicAddr64_m < op, name#"_addr64", (outs), - (ins rc:$vdata, SReg_128:$srsrc, VReg_64:$vaddr, + (ins rc:$vdata, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#"$slc", [], 0 >; @@ -2233,7 +2400,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, defm _RTN_ADDR64 : MUBUFAtomicAddr64_m < op, name#"_rtn_addr64", (outs rc:$vdata), - (ins rc:$vdata_in, SReg_128:$srsrc, VReg_64:$vaddr, + (ins rc:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), name#" $vdata, $vaddr, $srsrc, $soffset addr64"#"$offset"#" glc"#"$slc", [(set vt:$vdata, @@ -2245,7 +2412,7 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, op, name#"_rtn_offset", (outs rc:$vdata), (ins rc:$vdata_in, SReg_128:$srsrc, SCSrc_32:$soffset, mbuf_offset:$offset, slc:$slc), - name#" $vdata, $srsrc, $soffset"#"$offset"#" glc $slc", + name#" $vdata, $srsrc, $soffset"#"$offset"#" glc$slc", [(set vt:$vdata, (atomic (MUBUFOffsetAtomic v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$slc), vt:$vdata_in))], 1 @@ -2256,6 +2423,8 @@ multiclass MUBUF_Atomic <mubuf op, string name, RegisterClass rc, } // mayStore = 1, mayLoad = 1, hasPostISelHook = 1 } +// FIXME: tfe can't be an operand because it requires a separate +// opcode because it needs an N+1 register class dest register. multiclass MUBUF_Load_Helper <mubuf op, string name, RegisterClass regClass, ValueType load_vt = i32, SDPatternOperator ld = null_frag> { @@ -2368,47 +2537,121 @@ multiclass MUBUF_Store_Helper <mubuf op, string name, RegisterClass vdataClass, } // End mayLoad = 0, mayStore = 1 } -class FLAT_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : - FLAT <op, (outs regClass:$vdst), - (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), - asm#" $vdst, $addr"#"$glc"#"$slc"#"$tfe", []> { - let data = 0; - let mayLoad = 1; +// For cache invalidation instructions. +multiclass MUBUF_Invalidate <mubuf op, string opName, SDPatternOperator node> { + let hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" in { + def "" : MUBUF_Pseudo <opName, (outs), (ins), [(node)]>; + + // Set everything to 0. 
+ let offset = 0, offen = 0, idxen = 0, glc = 0, vaddr = 0, + vdata = 0, srsrc = 0, slc = 0, tfe = 0, soffset = 0 in { + let addr64 = 0 in { + def _si : MUBUF_Real_si <op, opName, (outs), (ins), opName>; + } + + def _vi : MUBUF_Real_vi <op, opName, (outs), (ins), opName>; + } + } // End hasSideEffects = 1, mayStore = 1, AsmMatchConverter = "" } -class FLAT_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> : - FLAT <op, (outs), (ins vdataClass:$data, VReg_64:$addr, - glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), - name#" $data, $addr"#"$glc"#"$slc"#"$tfe", - []> { +//===----------------------------------------------------------------------===// +// FLAT classes +//===----------------------------------------------------------------------===// + +class flat <bits<7> ci, bits<7> vi = ci> { + field bits<7> CI = ci; + field bits<7> VI = vi; +} - let mayLoad = 0; - let mayStore = 1; +class FLAT_Pseudo <string opName, dag outs, dag ins, list<dag> pattern> : + FLAT <0, outs, ins, "", pattern>, + SIMCInstr<opName, SISubtarget.NONE> { + let isPseudo = 1; + let isCodeGenOnly = 1; +} - // Encoding - let vdst = 0; +class FLAT_Real_ci <bits<7> op, string opName, dag outs, dag ins, string asm> : + FLAT <op, outs, ins, asm, []>, + SIMCInstr<opName, SISubtarget.SI> { + let AssemblerPredicate = isCIOnly; } -multiclass FLAT_ATOMIC <bits<7> op, string name, RegisterClass vdst_rc, - RegisterClass data_rc = vdst_rc> { +class FLAT_Real_vi <bits<7> op, string opName, dag outs, dag ins, string asm> : + FLAT <op, outs, ins, asm, []>, + SIMCInstr<opName, SISubtarget.VI> { + let AssemblerPredicate = VIAssemblerPredicate; +} - let mayLoad = 1, mayStore = 1 in { - def "" : FLAT <op, (outs), - (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, - tfe_flat_atomic:$tfe), - name#" $addr, $data"#"$slc"#"$tfe", []>, - AtomicNoRet <NAME, 0> { - let glc = 0; - let vdst = 0; - } +multiclass FLAT_AtomicRet_m <flat op, dag outs, dag ins, string asm, + list<dag> pattern> { + def "" : FLAT_Pseudo <NAME#"_RTN", outs, ins, pattern>, + AtomicNoRet <NAME, 1>; - def _RTN : FLAT <op, (outs vdst_rc:$vdst), - (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, - tfe_flat_atomic:$tfe), - name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>, - AtomicNoRet <NAME, 1> { - let glc = 1; - } + def _ci : FLAT_Real_ci <op.CI, NAME#"_RTN", outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME#"_RTN", outs, ins, asm>; +} + +multiclass FLAT_Load_Helper <flat op, string asm_name, + RegisterClass regClass, + dag outs = (outs regClass:$vdst), + dag ins = (ins VReg_64:$addr, glc_flat:$glc, slc_flat:$slc, tfe_flat:$tfe), + string asm = asm_name#" $vdst, $addr"#"$glc"#"$slc"#"$tfe"> { + + let data = 0, mayLoad = 1 in { + + def "" : FLAT_Pseudo <NAME, outs, ins, []>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; + } +} + +multiclass FLAT_Store_Helper <flat op, string asm_name, + RegisterClass vdataClass, + dag outs = (outs), + dag ins = (ins vdataClass:$data, VReg_64:$addr, glc_flat:$glc, + slc_flat:$slc, tfe_flat:$tfe), + string asm = asm_name#" $data, $addr"#"$glc"#"$slc"#"$tfe"> { + + let mayLoad = 0, mayStore = 1, vdst = 0 in { + + def "" : FLAT_Pseudo <NAME, outs, ins, []>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs, ins, asm>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs, ins, asm>; + } +} + +multiclass FLAT_ATOMIC <flat op, string asm_name, RegisterClass vdst_rc, + RegisterClass data_rc = vdst_rc, + dag outs_noret = (outs), + string asm_noret = asm_name#" 
$addr, $data"#"$slc"#"$tfe"> { + + let mayLoad = 1, mayStore = 1, glc = 0, vdst = 0 in { + def "" : FLAT_Pseudo <NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), []>, + AtomicNoRet <NAME, 0>; + + def _ci : FLAT_Real_ci <op.CI, NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + asm_noret>; + + def _vi : FLAT_Real_vi <op.VI, NAME, outs_noret, + (ins VReg_64:$addr, data_rc:$data, + slc_flat_atomic:$slc, tfe_flat_atomic:$tfe), + asm_noret>; + } + + let glc = 1, hasPostISelHook = 1 in { + defm _RTN : FLAT_AtomicRet_m <op, (outs vdst_rc:$vdst), + (ins VReg_64:$addr, data_rc:$data, slc_flat_atomic:$slc, + tfe_flat_atomic:$tfe), + asm_name#" $vdst, $addr, $data glc"#"$slc"#"$tfe", []>; } } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td index e0eeea9..6f653c7 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -30,7 +30,9 @@ def isGCN : Predicate<"Subtarget->getGeneration() " ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">, AssemblerPredicate<"FeatureGCN">; def isSI : Predicate<"Subtarget->getGeneration() " - "== AMDGPUSubtarget::SOUTHERN_ISLANDS">; + "== AMDGPUSubtarget::SOUTHERN_ISLANDS">, + AssemblerPredicate<"FeatureSouthernIslands">; + def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">; def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">; @@ -62,36 +64,38 @@ let mayLoad = 1 in { // We are using the SGPR_32 and not the SReg_32 register class for 32-bit // SMRD instructions, because the SGPR_32 register class does not include M0 // and writing to M0 from an SMRD instruction will hang the GPU. -defm S_LOAD_DWORD : SMRD_Helper <0x00, "s_load_dword", SReg_64, SGPR_32>; -defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "s_load_dwordx2", SReg_64, SReg_64>; -defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "s_load_dwordx4", SReg_64, SReg_128>; -defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "s_load_dwordx8", SReg_64, SReg_256>; -defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "s_load_dwordx16", SReg_64, SReg_512>; +defm S_LOAD_DWORD : SMRD_Helper <smrd<0x00>, "s_load_dword", SReg_64, SGPR_32>; +defm S_LOAD_DWORDX2 : SMRD_Helper <smrd<0x01>, "s_load_dwordx2", SReg_64, SReg_64>; +defm S_LOAD_DWORDX4 : SMRD_Helper <smrd<0x02>, "s_load_dwordx4", SReg_64, SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <smrd<0x03>, "s_load_dwordx8", SReg_64, SReg_256>; +defm S_LOAD_DWORDX16 : SMRD_Helper <smrd<0x04>, "s_load_dwordx16", SReg_64, SReg_512>; defm S_BUFFER_LOAD_DWORD : SMRD_Helper < - 0x08, "s_buffer_load_dword", SReg_128, SGPR_32 + smrd<0x08>, "s_buffer_load_dword", SReg_128, SGPR_32 >; defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper < - 0x09, "s_buffer_load_dwordx2", SReg_128, SReg_64 + smrd<0x09>, "s_buffer_load_dwordx2", SReg_128, SReg_64 >; defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper < - 0x0a, "s_buffer_load_dwordx4", SReg_128, SReg_128 + smrd<0x0a>, "s_buffer_load_dwordx4", SReg_128, SReg_128 >; defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper < - 0x0b, "s_buffer_load_dwordx8", SReg_128, SReg_256 + smrd<0x0b>, "s_buffer_load_dwordx8", SReg_128, SReg_256 >; defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper < - 0x0c, "s_buffer_load_dwordx16", SReg_128, SReg_512 + smrd<0x0c>, "s_buffer_load_dwordx16", SReg_128, SReg_512 >; } // mayLoad = 1 //def S_MEMTIME : SMRD_ <0x0000001e, "s_memtime", []>; -//def S_DCACHE_INV : SMRD_ <0x0000001f, "s_dcache_inv", []>; + +defm S_DCACHE_INV : SMRD_Inval <smrd<0x1f, 
0x20>, "s_dcache_inv", + int_amdgcn_s_dcache_inv>; //===----------------------------------------------------------------------===// // SOP1 Instructions @@ -123,7 +127,7 @@ let Defs = [SCC] in { defm S_BREV_B32 : SOP1_32 <sop1<0x0b, 0x08>, "s_brev_b32", - [(set i32:$dst, (AMDGPUbrev i32:$src0))] + [(set i32:$dst, (bitreverse i32:$src0))] >; defm S_BREV_B64 : SOP1_64 <sop1<0x0c, 0x09>, "s_brev_b64", []>; @@ -183,10 +187,14 @@ defm S_XNOR_SAVEEXEC_B64 : SOP1_64 <sop1<0x2b, 0x27>, "s_xnor_saveexec_b64", []> defm S_QUADMASK_B32 : SOP1_32 <sop1<0x2c, 0x28>, "s_quadmask_b32", []>; defm S_QUADMASK_B64 : SOP1_64 <sop1<0x2d, 0x29>, "s_quadmask_b64", []>; + +let Uses = [M0] in { defm S_MOVRELS_B32 : SOP1_32 <sop1<0x2e, 0x2a>, "s_movrels_b32", []>; defm S_MOVRELS_B64 : SOP1_64 <sop1<0x2f, 0x2b>, "s_movrels_b64", []>; defm S_MOVRELD_B32 : SOP1_32 <sop1<0x30, 0x2c>, "s_movreld_b32", []>; defm S_MOVRELD_B64 : SOP1_64 <sop1<0x31, 0x2d>, "s_movreld_b64", []>; +} // End Uses = [M0] + defm S_CBRANCH_JOIN : SOP1_1 <sop1<0x32, 0x2e>, "s_cbranch_join", []>; defm S_MOV_REGRD_B32 : SOP1_32 <sop1<0x33, 0x2f>, "s_mov_regrd_b32", []>; let Defs = [SCC] in { @@ -354,7 +362,7 @@ def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "s_cmp_le_u32">; // SOPK Instructions //===----------------------------------------------------------------------===// -let isReMaterializable = 1 in { +let isReMaterializable = 1, isMoveImm = 1 in { defm S_MOVK_I32 : SOPK_32 <sopk<0x00>, "s_movk_i32", []>; } // End isReMaterializable = 1 let Uses = [SCC] in { @@ -438,36 +446,38 @@ def S_BRANCH : SOPP < let isBarrier = 1; } -let DisableEncoding = "$scc" in { +let Uses = [SCC] in { def S_CBRANCH_SCC0 : SOPP < - 0x00000004, (ins sopp_brtarget:$simm16, SCCReg:$scc), + 0x00000004, (ins sopp_brtarget:$simm16), "s_cbranch_scc0 $simm16" >; def S_CBRANCH_SCC1 : SOPP < - 0x00000005, (ins sopp_brtarget:$simm16, SCCReg:$scc), + 0x00000005, (ins sopp_brtarget:$simm16), "s_cbranch_scc1 $simm16" >; -} // End DisableEncoding = "$scc" +} // End Uses = [SCC] +let Uses = [VCC] in { def S_CBRANCH_VCCZ : SOPP < - 0x00000006, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + 0x00000006, (ins sopp_brtarget:$simm16), "s_cbranch_vccz $simm16" >; def S_CBRANCH_VCCNZ : SOPP < - 0x00000007, (ins sopp_brtarget:$simm16, VCCReg:$vcc), + 0x00000007, (ins sopp_brtarget:$simm16), "s_cbranch_vccnz $simm16" >; +} // End Uses = [VCC] -let DisableEncoding = "$exec" in { +let Uses = [EXEC] in { def S_CBRANCH_EXECZ : SOPP < - 0x00000008, (ins sopp_brtarget:$simm16, EXECReg:$exec), + 0x00000008, (ins sopp_brtarget:$simm16), "s_cbranch_execz $simm16" >; def S_CBRANCH_EXECNZ : SOPP < - 0x00000009, (ins sopp_brtarget:$simm16, EXECReg:$exec), + 0x00000009, (ins sopp_brtarget:$simm16), "s_cbranch_execnz $simm16" >; -} // End DisableEncoding = "$exec" +} // End Uses = [EXEC] } // End isBranch = 1 @@ -477,11 +487,11 @@ let hasSideEffects = 1 in { def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier", [(int_AMDGPU_barrier_local)] > { + let SchedRW = [WriteBarrier]; let simm16 = 0; - let isBarrier = 1; - let hasCtrlDep = 1; let mayLoad = 1; let mayStore = 1; + let isConvergent = 1; } def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">; @@ -805,9 +815,6 @@ defm DS_CMPST_RTN_B32 : DS_1A2D_RET <0x30, "ds_cmpst_rtn_b32", VGPR_32, "ds_cmps defm DS_CMPST_RTN_F32 : DS_1A2D_RET <0x31, "ds_cmpst_rtn_f32", VGPR_32, "ds_cmpst_f32">; defm DS_MIN_RTN_F32 : DS_1A2D_RET <0x32, "ds_min_rtn_f32", VGPR_32, "ds_min_f32">; defm DS_MAX_RTN_F32 : DS_1A2D_RET <0x33, "ds_max_rtn_f32", VGPR_32, 
"ds_max_f32">; -let SubtargetPredicate = isCI in { -defm DS_WRAP_RTN_F32 : DS_1A1D_RET <0x34, "ds_wrap_rtn_f32", VGPR_32, "ds_wrap_f32">; -} // End isCI defm DS_SWIZZLE_B32 : DS_1A_RET <0x35, "ds_swizzle_b32", VGPR_32>; let mayStore = 0 in { defm DS_READ_B32 : DS_1A_RET <0x36, "ds_read_b32", VGPR_32>; @@ -905,11 +912,6 @@ defm DS_WRITE_SRC2_B64 : DS_1A <0xcc, "ds_write_src2_b64">; defm DS_MIN_SRC2_F64 : DS_1A <0xd2, "ds_min_src2_f64">; defm DS_MAX_SRC2_F64 : DS_1A <0xd3, "ds_max_src2_f64">; -//let SubtargetPredicate = isCI in { -// DS_CONDXCHG32_RTN_B64 -// DS_CONDXCHG32_RTN_B128 -//} // End isCI - //===----------------------------------------------------------------------===// // MUBUF Instructions //===----------------------------------------------------------------------===// @@ -951,13 +953,13 @@ defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper < mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global >; defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper < - mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load + mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load >; defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper < - mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load + mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load >; defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper < - mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load + mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load >; defm BUFFER_STORE_BYTE : MUBUF_Store_Helper < @@ -1034,9 +1036,12 @@ defm BUFFER_ATOMIC_XOR : MUBUF_Atomic < //def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI //def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI -//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <mubuf<0x70>, "buffer_wbinvl1_sc", []>; // isn't on CI & VI -//def BUFFER_WBINVL1_VOL : MUBUF_WBINVL1 <mubuf<0x70, 0x3f>, "buffer_wbinvl1_vol", []>; // isn't on SI -//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <mubuf<0x71, 0x3e>, "buffer_wbinvl1", []>; + +let SubtargetPredicate = isSI in { +defm BUFFER_WBINVL1_SC : MUBUF_Invalidate <mubuf<0x70>, "buffer_wbinvl1_sc", int_amdgcn_buffer_wbinvl1_sc>; // isn't on CI & VI +} + +defm BUFFER_WBINVL1 : MUBUF_Invalidate <mubuf<0x71, 0x3e>, "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1>; //===----------------------------------------------------------------------===// // MTBUF Instructions @@ -1155,8 +1160,8 @@ defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, "image_sample_c_cd_cl_o" // VOP1 Instructions //===----------------------------------------------------------------------===// -let vdst = 0, src0 = 0 in { -defm V_NOP : VOP1_m <vop1<0x0>, (outs), (ins), "v_nop", [], "v_nop">; +let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { +defm V_NOP : VOP1Inst <vop1<0x0>, "v_nop", VOP_NONE>; } let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in { @@ -1292,7 +1297,9 @@ defm V_SQRT_F64 : VOP1Inst <vop1<0x34, 0x28>, "v_sqrt_f64", VOP_F64_F64, fsqrt >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble] + +let SchedRW = [WriteQuarterRate32] in { defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", VOP_F32_F32, AMDGPUsin @@ -1300,6 +1307,9 @@ defm V_SIN_F32 : VOP1Inst <vop1<0x35, 0x29>, "v_sin_f32", defm V_COS_F32 : VOP1Inst <vop1<0x36, 0x2a>, "v_cos_f32", VOP_F32_F32, AMDGPUcos >; + +} // End SchedRW = 
[WriteQuarterRate32] + defm V_NOT_B32 : VOP1Inst <vop1<0x37, 0x2b>, "v_not_b32", VOP_I32_I32>; defm V_BFREV_B32 : VOP1Inst <vop1<0x38, 0x2c>, "v_bfrev_b32", VOP_I32_I32>; defm V_FFBH_U32 : VOP1Inst <vop1<0x39, 0x2d>, "v_ffbh_u32", VOP_I32_I32>; @@ -1308,24 +1318,33 @@ defm V_FFBH_I32 : VOP1Inst <vop1<0x3b, 0x2f>, "v_ffbh_i32", VOP_I32_I32>; defm V_FREXP_EXP_I32_F64 : VOP1Inst <vop1<0x3c,0x30>, "v_frexp_exp_i32_f64", VOP_I32_F64 >; + +let SchedRW = [WriteDoubleAdd] in { defm V_FREXP_MANT_F64 : VOP1Inst <vop1<0x3d, 0x31>, "v_frexp_mant_f64", VOP_F64_F64 >; -defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", VOP_F64_F64>; + +defm V_FRACT_F64 : VOP1Inst <vop1<0x3e, 0x32>, "v_fract_f64", + VOP_F64_F64 +>; +} // End SchedRW = [WriteDoubleAdd] + + defm V_FREXP_EXP_I32_F32 : VOP1Inst <vop1<0x3f, 0x33>, "v_frexp_exp_i32_f32", VOP_I32_F32 >; defm V_FREXP_MANT_F32 : VOP1Inst <vop1<0x40, 0x34>, "v_frexp_mant_f32", VOP_F32_F32 >; -let vdst = 0, src0 = 0 in { -defm V_CLREXCP : VOP1_m <vop1<0x41,0x35>, (outs), (ins), "v_clrexcp", [], - "v_clrexcp" ->; +let vdst = 0, src0 = 0, VOPAsmPrefer32Bit = 1 in { +defm V_CLREXCP : VOP1Inst <vop1<0x41,0x35>, "v_clrexcp", VOP_NONE>; } + +let Uses = [M0, EXEC] in { defm V_MOVRELD_B32 : VOP1Inst <vop1<0x42, 0x36>, "v_movreld_b32", VOP_I32_I32>; defm V_MOVRELS_B32 : VOP1Inst <vop1<0x43, 0x37>, "v_movrels_b32", VOP_I32_I32>; defm V_MOVRELSD_B32 : VOP1Inst <vop1<0x44, 0x38>, "v_movrelsd_b32", VOP_I32_I32>; +} // End Uses = [M0, EXEC] // These instruction only exist on SI and CI let SubtargetPredicate = isSICI in { @@ -1343,7 +1362,7 @@ defm V_RSQ_LEGACY_F32 : VOP1InstSI <vop1<0x2d>, "v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy >; -} // End let SchedRW = [WriteQuarterRate32] +} // End SchedRW = [WriteQuarterRate32] let SchedRW = [WriteDouble] in { @@ -1360,7 +1379,7 @@ defm V_RSQ_CLAMP_F64 : VOP1InstSI <vop1<0x32>, "v_rsq_clamp_f64", // VINTRP Instructions //===----------------------------------------------------------------------===// -let Uses = [M0] in { +let Uses = [M0, EXEC] in { // FIXME: Specify SchedRW for VINTRP insturctions. @@ -1405,16 +1424,14 @@ defm V_INTERP_MOV_F32 : VINTRP_m < [(set f32:$dst, (AMDGPUinterp_mov (i32 imm:$src0), (i32 imm:$attr_chan), (i32 imm:$attr)))]>; -} // End Uses = [M0] +} // End Uses = [M0, EXEC] //===----------------------------------------------------------------------===// // VOP2 Instructions //===----------------------------------------------------------------------===// multiclass V_CNDMASK <vop2 op, string name> { - defm _e32 : VOP2_m < - op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins32, VOP_CNDMASK.Asm32, [], - name, name>; + defm _e32 : VOP2_m <op, name, VOP_CNDMASK, [], name>; defm _e64 : VOP3_m < op, VOP_CNDMASK.Outs, VOP_CNDMASK.Ins64, @@ -1500,34 +1517,32 @@ let isCommutable = 1 in { defm V_MADAK_F32 : VOP2MADK <vop2<0x21, 0x18>, "v_madak_f32">; } // End isCommutable = 1 -let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC +let isCommutable = 1 in { // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. // V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 where renamed to *_U32 in VI, // but the VI instructions behave the same as the SI versions. 
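The carry-propagating adds defined just below keep the semantics spelled out in the comments being removed: carry-out goes to VCC, and the addc/subb variants also take carry-in from VCC, now expressed through the VOP2b_I32_I1_I32_I32[_I1] profiles. A host-side sketch of that behaviour, under the assumption that the extra i1 operands stand for those carry bits:

#include <cstdint>
#include <cstdio>

// Sketch of the carry handling suggested by the VOP2b profiles below: the adds
// produce a carry-out bit, and the addc/subb variants also consume a carry-in.
// Illustration only, not backend code.
uint32_t add_co(uint32_t A, uint32_t B, bool &CarryOut) {
  uint64_t Wide = uint64_t(A) + uint64_t(B);
  CarryOut = (Wide >> 32) & 1;
  return uint32_t(Wide);
}

uint32_t addc(uint32_t A, uint32_t B, bool CarryIn, bool &CarryOut) {
  uint64_t Wide = uint64_t(A) + uint64_t(B) + (CarryIn ? 1 : 0);
  CarryOut = (Wide >> 32) & 1;
  return uint32_t(Wide);
}

int main() {
  bool Carry = false, Carry2 = false;
  uint32_t Lo = add_co(0xffffffffu, 1u, Carry); // Lo = 0, Carry = 1
  uint32_t Hi = addc(0u, 0u, Carry, Carry2);    // Hi = 1, carry propagated
  std::printf("lo=%u hi=%u carry=%d\n", Lo, Hi, int(Carry2));
  return 0;
}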
defm V_ADD_I32 : VOP2bInst <vop2<0x25, 0x19>, "v_add_i32", - VOP_I32_I32_I32, add + VOP2b_I32_I1_I32_I32 >; -defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP_I32_I32_I32>; +defm V_SUB_I32 : VOP2bInst <vop2<0x26, 0x1a>, "v_sub_i32", VOP2b_I32_I1_I32_I32>; defm V_SUBREV_I32 : VOP2bInst <vop2<0x27, 0x1b>, "v_subrev_i32", - VOP_I32_I32_I32, null_frag, "v_sub_i32" + VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32" >; -let Uses = [VCC] in { // Carry-in comes from VCC defm V_ADDC_U32 : VOP2bInst <vop2<0x28, 0x1c>, "v_addc_u32", - VOP_I32_I32_I32_VCC + VOP2b_I32_I1_I32_I32_I1 >; defm V_SUBB_U32 : VOP2bInst <vop2<0x29, 0x1d>, "v_subb_u32", - VOP_I32_I32_I32_VCC + VOP2b_I32_I1_I32_I32_I1 >; defm V_SUBBREV_U32 : VOP2bInst <vop2<0x2a, 0x1e>, "v_subbrev_u32", - VOP_I32_I32_I32_VCC, null_frag, "v_subb_u32" + VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32" >; -} // End Uses = [VCC] -} // End isCommutable = 1, Defs = [VCC] +} // End isCommutable = 1 defm V_READLANE_B32 : VOP2SI_3VI_m < vop3 <0x001, 0x289>, @@ -1575,10 +1590,10 @@ defm V_BCNT_U32_B32 : VOP2_VI3_Inst <vop23<0x22, 0x28b>, "v_bcnt_u32_b32", VOP_I32_I32_I32 >; defm V_MBCNT_LO_U32_B32 : VOP2_VI3_Inst <vop23<0x23, 0x28c>, "v_mbcnt_lo_u32_b32", - VOP_I32_I32_I32 + VOP_I32_I32_I32, int_amdgcn_mbcnt_lo >; defm V_MBCNT_HI_U32_B32 : VOP2_VI3_Inst <vop23<0x24, 0x28d>, "v_mbcnt_hi_u32_b32", - VOP_I32_I32_I32 + VOP_I32_I32_I32, int_amdgcn_mbcnt_hi >; defm V_LDEXP_F32 : VOP2_VI3_Inst <vop23<0x2b, 0x288>, "v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp @@ -1704,15 +1719,15 @@ defm V_DIV_FIXUP_F32 : VOP3Inst < vop3<0x15f, 0x1de>, "v_div_fixup_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fixup >; -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteDoubleAdd] in { defm V_DIV_FIXUP_F64 : VOP3Inst < vop3<0x160, 0x1df>, "v_div_fixup_f64", VOP_F64_F64_F64_F64, AMDGPUdiv_fixup >; -} // let SchedRW = [WriteDouble] +} // End SchedRW = [WriteDouble] -let SchedRW = [WriteDouble] in { +let SchedRW = [WriteDoubleAdd] in { let isCommutable = 1 in { defm V_ADD_F64 : VOP3Inst <vop3<0x164, 0x280>, "v_add_f64", @@ -1735,7 +1750,7 @@ defm V_LDEXP_F64 : VOP3Inst <vop3<0x168, 0x284>, "v_ldexp_f64", VOP_F64_F64_I32, AMDGPUldexp >; -} // let SchedRW = [WriteDouble] +} // let SchedRW = [WriteDoubleAdd] let isCommutable = 1, SchedRW = [WriteQuarterRate32] in { @@ -1756,16 +1771,21 @@ defm V_MUL_HI_I32 : VOP3Inst <vop3<0x16c, 0x287>, "v_mul_hi_i32", } // isCommutable = 1, SchedRW = [WriteQuarterRate32] let SchedRW = [WriteFloatFMA, WriteSALU] in { -defm V_DIV_SCALE_F32 : VOP3b_32 <vop3<0x16d, 0x1e0>, "v_div_scale_f32", []>; +defm V_DIV_SCALE_F32 : VOP3bInst <vop3<0x16d, 0x1e0>, "v_div_scale_f32", + VOP3b_F32_I1_F32_F32_F32 +>; } let SchedRW = [WriteDouble, WriteSALU] in { // Double precision division pre-scale. 
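v_mul_hi_i32 above (and its unsigned counterpart) return the upper half of the widened 32x32 product. A host-side reading of that operation, shown before the double-precision pre-scale definition that continues below; this is illustrative only and makes no claim about corner cases of the hardware instruction:

#include <cstdint>
#include <cstdio>

// "mul_hi" as the upper 32 bits of the full 64-bit product; the usual reading
// of v_mul_hi_u32/v_mul_hi_i32, shown on the host purely for illustration.
uint32_t mulHiU32(uint32_t A, uint32_t B) {
  return uint32_t((uint64_t(A) * uint64_t(B)) >> 32);
}

int32_t mulHiI32(int32_t A, int32_t B) {
  return int32_t((int64_t(A) * int64_t(B)) >> 32);
}

int main() {
  std::printf("%u\n", mulHiU32(0x80000000u, 4u)); // 2
  std::printf("%d\n", mulHiI32(-2, 0x40000000));  // -1
  return 0;
}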
-defm V_DIV_SCALE_F64 : VOP3b_64 <vop3<0x16e, 0x1e1>, "v_div_scale_f64", []>; +defm V_DIV_SCALE_F64 : VOP3bInst <vop3<0x16e, 0x1e1>, "v_div_scale_f64", + VOP3b_F64_I1_F64_F64_F64 +>; } // let SchedRW = [WriteDouble] -let isCommutable = 1, Uses = [VCC] in { +let isCommutable = 1, Uses = [VCC, EXEC] in { +let SchedRW = [WriteFloatFMA] in { // v_div_fmas_f32: // result = src0 * src1 + src2 // if (vcc) @@ -1774,6 +1794,7 @@ let isCommutable = 1, Uses = [VCC] in { defm V_DIV_FMAS_F32 : VOP3_VCC_Inst <vop3<0x16f, 0x1e2>, "v_div_fmas_f32", VOP_F32_F32_F32_F32, AMDGPUdiv_fmas >; +} let SchedRW = [WriteDouble] in { // v_div_fmas_f64: @@ -1786,7 +1807,7 @@ defm V_DIV_FMAS_F64 : VOP3_VCC_Inst <vop3<0x170, 0x1e3>, "v_div_fmas_f64", >; } // End SchedRW = [WriteDouble] -} // End isCommutable = 1 +} // End isCommutable = 1, Uses = [VCC, EXEC] //def V_MSAD_U8 : VOP3_U8 <0x00000171, "v_msad_u8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "v_qsad_u8", []>; @@ -1835,13 +1856,13 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$dst), (ins VSrc_64:$src0, VSrc_64:$src1, SSrc_64:$src2), "", [] >; -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { // 64-bit vector move instruction. This is mainly used by the SIFoldOperands // pass to enable folding of inline immediates. def V_MOV_B64_PSEUDO : InstSI <(outs VReg_64:$dst), (ins VSrc_64:$src0), "", []>; } // end let hasSideEffects = 0, mayLoad = 0, mayStore = 0 -let hasSideEffects = 1 in { +let hasSideEffects = 1, SALU = 1 in { def SGPR_USE : InstSI <(outs),(ins), "", []>; } @@ -1921,39 +1942,9 @@ def SI_KILL : InstSI < let Uses = [EXEC], Defs = [EXEC,VCC,M0] in { -//defm SI_ : RegisterLoadStore <VGPR_32, FRAMEri, ADDRIndirect>; - -let UseNamedOperandTable = 1 in { - -def SI_RegisterLoad : InstSI < +class SI_INDIRECT_SRC<RegisterClass rc> : InstSI < (outs VGPR_32:$dst, SReg_64:$temp), - (ins FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterLoad = 1; - let mayLoad = 1; -} - -class SIRegStore<dag outs> : InstSI < - outs, - (ins VGPR_32:$val, FRAMEri32:$addr, i32imm:$chan), - "", [] -> { - let isRegisterStore = 1; - let mayStore = 1; -} - -let usesCustomInserter = 1 in { -def SI_RegisterStorePseudo : SIRegStore<(outs)>; -} // End usesCustomInserter = 1 -def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>; - - -} // End UseNamedOperandTable = 1 - -def SI_INDIRECT_SRC : InstSI < - (outs VGPR_32:$dst, SReg_64:$temp), - (ins unknown:$src, VSrc_32:$idx, i32imm:$off), + (ins rc:$src, VSrc_32:$idx, i32imm:$off), "si_indirect_src $dst, $temp, $src, $idx, $off", [] >; @@ -1967,6 +1958,13 @@ class SI_INDIRECT_DST<RegisterClass rc> : InstSI < let Constraints = "$src = $dst"; } +// TODO: We can support indirect SGPR access. 
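The per-width SI_INDIRECT_SRC_V* pseudos declared next, alongside the existing SI_INDIRECT_DST_V* ones, let the selector read or write a vector element at a dynamic index plus a constant offset; the matching extractelt/insertelt patterns appear further down in this patch. A plain C++ model of that semantics, ignoring the M0/movrel machinery the lowering actually uses:

#include <array>
#include <cstddef>
#include <cstdio>

// Model of indirect source/dest addressing: access element [Idx + Off] of a
// small vector. Purely illustrative; the hardware goes through M0 and the
// movrel instructions handled later in SILowerControlFlow.
template <std::size_t N>
float indirectSrc(const std::array<float, N> &Vec, unsigned Idx, unsigned Off) {
  return Vec[Idx + Off];
}

template <std::size_t N>
void indirectDst(std::array<float, N> &Vec, unsigned Idx, unsigned Off,
                 float Val) {
  Vec[Idx + Off] = Val;
}

int main() {
  std::array<float, 4> V{1.0f, 2.0f, 3.0f, 4.0f};
  indirectDst(V, 1u, 2u, 42.0f);               // writes element 3
  std::printf("%g\n", indirectSrc(V, 3u, 0u)); // 42
  return 0;
}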
+def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>; +def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>; +def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>; +def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>; +def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>; + def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; @@ -1977,19 +1975,24 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { - let UseNamedOperandTable = 1 in { + let UseNamedOperandTable = 1, Uses = [EXEC] in { def _SAVE : InstSI < (outs), - (ins sgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, - SReg_32:$scratch_offset), + (ins sgpr_class:$src, i32imm:$frame_idx), "", [] - >; + > { + let mayStore = 1; + let mayLoad = 0; + } def _RESTORE : InstSI < (outs sgpr_class:$dst), - (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), + (ins i32imm:$frame_idx), "", [] - >; + > { + let mayStore = 0; + let mayLoad = 1; + } } // End UseNamedOperandTable = 1 } @@ -2003,19 +2006,25 @@ defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { - let UseNamedOperandTable = 1, VGPRSpill = 1 in { + let UseNamedOperandTable = 1, VGPRSpill = 1, Uses = [EXEC] in { def _SAVE : InstSI < (outs), (ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), "", [] - >; + > { + let mayStore = 1; + let mayLoad = 0; + } def _RESTORE : InstSI < (outs vgpr_class:$dst), (ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset), "", [] - >; + > { + let mayStore = 0; + let mayLoad = 1; + } } // End UseNamedOperandTable = 1, VGPRSpill = 1 } @@ -2030,9 +2039,11 @@ let Defs = [SCC] in { def SI_CONSTDATA_PTR : InstSI < (outs SReg_64:$dst), - (ins), - "", [(set SReg_64:$dst, (i64 SIconstdata_ptr))] ->; + (ins const_ga:$ptr), + "", [(set SReg_64:$dst, (i64 (SIconstdata_ptr (tglobaladdr:$ptr))))] +> { + let SALU = 1; +} } // End Defs = [SCC] @@ -2072,84 +2083,63 @@ def : Pat < // SMRD Patterns //===----------------------------------------------------------------------===// -multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { +multiclass SMRD_Pattern <string Instr, ValueType vt> { - // 1. SI-CI: Offset as 8bit DWORD immediate + // 1. IMM offset def : Pat < - (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))), - (vt (Instr_IMM $sbase, (as_dword_i32imm $offset))) + (smrd_load (SMRDImm i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_IMM") $sbase, $offset)) >; - // 2. Offset loaded in an 32bit SGPR + // 2. SGPR offset def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) + (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_SGPR") $sbase, $offset)) >; - // 3. No offset at all def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; + (smrd_load (SMRDImm32 i64:$sbase, i32:$offset)), + (vt (!cast<SMRD>(Instr#"_IMM_ci") $sbase, $offset)) + > { + let Predicates = [isCIOnly]; + } } -multiclass SMRD_Pattern_vi <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> { - - // 1. VI: Offset as 20bit immediate in bytes - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM20bit:$offset))), - (vt (Instr_IMM $sbase, (as_i32imm $offset))) - >; - - // 2. 
Offset loaded in an 32bit SGPR - def : Pat < - (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))), - (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset))))) - >; - - // 3. No offset at all - def : Pat < - (constant_load i64:$sbase), - (vt (Instr_IMM $sbase, 0)) - >; -} - -let Predicates = [isSICI] in { -defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; -defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; -defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; -defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; -} // End Predicates = [isSICI] +// Global and constant loads can be selected to either MUBUF or SMRD +// instructions, but SMRD instructions are faster so we want the instruction +// selector to prefer those. +let AddedComplexity = 100 in { -let Predicates = [isVI] in { -defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>; -defm : SMRD_Pattern_vi <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>; -} // End Predicates = [isVI] +defm : SMRD_Pattern <"S_LOAD_DWORD", i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX8", v32i8>; +defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; +defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; -let Predicates = [isSICI] in { +// 1. Offset as an immediate +def : Pat < + (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)), + (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset) +>; -// 1. Offset as 8bit DWORD immediate +// 2. Offset loaded in an 32bit SGPR def : Pat < - (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset)) + (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)), + (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset) >; -} // End Predicates = [isSICI] +let Predicates = [isCI] in { -// 2. 
Offset loaded in an 32bit SGPR def : Pat < - (SIload_constant v4i32:$sbase, imm:$offset), - (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset)) + (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)), + (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset) >; +} // End Predicates = [isCI] + +} // End let AddedComplexity = 10000 + //===----------------------------------------------------------------------===// // SOP1 Patterns //===----------------------------------------------------------------------===// @@ -2161,6 +2151,11 @@ def : Pat < (S_MOV_B32 0), sub1)) >; +def : Pat < + (i32 (smax i32:$x, (i32 (ineg i32:$x)))), + (S_ABS_I32 $x) +>; + //===----------------------------------------------------------------------===// // SOP2 Patterns //===----------------------------------------------------------------------===// @@ -2488,6 +2483,11 @@ def : Pat < /********** Extraction, Insertion, Building and Casting **********/ /********** ============================================ **********/ +//def : Extract_Element<i64, v2i64, 0, sub0_sub1>; +//def : Extract_Element<i64, v2i64, 1, sub2_sub3>; +//def : Extract_Element<f64, v2f64, 0, sub0_sub1>; +//def : Extract_Element<f64, v2f64, 1, sub2_sub3>; + foreach Index = 0-2 in { def Extract_Element_v2i32_#Index : Extract_Element < i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) @@ -2568,11 +2568,25 @@ def : BitConvert <v2i32, i64, VReg_64>; def : BitConvert <i64, v2i32, VReg_64>; def : BitConvert <v2f32, i64, VReg_64>; def : BitConvert <i64, v2f32, VReg_64>; +def : BitConvert <v2f32, f64, VReg_64>; def : BitConvert <v2i32, f64, VReg_64>; +def : BitConvert <f64, v2f32, VReg_64>; def : BitConvert <f64, v2i32, VReg_64>; def : BitConvert <v4f32, v4i32, VReg_128>; def : BitConvert <v4i32, v4f32, VReg_128>; + +def : BitConvert <v2i64, v4i32, SReg_128>; +def : BitConvert <v4i32, v2i64, SReg_128>; + +def : BitConvert <v2f64, v4f32, VReg_128>; +def : BitConvert <v2f64, v4i32, VReg_128>; +def : BitConvert <v4f32, v2f64, VReg_128>; +def : BitConvert <v4i32, v2f64, VReg_128>; + + + + def : BitConvert <v8f32, v8i32, SReg_256>; def : BitConvert <v8i32, v8f32, SReg_256>; def : BitConvert <v8i32, v32i8, SReg_256>; @@ -2601,10 +2615,9 @@ def : Pat < // Prevent expanding both fneg and fabs. -// FIXME: Should use S_OR_B32 def : Pat < (fneg (fabs f32:$src)), - (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */ + (S_OR_B32 $src, 0x80000000) /* Set sign bit */ >; // FIXME: Should use S_OR_B32 @@ -2836,10 +2849,6 @@ class DSAtomicRetPat<DS inst, ValueType vt, PatFrag frag> : Pat < // -1. For the non-rtn variants, the manual says it does // DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1, and setting D0 to uint_max // will always do the increment so I'm assuming it's the same. -// -// We also load this -1 with s_mov_b32 / s_mov_b64 even though this -// needs to be a VGPR. The SGPR copy pass will fix this, and it's -// easier since there is no v_mov_b64. class DSAtomicIncRetPat<DS inst, ValueType vt, Instruction LoadImm, PatFrag frag> : Pat < (frag (DS1Addr1Offset i32:$ptr, i32:$offset), (vt 1)), @@ -2855,9 +2864,9 @@ class DSAtomicCmpXChg <DS inst, ValueType vt, PatFrag frag> : Pat < // 32-bit atomics. 
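Earlier in this hunk the fneg(fabs(x)) pattern is folded to a single S_OR_B32 with 0x80000000, i.e. setting the IEEE-754 sign bit to produce -|x| in one scalar operation. The same trick on the host, for illustration only:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Compute -|x| by forcing the IEEE-754 sign bit, mirroring the S_OR_B32
// pattern above. memcpy performs the type pun to keep the sketch portable.
float negAbs(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits |= 0x80000000u; // set the sign bit
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}

int main() {
  std::printf("%g %g\n", negAbs(3.5f), negAbs(-3.5f)); // -3.5 -3.5
  return 0;
}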
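Before the 32-bit atomic patterns that follow, a quick sketch of the wrap rule quoted in the DSAtomicIncRetPat comment above: the manual's DS[A] = (DS[A] >= D0) ? 0 : DS[A] + 1 becomes an ordinary wrapping increment when D0 is uint_max, which is why the patterns feed -1. Illustration only; the real instruction operates atomically on LDS memory.

#include <cstdint>
#include <cstdio>

// ds_inc wrap rule from the comment above. With D0 = 0xffffffff the compare
// only fires at UINT32_MAX, so the result matches a plain wrapping increment.
uint32_t ds_inc(uint32_t Old, uint32_t D0) {
  return (Old >= D0) ? 0u : Old + 1u;
}

int main() {
  std::printf("%u\n", ds_inc(41u, UINT32_MAX));        // 42
  std::printf("%u\n", ds_inc(UINT32_MAX, UINT32_MAX)); // 0, i.e. wrap-around
  return 0;
}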
def : DSAtomicIncRetPat<DS_INC_RTN_U32, i32, - S_MOV_B32, si_atomic_load_add_local>; + V_MOV_B32_e32, si_atomic_load_add_local>; def : DSAtomicIncRetPat<DS_DEC_RTN_U32, i32, - S_MOV_B32, si_atomic_load_sub_local>; + V_MOV_B32_e32, si_atomic_load_sub_local>; def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>; def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>; @@ -2874,9 +2883,9 @@ def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>; // 64-bit atomics. def : DSAtomicIncRetPat<DS_INC_RTN_U64, i64, - S_MOV_B64, si_atomic_load_add_local>; + V_MOV_B64_PSEUDO, si_atomic_load_add_local>; def : DSAtomicIncRetPat<DS_DEC_RTN_U64, i64, - S_MOV_B64, si_atomic_load_sub_local>; + V_MOV_B64_PSEUDO, si_atomic_load_sub_local>; def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>; def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>; @@ -3019,90 +3028,46 @@ def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>; def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>; def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>; -let SubtargetPredicate = isCI in { - -defm V_QSAD_PK_U16_U8 : VOP3Inst <vop3<0x173>, "v_qsad_pk_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U16_U8 : VOP3Inst <vop3<0x172>, "v_mqsad_u16_u8", - VOP_I32_I32_I32 ->; -defm V_MQSAD_U32_U8 : VOP3Inst <vop3<0x175>, "v_mqsad_u32_u8", - VOP_I32_I32_I32 ->; - -let isCommutable = 1 in { -defm V_MAD_U64_U32 : VOP3Inst <vop3<0x176>, "v_mad_u64_u32", - VOP_I64_I32_I32_I64 ->; - -// XXX - Does this set VCC? -defm V_MAD_I64_I32 : VOP3Inst <vop3<0x177>, "v_mad_i64_i32", - VOP_I64_I32_I32_I64 ->; -} // End isCommutable = 1 - -// Remaining instructions: -// FLAT_* -// S_CBRANCH_CDBGUSER -// S_CBRANCH_CDBGSYS -// S_CBRANCH_CDBGSYS_OR_USER -// S_CBRANCH_CDBGSYS_AND_USER -// S_DCACHE_INV_VOL -// DS_NOP -// DS_GWS_SEMA_RELEASE_ALL -// DS_WRAP_RTN_B32 -// DS_CNDXCHG32_RTN_B64 -// DS_WRITE_B96 -// DS_WRITE_B128 -// DS_CONDXCHG32_RTN_B128 -// DS_READ_B96 -// DS_READ_B128 -// BUFFER_LOAD_DWORDX3 -// BUFFER_STORE_DWORDX3 - -} // End isCI - /********** ====================== **********/ /********** Indirect adressing **********/ /********** ====================== **********/ -multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST IndDst> { +multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { // 1. Extract with offset def : Pat< - (eltvt (vector_extract vt:$vec, (add i32:$idx, imm:$off))), - (SI_INDIRECT_SRC $vec, $idx, imm:$off) + (eltvt (extractelt vt:$vec, (add i32:$idx, imm:$off))), + (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, imm:$off) >; // 2. Extract without offset def : Pat< - (eltvt (vector_extract vt:$vec, i32:$idx)), - (SI_INDIRECT_SRC $vec, $idx, 0) + (eltvt (extractelt vt:$vec, i32:$idx)), + (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $vec, $idx, 0) >; // 3. Insert with offset def : Pat< - (vector_insert vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), - (IndDst $vec, $idx, imm:$off, $val) + (insertelt vt:$vec, eltvt:$val, (add i32:$idx, imm:$off)), + (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, imm:$off, $val) >; // 4. 
Insert without offset def : Pat< - (vector_insert vt:$vec, eltvt:$val, i32:$idx), - (IndDst $vec, $idx, 0, $val) + (insertelt vt:$vec, eltvt:$val, i32:$idx), + (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $vec, $idx, 0, $val) >; } -defm : SI_INDIRECT_Pattern <v2f32, f32, SI_INDIRECT_DST_V2>; -defm : SI_INDIRECT_Pattern <v4f32, f32, SI_INDIRECT_DST_V4>; -defm : SI_INDIRECT_Pattern <v8f32, f32, SI_INDIRECT_DST_V8>; -defm : SI_INDIRECT_Pattern <v16f32, f32, SI_INDIRECT_DST_V16>; +defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; +defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; +defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; +defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; -defm : SI_INDIRECT_Pattern <v2i32, i32, SI_INDIRECT_DST_V2>; -defm : SI_INDIRECT_Pattern <v4i32, i32, SI_INDIRECT_DST_V4>; -defm : SI_INDIRECT_Pattern <v8i32, i32, SI_INDIRECT_DST_V8>; -defm : SI_INDIRECT_Pattern <v16i32, i32, SI_INDIRECT_DST_V16>; +defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">; +defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; +defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; +defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; //===----------------------------------------------------------------------===// // Conversion Patterns @@ -3215,12 +3180,12 @@ def : Pat < def : Pat < (i1 (trunc i32:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), $a), 1) + (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), $a), 1) >; def : Pat < (i1 (trunc i64:$a)), - (V_CMP_EQ_I32_e64 (V_AND_B32_e64 (i32 1), + (V_CMP_EQ_I32_e64 (S_AND_B32 (i32 1), (EXTRACT_SUBREG $a, sub0)), 1) >; @@ -3301,24 +3266,6 @@ def : Pat < } // End Predicates = [isSI] -let Predicates = [isCI] in { - -// Convert (x - floor(x)) to fract(x) -def : Pat < - (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), - (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), - (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -// Convert (x + (-floor(x))) to fract(x) -def : Pat < - (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), - (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), - (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE) ->; - -} // End Predicates = [isCI] - //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index c319b32..126f624 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -103,6 +103,10 @@ public: return "SI Lower control flow instructions"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } }; } // End anonymous namespace @@ -140,8 +144,7 @@ void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) { DebugLoc DL = From.getDebugLoc(); BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) - .addOperand(To) - .addReg(AMDGPU::EXEC); + .addOperand(To); } void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { @@ -159,8 +162,7 @@ void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) { // If the exec mask is non-zero, skip the next two instructions BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3) - .addReg(AMDGPU::EXEC); + .addImm(3); // Exec mask is zero: Export to NULL target... 
BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP)) @@ -269,8 +271,7 @@ void SILowerControlFlowPass::Loop(MachineInstr &MI) { .addReg(Src); BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addOperand(MI.getOperand(1)) - .addReg(AMDGPU::EXEC); + .addOperand(MI.getOperand(1)); MI.eraseFromParent(); } @@ -316,7 +317,7 @@ void SILowerControlFlowPass::Kill(MachineInstr &MI) { .addImm(0); } } else { - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC) + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32)) .addImm(0) .addOperand(Op); } @@ -362,9 +363,9 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int .addReg(AMDGPU::VCC_LO); // Compare the just read M0 value to all possible Idx values - BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC) - .addReg(AMDGPU::M0) - .addReg(Idx); + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32)) + .addReg(AMDGPU::M0) + .addReg(Idx); // Update EXEC, save the original EXEC value to VCC BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC) @@ -385,8 +386,7 @@ void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(-7) - .addReg(AMDGPU::EXEC); + .addImm(-7); // Restore EXEC BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC) @@ -438,7 +438,6 @@ void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) { MachineInstr *MovRel = BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) .addReg(Reg) - .addReg(AMDGPU::M0, RegState::Implicit) .addReg(Vec, RegState::Implicit); LoadM0(MI, MovRel, Off); @@ -460,7 +459,6 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) { BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32)) .addReg(Reg, RegState::Define) .addReg(Val) - .addReg(AMDGPU::M0, RegState::Implicit) .addReg(Dst, RegState::Implicit); LoadM0(MI, MovRel, Off); @@ -486,11 +484,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode())) + if (TII->isWQM(MI) || TII->isDS(MI)) NeedWQM = true; // Flat uses m0 in case it needs to access LDS. 
- if (TII->isFLAT(MI.getOpcode())) + if (TII->isFLAT(MI)) NeedFlat = true; switch (MI.getOpcode()) { @@ -541,7 +539,11 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { Branch(MI); break; - case AMDGPU::SI_INDIRECT_SRC: + case AMDGPU::SI_INDIRECT_SRC_V1: + case AMDGPU::SI_INDIRECT_SRC_V2: + case AMDGPU::SI_INDIRECT_SRC_V4: + case AMDGPU::SI_INDIRECT_SRC_V8: + case AMDGPU::SI_INDIRECT_SRC_V16: IndirectSrc(MI); break; diff --git a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 67421e2..a2fa5fd 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -48,6 +48,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 587ea63..935aad4 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -29,10 +29,114 @@ void SIMachineFunctionInfo::anchor() {} SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), - HasSpilledVGPRs(false), + ScratchRSrcReg(AMDGPU::NoRegister), + ScratchWaveOffsetReg(AMDGPU::NoRegister), + PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), + DispatchPtrUserSGPR(AMDGPU::NoRegister), + QueuePtrUserSGPR(AMDGPU::NoRegister), + KernargSegmentPtrUserSGPR(AMDGPU::NoRegister), + DispatchIDUserSGPR(AMDGPU::NoRegister), + FlatScratchInitUserSGPR(AMDGPU::NoRegister), + PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister), + GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister), + WorkGroupIDXSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDYSystemSGPR(AMDGPU::NoRegister), + WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), + WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), + PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), + LDSWaveSpillSize(0), PSInputAddr(0), NumUserSGPRs(0), - LDSWaveSpillSize(0) { } + NumSystemSGPRs(0), + HasSpilledSGPRs(false), + HasSpilledVGPRs(false), + PrivateSegmentBuffer(false), + DispatchPtr(false), + QueuePtr(false), + DispatchID(false), + KernargSegmentPtr(false), + FlatScratchInit(false), + GridWorkgroupCountX(false), + GridWorkgroupCountY(false), + GridWorkgroupCountZ(false), + WorkGroupIDX(true), + WorkGroupIDY(false), + WorkGroupIDZ(false), + WorkGroupInfo(false), + PrivateSegmentWaveByteOffset(false), + WorkItemIDX(true), + WorkItemIDY(false), + WorkItemIDZ(false) { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + const Function *F = MF.getFunction(); + + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + if (getShaderType() == ShaderType::COMPUTE) + KernargSegmentPtr = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-y")) + WorkGroupIDY = true; + + if (F->hasFnAttribute("amdgpu-work-group-id-z")) + WorkGroupIDZ = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-y")) + WorkItemIDY = true; + + if (F->hasFnAttribute("amdgpu-work-item-id-z")) + WorkItemIDZ = true; + + bool MaySpill = ST.isVGPRSpillingEnabled(this); + bool HasStackObjects = FrameInfo->hasStackObjects(); + + if (HasStackObjects || MaySpill) + PrivateSegmentWaveByteOffset = 
true; + + if (ST.isAmdHsaOS()) { + if (HasStackObjects || MaySpill) + PrivateSegmentBuffer = true; + + if (F->hasFnAttribute("amdgpu-dispatch-ptr")) + DispatchPtr = true; + } + + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; +} + +unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( + const SIRegisterInfo &TRI) { + PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + NumUserSGPRs += 4; + return PrivateSegmentBufferUserSGPR; +} + +unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { + DispatchPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return DispatchPtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { + QueuePtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return QueuePtrUserSGPR; +} + +unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { + KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg( + getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass); + NumUserSGPRs += 2; + return KernargSegmentPtrUserSGPR; +} SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( MachineFunction *MF, @@ -53,7 +157,6 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( if (!LaneVGPRs.count(LaneVGPRIdx)) { unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass); LaneVGPRs[LaneVGPRIdx] = LaneVGPR; - MRI.setPhysRegUsed(LaneVGPR); // Add this register as live-in to all blocks to avoid machine verifer // complaining about use of an undefined physical register. diff --git a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 667da4c..9c528d6 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -26,13 +26,83 @@ class MachineRegisterInfo; /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which /// tells the hardware which interpolation parameters to load. class SIMachineFunctionInfo : public AMDGPUMachineFunction { + // FIXME: This should be removed and getPreloadedValue moved here. + friend struct SIRegisterInfo; void anchor() override; unsigned TIDReg; - bool HasSpilledVGPRs; + + // Registers that may be reserved for spilling purposes. These may be the same + // as the input registers. + unsigned ScratchRSrcReg; + unsigned ScratchWaveOffsetReg; + + // Input registers setup for the HSA ABI. + // User SGPRs in allocation order. + unsigned PrivateSegmentBufferUserSGPR; + unsigned DispatchPtrUserSGPR; + unsigned QueuePtrUserSGPR; + unsigned KernargSegmentPtrUserSGPR; + unsigned DispatchIDUserSGPR; + unsigned FlatScratchInitUserSGPR; + unsigned PrivateSegmentSizeUserSGPR; + unsigned GridWorkGroupCountXUserSGPR; + unsigned GridWorkGroupCountYUserSGPR; + unsigned GridWorkGroupCountZUserSGPR; + + // System SGPRs in allocation order. 
+ unsigned WorkGroupIDXSystemSGPR; + unsigned WorkGroupIDYSystemSGPR; + unsigned WorkGroupIDZSystemSGPR; + unsigned WorkGroupInfoSystemSGPR; + unsigned PrivateSegmentWaveByteOffsetSystemSGPR; public: + // FIXME: Make private + unsigned LDSWaveSpillSize; + unsigned PSInputAddr; + std::map<unsigned, unsigned> LaneVGPRs; + unsigned ScratchOffsetReg; + unsigned NumUserSGPRs; + unsigned NumSystemSGPRs; + +private: + bool HasSpilledSGPRs; + bool HasSpilledVGPRs; + + // Feature bits required for inputs passed in user SGPRs. + bool PrivateSegmentBuffer : 1; + bool DispatchPtr : 1; + bool QueuePtr : 1; + bool DispatchID : 1; + bool KernargSegmentPtr : 1; + bool FlatScratchInit : 1; + bool GridWorkgroupCountX : 1; + bool GridWorkgroupCountY : 1; + bool GridWorkgroupCountZ : 1; + + // Feature bits required for inputs passed in system SGPRs. + bool WorkGroupIDX : 1; // Always initialized. + bool WorkGroupIDY : 1; + bool WorkGroupIDZ : 1; + bool WorkGroupInfo : 1; + bool PrivateSegmentWaveByteOffset : 1; + + bool WorkItemIDX : 1; // Always initialized. + bool WorkItemIDY : 1; + bool WorkItemIDZ : 1; + + MCPhysReg getNextUserSGPR() const { + assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); + return AMDGPU::SGPR0 + NumUserSGPRs; + } + + MCPhysReg getNextSystemSGPR() const { + return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; + } + +public: struct SpilledReg { unsigned VGPR; int Lane; @@ -46,16 +116,162 @@ public: SIMachineFunctionInfo(const MachineFunction &MF); SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex, unsigned SubIdx); - unsigned PSInputAddr; - unsigned NumUserSGPRs; - std::map<unsigned, unsigned> LaneVGPRs; - unsigned LDSWaveSpillSize; - unsigned ScratchOffsetReg; bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; unsigned getTIDReg() const { return TIDReg; }; void setTIDReg(unsigned Reg) { TIDReg = Reg; } - bool hasSpilledVGPRs() const { return HasSpilledVGPRs; } - void setHasSpilledVGPRs(bool Spill = true) { HasSpilledVGPRs = Spill; } + + // Add user SGPRs. + unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); + unsigned addDispatchPtr(const SIRegisterInfo &TRI); + unsigned addQueuePtr(const SIRegisterInfo &TRI); + unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); + + // Add system SGPRs. 
+ unsigned addWorkGroupIDX() { + WorkGroupIDXSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDXSystemSGPR; + } + + unsigned addWorkGroupIDY() { + WorkGroupIDYSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDYSystemSGPR; + } + + unsigned addWorkGroupIDZ() { + WorkGroupIDZSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupIDZSystemSGPR; + } + + unsigned addWorkGroupInfo() { + WorkGroupInfoSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return WorkGroupInfoSystemSGPR; + } + + unsigned addPrivateSegmentWaveByteOffset() { + PrivateSegmentWaveByteOffsetSystemSGPR = getNextSystemSGPR(); + NumSystemSGPRs += 1; + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + bool hasPrivateSegmentBuffer() const { + return PrivateSegmentBuffer; + } + + bool hasDispatchPtr() const { + return DispatchPtr; + } + + bool hasQueuePtr() const { + return QueuePtr; + } + + bool hasDispatchID() const { + return DispatchID; + } + + bool hasKernargSegmentPtr() const { + return KernargSegmentPtr; + } + + bool hasFlatScratchInit() const { + return FlatScratchInit; + } + + bool hasGridWorkgroupCountX() const { + return GridWorkgroupCountX; + } + + bool hasGridWorkgroupCountY() const { + return GridWorkgroupCountY; + } + + bool hasGridWorkgroupCountZ() const { + return GridWorkgroupCountZ; + } + + bool hasWorkGroupIDX() const { + return WorkGroupIDX; + } + + bool hasWorkGroupIDY() const { + return WorkGroupIDY; + } + + bool hasWorkGroupIDZ() const { + return WorkGroupIDZ; + } + + bool hasWorkGroupInfo() const { + return WorkGroupInfo; + } + + bool hasPrivateSegmentWaveByteOffset() const { + return PrivateSegmentWaveByteOffset; + } + + bool hasWorkItemIDX() const { + return WorkItemIDX; + } + + bool hasWorkItemIDY() const { + return WorkItemIDY; + } + + bool hasWorkItemIDZ() const { + return WorkItemIDZ; + } + + unsigned getNumUserSGPRs() const { + return NumUserSGPRs; + } + + unsigned getNumPreloadedSGPRs() const { + return NumUserSGPRs + NumSystemSGPRs; + } + + unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { + return PrivateSegmentWaveByteOffsetSystemSGPR; + } + + /// \brief Returns the physical register reserved for use as the resource + /// descriptor for scratch accesses. 
+ unsigned getScratchRSrcReg() const { + return ScratchRSrcReg; + } + + void setScratchRSrcReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchRSrcReg = Reg; + } + + unsigned getScratchWaveOffsetReg() const { + return ScratchWaveOffsetReg; + } + + void setScratchWaveOffsetReg(unsigned Reg) { + assert(Reg != AMDGPU::NoRegister && "Should never be unset"); + ScratchWaveOffsetReg = Reg; + } + + bool hasSpilledSGPRs() const { + return HasSpilledSGPRs; + } + + void setHasSpilledSGPRs(bool Spill = true) { + HasSpilledSGPRs = Spill; + } + + bool hasSpilledVGPRs() const { + return HasSpilledVGPRs; + } + + void setHasSpilledVGPRs(bool Spill = true) { + HasSpilledVGPRs = Spill; + } unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; }; diff --git a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp deleted file mode 100644 index 2cd600d..0000000 --- a/contrib/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp +++ /dev/null @@ -1,193 +0,0 @@ -//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// This pass loads scratch pointer and scratch offset into a register or a -/// frame index which can be used anywhere in the program. These values will -/// be used for spilling VGPRs. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUSubtarget.h" -#include "SIDefines.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RegisterScavenging.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/LLVMContext.h" - -using namespace llvm; - -namespace { - -class SIPrepareScratchRegs : public MachineFunctionPass { - -private: - static char ID; - -public: - SIPrepareScratchRegs() : MachineFunctionPass(ID) { } - - bool runOnMachineFunction(MachineFunction &MF) override; - - const char *getPassName() const override { - return "SI prepare scratch registers"; - } - -}; - -} // End anonymous namespace - -char SIPrepareScratchRegs::ID = 0; - -FunctionPass *llvm::createSIPrepareScratchRegs() { - return new SIPrepareScratchRegs(); -} - -bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) { - SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const SIInstrInfo *TII = - static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - MachineBasicBlock *Entry = MF.begin(); - MachineBasicBlock::iterator I = Entry->begin(); - DebugLoc DL = I->getDebugLoc(); - - // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to - // run this pass. 
- if (!MFI->hasSpilledVGPRs()) - return false; - - unsigned ScratchPtrPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR); - unsigned ScratchOffsetPreloadReg = - TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET); - - if (!Entry->isLiveIn(ScratchPtrPreloadReg)) - Entry->addLiveIn(ScratchPtrPreloadReg); - - if (!Entry->isLiveIn(ScratchOffsetPreloadReg)) - Entry->addLiveIn(ScratchOffsetPreloadReg); - - // Load the scratch offset. - unsigned ScratchOffsetReg = - TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass); - int ScratchOffsetFI = -1; - - if (ScratchOffsetReg != AMDGPU::NoRegister) { - // Found an SGPR to use - MRI.setPhysRegUsed(ScratchOffsetReg); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg) - .addReg(ScratchOffsetPreloadReg); - } else { - // No SGPR is available, we must spill. - ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4); - BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE)) - .addReg(ScratchOffsetPreloadReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } - - - // Now that we have the scratch pointer and offset values, we need to - // add them to all the SI_SPILL_V* instructions. - - RegScavenger RS; - unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4); - RS.addScavengingFrameIndex(ScratchRsrcFI); - - for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); - BI != BE; ++BI) { - - MachineBasicBlock &MBB = *BI; - // Add the scratch offset reg as a live-in so that the register scavenger - // doesn't re-use it. - if (!MBB.isLiveIn(ScratchOffsetReg) && - ScratchOffsetReg != AMDGPU::NoRegister) - MBB.addLiveIn(ScratchOffsetReg); - RS.enterBasicBlock(&MBB); - - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - MachineInstr &MI = *I; - RS.forward(I); - DebugLoc DL = MI.getDebugLoc(); - if (!TII->isVGPRSpill(MI.getOpcode())) - continue; - - // Scratch resource - unsigned ScratchRsrcReg = - RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0); - - uint64_t Rsrc23 = TII->getScratchRsrcWords23(); - - unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); - unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); - unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2); - unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0) - .addExternalSymbol("SCRATCH_RSRC_DWORD0") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1) - .addExternalSymbol("SCRATCH_RSRC_DWORD1") - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2) - .addImm(Rsrc23 & 0xffffffff) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3) - .addImm(Rsrc23 >> 32) - .addReg(ScratchRsrcReg, RegState::ImplicitDefine); - - // Scratch Offset - if (ScratchOffsetReg == AMDGPU::NoRegister) { - ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE), - ScratchOffsetReg) - .addFrameIndex(ScratchOffsetFI) - .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef) - .addReg(AMDGPU::SGPR0, RegState::Undef); - } else if (!MBB.isLiveIn(ScratchOffsetReg)) { - MBB.addLiveIn(ScratchOffsetReg); - } - - if (ScratchRsrcReg == AMDGPU::NoRegister || - ScratchOffsetReg == AMDGPU::NoRegister) { - 
LLVMContext &Ctx = MF.getFunction()->getContext(); - Ctx.emitError("ran out of SGPRs for spilling VGPRs"); - ScratchRsrcReg = AMDGPU::SGPR0; - ScratchOffsetReg = AMDGPU::SGPR0; - } - MI.getOperand(2).setReg(ScratchRsrcReg); - MI.getOperand(2).setIsKill(true); - MI.getOperand(2).setIsUndef(false); - MI.getOperand(3).setReg(ScratchOffsetReg); - MI.getOperand(3).setIsUndef(false); - MI.getOperand(3).setIsKill(false); - MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true)); - MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true)); - } - } - return true; -} diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index e9e8412..3cdffef 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// - #include "SIRegisterInfo.h" #include "SIInstrInfo.h" #include "SIMachineFunctionInfo.h" @@ -33,6 +32,40 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co Reserved.set(*R); } +unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + if (ST.hasSGPRInitBug()) { + unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4; + unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx)); + return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the + // next sgpr128 down. + return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95; + } + + return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99; +} + +unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + if (ST.hasSGPRInitBug()) { + unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5; + return AMDGPU::SGPR_32RegClass.getRegister(Idx); + } + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // Next register before reservations for flat_scr and vcc. + return AMDGPU::SGPR97; + } + + return AMDGPU::SGPR95; +} + BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); @@ -42,13 +75,22 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, AMDGPU::EXEC); reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR); - // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs - reserveRegisterTuples(Reserved, AMDGPU::VGPR254); - reserveRegisterTuples(Reserved, AMDGPU::VGPR255); + // Reserve the last 2 registers so we will always have at least 2 more that + // will physically contain VCC. + reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103); + + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + + if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation + // for VCC/FLAT_SCR. 
+ reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99); + reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101); + } // Tonga and Iceland can only allocate a fixed number of SGPRs due // to a hw bug. - if (MF.getSubtarget<AMDGPUSubtarget>().hasSGPRInitBug()) { + if (ST.hasSGPRInitBug()) { unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs(); // Reserve some SGPRs for FLAT_SCRATCH and VCC (4 SGPRs). // Assume XNACK_MASK is unused. @@ -60,34 +102,57 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } } + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + + unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); + if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { + // Reserve 1 SGPR for scratch wave offset in case we need to spill. + reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); + } + + unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); + if (ScratchRSrcReg != AMDGPU::NoRegister) { + // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need + // to spill. + // TODO: May need to reserve a VGPR if doing LDS spilling. + reserveRegisterTuples(Reserved, ScratchRSrcReg); + assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); + } + return Reserved; } unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const { - const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>(); // FIXME: We should adjust the max number of waves based on LDS size. unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(), STI.getMaxWavesPerCU()); unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU()); + unsigned VSLimit = SGPRLimit + VGPRLimit; + for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I) { + const TargetRegisterClass *RC = *I; - unsigned NumSubRegs = std::max((int)(*I)->getSize() / 4, 1); + unsigned NumSubRegs = std::max((int)RC->getSize() / 4, 1); unsigned Limit; - if (isSGPRClass(*I)) { + if (isPseudoRegClass(RC)) { + // FIXME: This is a hack. We should never be considering the pressure of + // these since no virtual register should ever have this class. + Limit = VSLimit; + } else if (isSGPRClass(RC)) { Limit = SGPRLimit / NumSubRegs; } else { Limit = VGPRLimit / NumSubRegs; } - const int *Sets = getRegClassPressureSets(*I); + const int *Sets = getRegClassPressureSets(RC); assert(Sets); for (unsigned i = 0; Sets[i] != -1; ++i) { - if (Sets[i] == (int)Idx) + if (Sets[i] == (int)Idx) return Limit; } } @@ -174,17 +239,17 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI, unsigned SubReg = NumSubRegs > 1 ? 
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) : Value; - bool IsKill = (i == e - 1); BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp)) - .addReg(SubReg, getDefRegState(IsLoad)) - .addReg(ScratchRsrcReg, getKillRegState(IsKill)) - .addReg(SOffset) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)); + .addReg(SubReg, getDefRegState(IsLoad)) + .addReg(ScratchRsrcReg) + .addReg(SOffset) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addReg(Value, RegState::Implicit | getDefRegState(IsLoad)) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); } } @@ -228,6 +293,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addReg(SubReg) .addImm(Spill.Lane); + // FIXME: Since this spills to another register instead of an actual + // frame index, we should delete the frame index when all references to + // it are fixed. } MI->eraseFromParent(); break; @@ -263,16 +331,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // TODO: only do this when it is needed switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) { case AMDGPUSubtarget::SOUTHERN_ISLANDS: - // "VALU writes SGPR" -> "SMRD reads that SGPR" needs "S_NOP 3" on SI - TII->insertNOPs(MI, 3); + // "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states + // ("S_NOP 3") on SI + TII->insertWaitStates(MI, 4); break; case AMDGPUSubtarget::SEA_ISLANDS: break; default: // VOLCANIC_ISLANDS and later - // "VALU writes SGPR -> VMEM reads that SGPR" needs "S_NOP 4" on VI - // and later. This also applies to VALUs which write VCC, but we're - // unlikely to see VMEM use VCC. - TII->insertNOPs(MI, 4); + // "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states + // ("S_NOP 4") on VI and later. This also applies to VALUs which write + // VCC, but we're unlikely to see VMEM use VCC. + TII->insertWaitStates(MI, 5); } MI->eraseFromParent(); @@ -322,22 +391,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, } } -const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( - MVT VT) const { - switch(VT.SimpleTy) { - default: - case MVT::i32: return &AMDGPU::VGPR_32RegClass; - } -} - unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const { return getEncodingValue(Reg) & 0xff; } +// FIXME: This is very slow. It might be worth creating a map from physreg to +// register class. const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { assert(!TargetRegisterInfo::isVirtualRegister(Reg)); - static const TargetRegisterClass *BaseClasses[] = { + static const TargetRegisterClass *const BaseClasses[] = { &AMDGPU::VGPR_32RegClass, &AMDGPU::SReg_32RegClass, &AMDGPU::VReg_64RegClass, @@ -359,33 +422,45 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { return nullptr; } +// TODO: It might be helpful to have some target specific flags in +// TargetRegisterClass to mark which classes are VGPRs to make this trivial. 
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const { - return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) || - getCommonSubClass(&AMDGPU::VReg_512RegClass, RC); + switch (RC->getSize()) { + case 4: + return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr; + case 8: + return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr; + case 12: + return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr; + case 16: + return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr; + case 32: + return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr; + case 64: + return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr; + default: + llvm_unreachable("Invalid register class size"); + } } const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass( const TargetRegisterClass *SRC) const { - if (hasVGPRs(SRC)) { - return SRC; - } else if (SRC == &AMDGPU::SCCRegRegClass) { - return &AMDGPU::VCCRegRegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) { - return &AMDGPU::VGPR_32RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) { - return &AMDGPU::VReg_64RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) { - return &AMDGPU::VReg_128RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) { - return &AMDGPU::VReg_256RegClass; - } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) { - return &AMDGPU::VReg_512RegClass; - } - return nullptr; + switch (SRC->getSize()) { + case 4: + return &AMDGPU::VGPR_32RegClass; + case 8: + return &AMDGPU::VReg_64RegClass; + case 12: + return &AMDGPU::VReg_96RegClass; + case 16: + return &AMDGPU::VReg_128RegClass; + case 32: + return &AMDGPU::VReg_256RegClass; + case 64: + return &AMDGPU::VReg_512RegClass; + default: + llvm_unreachable("Invalid register class size"); + } } const TargetRegisterClass *SIRegisterInfo::getSubRegClass( @@ -402,6 +477,30 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass( } } +bool SIRegisterInfo::shouldRewriteCopySrc( + const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) const { + // We want to prefer the smallest register class possible, so we don't want to + // stop and rewrite on anything that looks like a subregister + // extract. Operations mostly don't care about the super register class, so we + // only want to stop on the most basic of copies between the smae register + // class. + // + // e.g. if we have something like + // vreg0 = ... + // vreg1 = ... + // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2 + // vreg3 = COPY vreg2, sub0 + // + // We want to look through the COPY to find: + // => vreg3 = COPY vreg0 + + // Plain copy. + return getCommonSubClass(DefRC, SrcRC) != nullptr; +} + unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC, unsigned Channel) const { @@ -462,30 +561,47 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { return OpType == AMDGPU::OPERAND_REG_INLINE_C; } +// FIXME: Most of these are flexible with HSA and we don't need to reserve them +// as input registers if unused. Whether the dispatch ptr is necessary should be +// easy to detect from used intrinsics. Scratch setup is harder to know. 
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + (void)ST; switch (Value) { - case SIRegisterInfo::TGID_X: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0); - case SIRegisterInfo::TGID_Y: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1); - case SIRegisterInfo::TGID_Z: - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2); - case SIRegisterInfo::SCRATCH_WAVE_OFFSET: - if (MFI->getShaderType() != ShaderType::COMPUTE) - return MFI->ScratchOffsetReg; - return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4); - case SIRegisterInfo::SCRATCH_PTR: - return AMDGPU::SGPR2_SGPR3; - case SIRegisterInfo::INPUT_PTR: - return AMDGPU::SGPR0_SGPR1; - case SIRegisterInfo::TIDIG_X: + case SIRegisterInfo::WORKGROUP_ID_X: + assert(MFI->hasWorkGroupIDX()); + return MFI->WorkGroupIDXSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Y: + assert(MFI->hasWorkGroupIDY()); + return MFI->WorkGroupIDYSystemSGPR; + case SIRegisterInfo::WORKGROUP_ID_Z: + assert(MFI->hasWorkGroupIDZ()); + return MFI->WorkGroupIDZSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET: + return MFI->PrivateSegmentWaveByteOffsetSystemSGPR; + case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER: + assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations"); + assert(MFI->hasPrivateSegmentBuffer()); + return MFI->PrivateSegmentBufferUserSGPR; + case SIRegisterInfo::KERNARG_SEGMENT_PTR: + assert(MFI->hasKernargSegmentPtr()); + return MFI->KernargSegmentPtrUserSGPR; + case SIRegisterInfo::DISPATCH_PTR: + assert(MFI->hasDispatchPtr()); + return MFI->DispatchPtrUserSGPR; + case SIRegisterInfo::QUEUE_PTR: + llvm_unreachable("not implemented"); + case SIRegisterInfo::WORKITEM_ID_X: + assert(MFI->hasWorkItemIDX()); return AMDGPU::VGPR0; - case SIRegisterInfo::TIDIG_Y: + case SIRegisterInfo::WORKITEM_ID_Y: + assert(MFI->hasWorkItemIDY()); return AMDGPU::VGPR1; - case SIRegisterInfo::TIDIG_Z: + case SIRegisterInfo::WORKITEM_ID_Z: + assert(MFI->hasWorkItemIDZ()); return AMDGPU::VGPR2; } llvm_unreachable("unexpected preloaded value type"); @@ -496,12 +612,9 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, // AMDGPU::NoRegister. unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC) const { - - for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); - I != E; ++I) { - if (!MRI.isPhysRegUsed(*I)) - return *I; - } + for (unsigned Reg : *RC) + if (!MRI.isPhysRegUsed(Reg)) + return Reg; return AMDGPU::NoRegister; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 7da6de2..1795237 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -18,6 +18,7 @@ #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" namespace llvm { @@ -29,6 +30,15 @@ private: public: SIRegisterInfo(); + /// Return the end register initially reserved for the scratch buffer in case + /// spilling is needed. + unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; + + /// Return the end register initially reserved for the scratch wave offset in + /// case spilling is needed. 
+ unsigned reservedPrivateSegmentWaveByteOffsetReg( + const MachineFunction &MF) const; + BitVector getReservedRegs(const MachineFunction &MF) const override; unsigned getRegPressureSetLimit(const MachineFunction &MF, @@ -40,10 +50,6 @@ public: unsigned FIOperandNum, RegScavenger *RS) const override; - /// \brief get the register class of the specified type to use in the - /// CFGStructurizer - const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override; - unsigned getHWRegIndex(unsigned Reg) const override; /// \brief Return the 'base' register class for this register. @@ -52,23 +58,30 @@ public: /// \returns true if this class contains only SGPR registers bool isSGPRClass(const TargetRegisterClass *RC) const { - if (!RC) - return false; - return !hasVGPRs(RC); } /// \returns true if this class ID contains only SGPR registers bool isSGPRClassID(unsigned RCID) const { - if (static_cast<int>(RCID) == -1) - return false; - return isSGPRClass(getRegClass(RCID)); } + bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { + if (TargetRegisterInfo::isVirtualRegister(Reg)) + return isSGPRClass(MRI.getRegClass(Reg)); + return getPhysRegClass(Reg); + } + /// \returns true if this class contains VGPR registers. bool hasVGPRs(const TargetRegisterClass *RC) const; + /// returns true if this is a pseudoregister class combination of VGPRs and + /// SGPRs for operand modeling. FIXME: We should set isAllocatable = 0 on + /// them. + static bool isPseudoRegClass(const TargetRegisterClass *RC) { + return RC == &AMDGPU::VS_32RegClass || RC == &AMDGPU::VS_64RegClass; + } + /// \returns A VGPR reg class with the same width as \p SRC const TargetRegisterClass *getEquivalentVGPRClass( const TargetRegisterClass *SRC) const; @@ -79,6 +92,11 @@ public: const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC, unsigned SubIdx) const; + bool shouldRewriteCopySrc(const TargetRegisterClass *DefRC, + unsigned DefSubReg, + const TargetRegisterClass *SrcRC, + unsigned SrcSubReg) const override; + /// \p Channel This is the register channel (e.g. a value from 0-16), not the /// SubReg index. /// \returns The sub-register of Reg that is in Channel. @@ -91,19 +109,25 @@ public: /// \returns True if operands defined with this operand type can accept /// an inline constant. i.e. An integer value in the range (-16, 64) or - /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. + /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. bool opCanUseInlineConstant(unsigned OpType) const; enum PreloadedValue { - TGID_X, - TGID_Y, - TGID_Z, - SCRATCH_WAVE_OFFSET, - SCRATCH_PTR, - INPUT_PTR, - TIDIG_X, - TIDIG_Y, - TIDIG_Z + // SGPRS: + PRIVATE_SEGMENT_BUFFER = 0, + DISPATCH_PTR = 1, + QUEUE_PTR = 2, + KERNARG_SEGMENT_PTR = 3, + WORKGROUP_ID_X = 10, + WORKGROUP_ID_Y = 11, + WORKGROUP_ID_Z = 12, + PRIVATE_SEGMENT_WAVE_BYTE_OFFSET = 14, + + // VGPRS: + FIRST_VGPR_VALUE = 15, + WORKITEM_ID_X = FIRST_VGPR_VALUE, + WORKITEM_ID_Y = 16, + WORKITEM_ID_Z = 17 }; /// \brief Returns the physical register that \p Value is stored in. 
diff --git a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 2a9017f..bfaf937 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/contrib/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -10,10 +10,13 @@ //===----------------------------------------------------------------------===// // Declarations that describe the SI registers //===----------------------------------------------------------------------===// - -class SIReg <string n, bits<16> encoding = 0> : Register<n> { +class SIReg <string n, bits<16> regIdx = 0> : Register<n>, + DwarfRegNum<[!cast<int>(HWEncoding)]> { let Namespace = "AMDGPU"; - let HWEncoding = encoding; + + // This is the not yet the complete register encoding. An additional + // bit is set for VGPRs. + let HWEncoding = regIdx; } // Special Registers @@ -21,7 +24,8 @@ def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; // VCC for 64-bit instructions -def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { +def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, + DwarfRegAlias<VCC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 106; @@ -30,7 +34,8 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { def EXEC_LO : SIReg<"exec_lo", 126>; def EXEC_HI : SIReg<"exec_hi", 127>; -def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { +def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>, + DwarfRegAlias<EXEC_LO> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 126; @@ -39,18 +44,29 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]> { def SCC : SIReg<"scc", 253>; def M0 : SIReg <"m0", 124>; -def FLAT_SCR_LO : SIReg<"flat_scr_lo", 104>; // Offset in units of 256-bytes. -def FLAT_SCR_HI : SIReg<"flat_scr_hi", 105>; // Size is the per-thread scratch size, in bytes. +multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> { + def _ci : SIReg<n, ci_e>; + def _vi : SIReg<n, vi_e>; + def "" : SIReg<"", 0>; +} -// Pair to indicate location of scratch space for flat accesses. -def FLAT_SCR : RegisterWithSubRegs <"flat_scr", [FLAT_SCR_LO, FLAT_SCR_HI]> { +class FlatReg <Register lo, Register hi, bits<16> encoding> : + RegisterWithSubRegs<"flat_scratch", [lo, hi]>, + DwarfRegAlias<lo> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; - let HWEncoding = 104; + let HWEncoding = encoding; } +defm FLAT_SCR_LO : FLAT_SCR_LOHI_m<"flat_scratch_lo", 104, 102>; // Offset in units of 256-bytes. +defm FLAT_SCR_HI : FLAT_SCR_LOHI_m<"flat_scratch_hi", 105, 103>; // Size is the per-thread scratch size, in bytes. + +def FLAT_SCR_ci : FlatReg<FLAT_SCR_LO_ci, FLAT_SCR_HI_ci, 104>; +def FLAT_SCR_vi : FlatReg<FLAT_SCR_LO_vi, FLAT_SCR_HI_vi, 102>; +def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>; + // SGPR registers -foreach Index = 0-101 in { +foreach Index = 0-103 in { def SGPR#Index : SIReg <"SGPR"#Index, Index>; } @@ -65,25 +81,27 @@ foreach Index = 0-255 in { // Groupings using register classes and tuples //===----------------------------------------------------------------------===// +// TODO: Do we need to set DwarfRegAlias on register tuples? 
+ // SGPR 32-bit registers def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32, - (add (sequence "SGPR%u", 0, 101))>; + (add (sequence "SGPR%u", 0, 103))>; // SGPR 64-bit registers def SGPR_64Regs : RegisterTuples<[sub0, sub1], - [(add (decimate (trunc SGPR_32, 101), 2)), + [(add (decimate SGPR_32, 2)), (add (decimate (shl SGPR_32, 1), 2))]>; // SGPR 128-bit registers def SGPR_128 : RegisterTuples<[sub0, sub1, sub2, sub3], - [(add (decimate (trunc SGPR_32, 99), 4)), + [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), (add (decimate (shl SGPR_32, 3), 4))]>; // SGPR 256-bit registers def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], - [(add (decimate (trunc SGPR_32, 95), 4)), + [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), (add (decimate (shl SGPR_32, 3), 4)), @@ -95,7 +113,7 @@ def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], // SGPR 512-bit registers def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15], - [(add (decimate (trunc SGPR_32, 87), 4)), + [(add (decimate SGPR_32, 4)), (add (decimate (shl SGPR_32, 1), 4)), (add (decimate (shl SGPR_32, 2), 4)), (add (decimate (shl SGPR_32, 3), 4)), @@ -174,44 +192,57 @@ class RegImmMatcher<string name> : AsmOperandClass { let RenderMethod = "addRegOrImmOperands"; } -// Special register classes for predicates and the M0 register -def SCCReg : RegisterClass<"AMDGPU", [i32, i1], 32, (add SCC)> { - let CopyCost = -1; // Theoretically it is possible to read from SCC, - // but it should never be necessary. -} - -def VCCReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add VCC)>; -def EXECReg : RegisterClass<"AMDGPU", [i64, i1], 64, (add EXEC)>; - // Register class for all scalar registers (SGPRs + Special Registers) def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SGPR_32, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI) >; -def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 64, (add SGPR_64Regs)>; +def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)>; -def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 64, - (add SGPR_64, VCCReg, EXECReg, FLAT_SCR) +def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32, + (add SGPR_64, VCC, EXEC, FLAT_SCR) >; -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8], 128, (add SGPR_128)>; +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128)> { + // Requires 2 s_mov_b64 to copy + let CopyCost = 2; +} -def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>; +def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add SGPR_256)> { + // Requires 4 s_mov_b64 to copy + let CopyCost = 4; +} -def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>; +def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> { + // Requires 8 s_mov_b64 to copy + let CopyCost = 8; +} // Register class for all vector registers (VGPRs + Interploation Registers) -def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 64, (add VGPR_64)>; +def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> { + // Requires 2 v_mov_b32 to copy + let CopyCost = 2; +} -def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> { +def VReg_96 : RegisterClass<"AMDGPU", [untyped], 32, (add VGPR_96)> 
{ let Size = 96; + + // Requires 3 v_mov_b32 to copy + let CopyCost = 3; } -def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>; +def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32, (add VGPR_128)> { + // Requires 4 v_mov_b32 to copy + let CopyCost = 4; +} -def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>; +def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 32, (add VGPR_256)> { + let CopyCost = 8; +} -def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>; +def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add VGPR_512)> { + let CopyCost = 16; +} def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> { let Size = 32; @@ -253,7 +284,9 @@ def SCSrc_32 : RegInlineOperand<SReg_32> { def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)>; -def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>; +def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> { + let CopyCost = 2; +} def VSrc_32 : RegisterOperand<VS_32> { let OperandNamespace = "AMDGPU"; @@ -282,3 +315,13 @@ def VCSrc_64 : RegisterOperand<VS_64> { let OperandType = "OPERAND_REG_INLINE_C"; let ParserMatchClass = RegImmMatcher<"VCSrc64">; } + +//===----------------------------------------------------------------------===// +// SCSrc_* Operands with an SGPR or an inline constant +//===----------------------------------------------------------------------===// + +def SCSrc_64 : RegisterOperand<SReg_64> { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_REG_INLINE_C"; + let ParserMatchClass = RegImmMatcher<"SCSrc64">; +} diff --git a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td index 9b1f676..cd77e51 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/contrib/llvm/lib/Target/AMDGPU/SISchedule.td @@ -17,16 +17,28 @@ def WriteLDS : SchedWrite; def WriteSALU : SchedWrite; def WriteSMEM : SchedWrite; def WriteVMEM : SchedWrite; +def WriteBarrier : SchedWrite; // Vector ALU instructions def Write32Bit : SchedWrite; def WriteQuarterRate32 : SchedWrite; +def WriteFullOrQuarterRate32 : SchedWrite; def WriteFloatFMA : SchedWrite; -def WriteDouble : SchedWrite; +// Slow quarter rate f64 instruction. +def WriteDouble : SchedWrite; + +// half rate f64 instruction (same as v_add_f64) def WriteDoubleAdd : SchedWrite; +// Half rate 64-bit instructions. +def Write64Bit : SchedWrite; + +// FIXME: Should there be a class for instructions which are VALU +// instructions and have VALU rates, but write to the SALU (i.e. VOPC +// instructions) + def SIFullSpeedModel : SchedMachineModel; def SIQuarterSpeedModel : SchedMachineModel; @@ -53,7 +65,7 @@ class HWVALUWriteRes<SchedWrite write, int latency> : // The latency numbers are taken from AMD Accelerated Parallel Processing -// guide. They may not be acurate. +// guide. They may not be accurate. // The latency values are 1 / (operations / cycle) / 4. multiclass SICommonWriteRes { @@ -64,8 +76,10 @@ multiclass SICommonWriteRes { def : HWWriteRes<WriteSALU, [HWSALU], 1>; def : HWWriteRes<WriteSMEM, [HWLGKM], 10>; // XXX: Guessed ??? def : HWWriteRes<WriteVMEM, [HWVMEM], 450>; // 300 - 600 + def : HWWriteRes<WriteBarrier, [HWBranch], 500>; // XXX: Guessed ??? 
def : HWVALUWriteRes<Write32Bit, 1>; + def : HWVALUWriteRes<Write64Bit, 2>; def : HWVALUWriteRes<WriteQuarterRate32, 4>; } diff --git a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 5d00bdd..4f0913f 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -141,8 +141,7 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, if (!MRI.isSSA()) return; - assert(TII->isVOP1(MI.getOpcode()) || TII->isVOP2(MI.getOpcode()) || - TII->isVOPC(MI.getOpcode())); + assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); const SIRegisterInfo &TRI = TII->getRegisterInfo(); int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); @@ -187,6 +186,21 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, } +// Copy MachineOperand with all flags except setting it as implicit. +static MachineOperand copyRegOperandAsImplicit(const MachineOperand &Orig) { + assert(!Orig.isImplicit()); + return MachineOperand::CreateReg(Orig.getReg(), + Orig.isDef(), + true, + Orig.isKill(), + Orig.isDead(), + Orig.isUndef(), + Orig.isEarlyClobber(), + Orig.getSubReg(), + Orig.isDebug(), + Orig.isInternalRead()); +} + bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const SIInstrInfo *TII = @@ -236,14 +250,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { if (TII->isVOPC(Op32)) { unsigned DstReg = MI.getOperand(0).getReg(); if (TargetRegisterInfo::isVirtualRegister(DstReg)) { - // VOPC instructions can only write to the VCC register. We can't - // force them to use VCC here, because the register allocator has - // trouble with sequences like this, which cause the allocator to run - // out of registers if vreg0 and vreg1 belong to the VCCReg register - // class: - // vreg0 = VOPC; - // vreg1 = VOPC; - // S_AND_B64 vreg0, vreg1 + // VOPC instructions can only write to the VCC register. We can't + // force them to use VCC here, because this is only one register and + // cannot deal with sequences which would require multiple copies of + // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...) // // So, instead of forcing the instruction to write to VCC, we provide // a hint to the register allocator to use VCC and then we we will run @@ -272,13 +282,22 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } // We can shrink this instruction - DEBUG(dbgs() << "Shrinking "; MI.dump(); dbgs() << '\n';); + DEBUG(dbgs() << "Shrinking " << MI); MachineInstrBuilder Inst32 = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(Op32)); - // dst - Inst32.addOperand(MI.getOperand(0)); + // Add the dst operand if the 32-bit encoding also has an explicit $dst. + // For VOPC instructions, this is replaced by an implicit def of vcc. 
+ int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::dst); + if (Op32DstIdx != -1) { + // dst + Inst32.addOperand(MI.getOperand(0)); + } else { + assert(MI.getOperand(0).getReg() == AMDGPU::VCC && + "Unexpected case"); + } + Inst32.addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::src0)); @@ -288,9 +307,19 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { Inst32.addOperand(*Src1); const MachineOperand *Src2 = - TII->getNamedOperand(MI, AMDGPU::OpName::src2); - if (Src2) - Inst32.addOperand(*Src2); + TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (Src2) { + int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2); + if (Op32Src2Idx != -1) { + Inst32.addOperand(*Src2); + } else { + // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is + // replaced with an implicit read of vcc. + assert(Src2->getReg() == AMDGPU::VCC && + "Unexpected missing register operand"); + Inst32.addOperand(copyRegOperandAsImplicit(*Src2)); + } + } ++NumInstructionsShrunk; MI.eraseFromParent(); diff --git a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp index 591ce85..dbdc76b 100644 --- a/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/SITypeRewriter.cpp @@ -22,6 +22,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" @@ -61,14 +62,7 @@ bool SITypeRewriter::doInitialization(Module &M) { } bool SITypeRewriter::runOnFunction(Function &F) { - Attribute A = F.getFnAttribute("ShaderType"); - - unsigned ShaderType = ShaderType::COMPUTE; - if (A.isStringAttribute()) { - StringRef Str = A.getValueAsString(); - Str.getAsInteger(0, ShaderType); - } - if (ShaderType == ShaderType::COMPUTE) + if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE) return false; visit(F); diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b76b400..add415e 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -7,12 +7,23 @@ // //===----------------------------------------------------------------------===// #include "AMDGPUBaseInfo.h" +#include "AMDGPU.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/SubtargetFeature.h" #define GET_SUBTARGETINFO_ENUM #include "AMDGPUGenSubtargetInfo.inc" #undef GET_SUBTARGETINFO_ENUM +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" +#undef GET_REGINFO_ENUM + namespace llvm { namespace AMDGPU { @@ -56,5 +67,91 @@ void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, Header.private_segment_alignment = 4; } +MCSection *getHSATextSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsatext", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_EXECINSTR | + ELF::SHF_AMDGPU_HSA_AGENT | + ELF::SHF_AMDGPU_HSA_CODE); +} + +MCSection *getHSADataGlobalAgentSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsadata_global_agent", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_AMDGPU_HSA_GLOBAL | + ELF::SHF_AMDGPU_HSA_AGENT); +} + +MCSection *getHSADataGlobalProgramSection(MCContext &Ctx) { + return 
Ctx.getELFSection(".hsadata_global_program", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE | + ELF::SHF_AMDGPU_HSA_GLOBAL); +} + +MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) { + return Ctx.getELFSection(".hsarodata_readonly_agent", ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_AMDGPU_HSA_READONLY | + ELF::SHF_AMDGPU_HSA_AGENT); +} + +bool isGroupSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; +} + +bool isGlobalSegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; +} + +bool isReadOnlySegment(const GlobalValue *GV) { + return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; +} + +static const char ShaderTypeAttribute[] = "ShaderType"; + +unsigned getShaderType(const Function &F) { + Attribute A = F.getFnAttribute(ShaderTypeAttribute); + unsigned ShaderType = ShaderType::COMPUTE; + + if (A.isStringAttribute()) { + StringRef Str = A.getValueAsString(); + if (Str.getAsInteger(0, ShaderType)) { + LLVMContext &Ctx = F.getContext(); + Ctx.emitError("can't parse shader type"); + } + } + return ShaderType; +} + +bool isSI(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands]; +} + +bool isCI(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureSeaIslands]; +} + +bool isVI(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; +} + +unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { + + switch(Reg) { + default: break; + case AMDGPU::FLAT_SCR: + assert(!isSI(STI)); + return isCI(STI) ? AMDGPU::FLAT_SCR_ci : AMDGPU::FLAT_SCR_vi; + + case AMDGPU::FLAT_SCR_LO: + assert(!isSI(STI)); + return isCI(STI) ? AMDGPU::FLAT_SCR_LO_ci : AMDGPU::FLAT_SCR_LO_vi; + + case AMDGPU::FLAT_SCR_HI: + assert(!isSI(STI)); + return isCI(STI) ? AMDGPU::FLAT_SCR_HI_ci : AMDGPU::FLAT_SCR_HI_vi; + } + return Reg; +} + } // End namespace AMDGPU } // End namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index f57028c..19419a2 100644 --- a/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -15,6 +15,11 @@ namespace llvm { class FeatureBitset; +class Function; +class GlobalValue; +class MCContext; +class MCSection; +class MCSubtargetInfo; namespace AMDGPU { @@ -27,6 +32,27 @@ struct IsaVersion { IsaVersion getIsaVersion(const FeatureBitset &Features); void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const FeatureBitset &Features); +MCSection *getHSATextSection(MCContext &Ctx); + +MCSection *getHSADataGlobalAgentSection(MCContext &Ctx); + +MCSection *getHSADataGlobalProgramSection(MCContext &Ctx); + +MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx); + +bool isGroupSegment(const GlobalValue *GV); +bool isGlobalSegment(const GlobalValue *GV); +bool isReadOnlySegment(const GlobalValue *GV); + +unsigned getShaderType(const Function &F); + +bool isSI(const MCSubtargetInfo &STI); +bool isCI(const MCSubtargetInfo &STI); +bool isVI(const MCSubtargetInfo &STI); + +/// If \p Reg is a pseudo reg, return the correct hardware register given +/// \p STI otherwise return \p Reg. 
+unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI); } // end namespace AMDGPU } // end namespace llvm diff --git a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td index aca4673..20a026a 100644 --- a/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td +++ b/contrib/llvm/lib/Target/AMDGPU/VIInstructions.td @@ -73,8 +73,8 @@ defm V_MIN_I16 : VOP2Inst <vop2<0,0x32>, "v_min_i16", VOP_I16_I16_I16>; } // End isCommutable = 1 defm V_LDEXP_F16 : VOP2Inst <vop2<0,0x33>, "v_ldexp_f16", VOP_F16_F16_I16>; -// Aliases to simplify matching of floating-pint instructions that are VOP2 on -// SI and VOP3 on VI. +// Aliases to simplify matching of floating-point instructions that +// are VOP2 on SI and VOP3 on VI. class SI2_VI3Alias <string name, Instruction inst> : InstAlias < name#" $dst, $src0, $src1", @@ -89,60 +89,15 @@ def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>; def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>; def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>; -} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI - //===----------------------------------------------------------------------===// -// SMEM Patterns +// SMEM Instructions //===----------------------------------------------------------------------===// -let Predicates = [isVI] in { +def S_DCACHE_WB : SMEM_Inval <0x21, + "s_dcache_wb", int_amdgcn_s_dcache_wb>; -// 1. Offset as 20bit DWORD immediate -def : Pat < - (SIload_constant v4i32:$sbase, IMM20bit:$offset), - (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_i32imm $offset)) ->; - -// Patterns for global loads with no offset -class FlatLoadPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node i64:$addr)), - (inst $addr, 0, 0, 0) ->; - -def : FlatLoadPat <FLAT_LOAD_UBYTE, az_extloadi8_global, i32>; -def : FlatLoadPat <FLAT_LOAD_SBYTE, sextloadi8_global, i32>; -def : FlatLoadPat <FLAT_LOAD_USHORT, az_extloadi16_global, i32>; -def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_global, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORD, global_load, i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX2, global_load, v2i32>; -def : FlatLoadPat <FLAT_LOAD_DWORDX4, global_load, v4i32>; - -class FlatStorePat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (node vt:$data, i64:$addr), - (inst $data, $addr, 0, 0, 0) ->; - -def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_global, i32>; -def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_global, i32>; -def : FlatStorePat <FLAT_STORE_DWORD, global_store, i32>; -def : FlatStorePat <FLAT_STORE_DWORDX2, global_store, v2i32>; -def : FlatStorePat <FLAT_STORE_DWORDX4, global_store, v4i32>; - -class FlatAtomicPat <FLAT inst, SDPatternOperator node, ValueType vt> : Pat < - (vt (node i64:$addr, vt:$data)), - (inst $addr, $data, 0, 0) ->; - -def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>; -def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, 
i32>; +def S_DCACHE_WB_VOL : SMEM_Inval <0x23, + "s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>; +} // End SIAssemblerPredicate = DisableInst, SubtargetPredicate = isVI -} // End Predicates = [isVI] diff --git a/contrib/llvm/lib/Target/ARM/ARM.h b/contrib/llvm/lib/Target/ARM/ARM.h index 9550a3a..cd7540e 100644 --- a/contrib/llvm/lib/Target/ARM/ARM.h +++ b/contrib/llvm/lib/Target/ARM/ARM.h @@ -35,7 +35,6 @@ FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, FunctionPass *createA15SDOptimizerPass(); FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); FunctionPass *createARMExpandPseudoPass(); -FunctionPass *createARMGlobalBaseRegPass(); FunctionPass *createARMConstantIslandPass(); FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); diff --git a/contrib/llvm/lib/Target/ARM/ARM.td b/contrib/llvm/lib/Target/ARM/ARM.td index ef609a6..a44dc83 100644 --- a/contrib/llvm/lib/Target/ARM/ARM.td +++ b/contrib/llvm/lib/Target/ARM/ARM.td @@ -17,6 +17,17 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// +// ARM Helper classes. +// + +class ProcNoItin<string Name, list<SubtargetFeature> Features> + : Processor<Name, NoItineraries, Features>; + +class Architecture<string fname, string aname, list<SubtargetFeature> features > + : SubtargetFeature<fname, "ARMArch", aname, + !strconcat(aname, " architecture"), features>; + +//===----------------------------------------------------------------------===// // ARM Subtarget state. // @@ -51,8 +62,11 @@ def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true", "Enable ARMv8 FP", [FeatureVFP4]>; +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Enable full half-precision floating point", + [FeatureFPARMv8]>; def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", - "Restrict VFP3 to 16 double registers">; + "Restrict FP to 16 double registers">; def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true", "Enable divide instructions">; def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", @@ -119,9 +133,9 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", def FeatureHasRAS : SubtargetFeature<"ras", "HasRAS", "true", "Has return address stack">; -/// Some M architectures don't have the DSP extension (v7E-M vs. v7M) -def FeatureDSPThumb2 : SubtargetFeature<"t2dsp", "Thumb2DSP", "true", - "Supports v7 DSP instructions in Thumb2">; +/// DSP extension. +def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", + "Supports DSP instructions in ARM and/or Thumb2">; // Multiprocessing extension. def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", @@ -150,11 +164,28 @@ def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass", def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", "NaCl trap">; +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", "Generate calls via indirect call " "instructions">; -// ARM ISAs. 
+def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true",
+                                        "Reserve R9, making it unavailable as "
+                                        "GPR">;
+
+def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true",
+                                     "Don't use movt/movw pairs for 32-bit "
+                                     "imms">;
+
+
+//===----------------------------------------------------------------------===//
+// ARM ISAs.
+//
+
 def HasV4TOps   : SubtargetFeature<"v4t", "HasV4TOps", "true",
                                    "Support ARM v4T instructions">;
 def HasV5TOps   : SubtargetFeature<"v5t", "HasV5TOps", "true",
@@ -180,302 +211,444 @@ def HasV7Ops     : SubtargetFeature<"v7", "HasV7Ops", "true",
                                    [HasV6T2Ops, FeaturePerfMon]>;
 def HasV8Ops     : SubtargetFeature<"v8", "HasV8Ops", "true",
                                    "Support ARM v8 instructions",
-                                   [HasV7Ops, FeatureVirtualization,
-                                    FeatureMP]>;
+                                   [HasV7Ops]>;
 def HasV8_1aOps  : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
                                    "Support ARM v8.1a instructions",
-                                   [HasV8Ops, FeatureAClass, FeatureCRC]>;
+                                   [HasV8Ops]>;
+def HasV8_2aOps  : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true",
+                                   "Support ARM v8.2a instructions",
+                                   [HasV8_1aOps]>;
+
 
 //===----------------------------------------------------------------------===//
-// ARM Processors supported.
+// ARM Processor subtarget features.
 //
 
-include "ARMSchedule.td"
-
-// ARM processor families.
 def ProcA5      : SubtargetFeature<"a5", "ARMProcFamily", "CortexA5",
-                                   "Cortex-A5 ARM processors",
-                                   [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
-                                    FeatureVMLxForwarding, FeatureT2XtPk,
-                                    FeatureTrustZone, FeatureMP]>;
+                                   "Cortex-A5 ARM processors", []>;
 def ProcA7      : SubtargetFeature<"a7", "ARMProcFamily", "CortexA7",
-                                   "Cortex-A7 ARM processors",
-                                   [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
-                                    FeatureVMLxForwarding, FeatureT2XtPk,
-                                    FeatureVFP4, FeatureMP,
-                                    FeatureHWDiv, FeatureHWDivARM,
-                                    FeatureTrustZone, FeatureVirtualization]>;
+                                   "Cortex-A7 ARM processors", []>;
 def ProcA8      : SubtargetFeature<"a8", "ARMProcFamily", "CortexA8",
-                                   "Cortex-A8 ARM processors",
-                                   [FeatureSlowFPBrcc, FeatureHasSlowFPVMLx,
-                                    FeatureVMLxForwarding, FeatureT2XtPk,
-                                    FeatureTrustZone]>;
+                                   "Cortex-A8 ARM processors", []>;
 def ProcA9      : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
-                                   "Cortex-A9 ARM processors",
-                                   [FeatureVMLxForwarding,
-                                    FeatureT2XtPk, FeatureFP16,
-                                    FeatureAvoidPartialCPSR,
-                                    FeatureTrustZone]>;
-def ProcSwift   : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
-                                   "Swift ARM processors",
-                                   [FeatureNEONForFP, FeatureT2XtPk,
-                                    FeatureVFP4, FeatureMP, FeatureHWDiv,
-                                    FeatureHWDivARM, FeatureAvoidPartialCPSR,
-                                    FeatureAvoidMOVsShOp,
-                                    FeatureHasSlowFPVMLx, FeatureTrustZone]>;
+                                   "Cortex-A9 ARM processors", []>;
 def ProcA12     : SubtargetFeature<"a12", "ARMProcFamily", "CortexA12",
-                                   "Cortex-A12 ARM processors",
-                                   [FeatureVMLxForwarding,
-                                    FeatureT2XtPk, FeatureVFP4,
-                                    FeatureHWDiv, FeatureHWDivARM,
-                                    FeatureAvoidPartialCPSR,
-                                    FeatureVirtualization,
-                                    FeatureTrustZone]>;
-
-
-// FIXME: It has not been determined if A15 has these features.
-def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", - "Cortex-A15 ARM processors", - [FeatureT2XtPk, FeatureVFP4, - FeatureMP, FeatureHWDiv, FeatureHWDivARM, - FeatureAvoidPartialCPSR, - FeatureTrustZone, FeatureVirtualization]>; - + "Cortex-A12 ARM processors", []>; +def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15", + "Cortex-A15 ARM processors", []>; def ProcA17 : SubtargetFeature<"a17", "ARMProcFamily", "CortexA17", - "Cortex-A17 ARM processors", - [FeatureVMLxForwarding, - FeatureT2XtPk, FeatureVFP4, - FeatureHWDiv, FeatureHWDivARM, - FeatureAvoidPartialCPSR, - FeatureVirtualization, - FeatureTrustZone]>; - + "Cortex-A17 ARM processors", []>; +def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", + "Cortex-A35 ARM processors", []>; def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", - "Cortex-A53 ARM processors", - [FeatureHWDiv, FeatureHWDivARM, - FeatureTrustZone, FeatureT2XtPk, - FeatureCrypto, FeatureCRC]>; - + "Cortex-A53 ARM processors", []>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", - "Cortex-A57 ARM processors", - [FeatureHWDiv, FeatureHWDivARM, - FeatureTrustZone, FeatureT2XtPk, - FeatureCrypto, FeatureCRC]>; + "Cortex-A57 ARM processors", []>; +def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", + "Cortex-A72 ARM processors", []>; -def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4", - "Cortex-R4 ARM processors", - [FeatureHWDiv, - FeatureAvoidPartialCPSR, - FeatureDSPThumb2, FeatureT2XtPk, - HasV7Ops, FeatureDB, FeatureHasRAS, - FeatureRClass]>; +def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait", + "Qualcomm ARM processors", []>; +def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift", + "Swift ARM processors", []>; + +def ProcR4 : SubtargetFeature<"r4", "ARMProcFamily", "CortexR4", + "Cortex-R4 ARM processors", []>; def ProcR5 : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5", - "Cortex-R5 ARM processors", - [FeatureSlowFPBrcc, - FeatureHWDiv, FeatureHWDivARM, - FeatureHasSlowFPVMLx, - FeatureAvoidPartialCPSR, - FeatureT2XtPk]>; - -// FIXME: krait has currently the same features as A9 -// plus VFP4 and hardware division features. -def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait", - "Qualcomm ARM processors", - [FeatureVMLxForwarding, - FeatureT2XtPk, FeatureFP16, - FeatureAvoidPartialCPSR, - FeatureTrustZone, - FeatureVFP4, - FeatureHWDiv, - FeatureHWDivARM]>; + "Cortex-R5 ARM processors", []>; +def ProcR7 : SubtargetFeature<"r7", "ARMProcFamily", "CortexR7", + "Cortex-R7 ARM processors", []>; -class ProcNoItin<string Name, list<SubtargetFeature> Features> - : Processor<Name, NoItineraries, Features>; +//===----------------------------------------------------------------------===// +// ARM schedules. 
+// + +include "ARMSchedule.td" + + +//===----------------------------------------------------------------------===// +// ARM architectures +// + +def ARMv2 : Architecture<"armv2", "ARMv2", []>; + +def ARMv2a : Architecture<"armv2a", "ARMv2a", []>; + +def ARMv3 : Architecture<"armv3", "ARMv3", []>; + +def ARMv3m : Architecture<"armv3m", "ARMv3m", []>; + +def ARMv4 : Architecture<"armv4", "ARMv4", []>; + +def ARMv4t : Architecture<"armv4t", "ARMv4t", [HasV4TOps]>; + +def ARMv5t : Architecture<"armv5t", "ARMv5t", [HasV5TOps]>; + +def ARMv5te : Architecture<"armv5te", "ARMv5te", [HasV5TEOps]>; + +def ARMv5tej : Architecture<"armv5tej", "ARMv5tej", [HasV5TEOps]>; + +def ARMv6 : Architecture<"armv6", "ARMv6", [HasV6Ops]>; + +def ARMv6t2 : Architecture<"armv6t2", "ARMv6t2", [HasV6T2Ops, + FeatureDSP]>; + +def ARMv6k : Architecture<"armv6k", "ARMv6k", [HasV6KOps]>; + +def ARMv6kz : Architecture<"armv6kz", "ARMv6kz", [HasV6KOps, + FeatureTrustZone]>; + +def ARMv6m : Architecture<"armv6-m", "ARMv6m", [HasV6MOps, + FeatureNoARM, + FeatureDB, + FeatureMClass]>; + +def ARMv6sm : Architecture<"armv6s-m", "ARMv6sm", [HasV6MOps, + FeatureNoARM, + FeatureDB, + FeatureMClass]>; + +def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops, + FeatureNEON, + FeatureDB, + FeatureDSP, + FeatureAClass]>; + +def ARMv7r : Architecture<"armv7-r", "ARMv7r", [HasV7Ops, + FeatureDB, + FeatureDSP, + FeatureHWDiv, + FeatureRClass]>; + +def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops, + FeatureThumb2, + FeatureNoARM, + FeatureDB, + FeatureHWDiv, + FeatureMClass]>; + +def ARMv7em : Architecture<"armv7e-m", "ARMv7em", [HasV7Ops, + FeatureThumb2, + FeatureNoARM, + FeatureDB, + FeatureHWDiv, + FeatureMClass, + FeatureDSP, + FeatureT2XtPk]>; + +def ARMv8a : Architecture<"armv8-a", "ARMv8a", [HasV8Ops, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + +def ARMv81a : Architecture<"armv8.1-a", "ARMv81a", [HasV8_1aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + +def ARMv82a : Architecture<"armv8.2-a", "ARMv82a", [HasV8_2aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCrypto, + FeatureCRC]>; + +// Aliases +def IWMMXT : Architecture<"iwmmxt", "ARMv5te", [ARMv5te]>; +def IWMMXT2 : Architecture<"iwmmxt2", "ARMv5te", [ARMv5te]>; +def XScale : Architecture<"xscale", "ARMv5te", [ARMv5te]>; +def ARMv6j : Architecture<"armv6j", "ARMv7a", [ARMv6]>; +def ARMv7k : Architecture<"armv7k", "ARMv7a", [ARMv7a]>; +def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; + + +//===----------------------------------------------------------------------===// +// ARM processors +// + +// Dummy CPU, used to target architectures +def : ProcNoItin<"generic", []>; + +def : ProcNoItin<"arm8", [ARMv4]>; +def : ProcNoItin<"arm810", [ARMv4]>; +def : ProcNoItin<"strongarm", [ARMv4]>; +def : ProcNoItin<"strongarm110", [ARMv4]>; +def : ProcNoItin<"strongarm1100", [ARMv4]>; +def : ProcNoItin<"strongarm1110", [ARMv4]>; + +def : ProcNoItin<"arm7tdmi", [ARMv4t]>; +def : ProcNoItin<"arm7tdmi-s", [ARMv4t]>; +def : ProcNoItin<"arm710t", [ARMv4t]>; +def : ProcNoItin<"arm720t", [ARMv4t]>; +def : ProcNoItin<"arm9", [ARMv4t]>; +def : ProcNoItin<"arm9tdmi", [ARMv4t]>; +def : ProcNoItin<"arm920", [ARMv4t]>; +def : 
ProcNoItin<"arm920t", [ARMv4t]>; +def : ProcNoItin<"arm922t", [ARMv4t]>; +def : ProcNoItin<"arm940t", [ARMv4t]>; +def : ProcNoItin<"ep9312", [ARMv4t]>; + +def : ProcNoItin<"arm10tdmi", [ARMv5t]>; +def : ProcNoItin<"arm1020t", [ARMv5t]>; + +def : ProcNoItin<"arm9e", [ARMv5te]>; +def : ProcNoItin<"arm926ej-s", [ARMv5te]>; +def : ProcNoItin<"arm946e-s", [ARMv5te]>; +def : ProcNoItin<"arm966e-s", [ARMv5te]>; +def : ProcNoItin<"arm968e-s", [ARMv5te]>; +def : ProcNoItin<"arm10e", [ARMv5te]>; +def : ProcNoItin<"arm1020e", [ARMv5te]>; +def : ProcNoItin<"arm1022e", [ARMv5te]>; +def : ProcNoItin<"xscale", [ARMv5te]>; +def : ProcNoItin<"iwmmxt", [ARMv5te]>; + +def : Processor<"arm1136j-s", ARMV6Itineraries, [ARMv6]>; +def : Processor<"arm1136jf-s", ARMV6Itineraries, [ARMv6, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +def : Processor<"cortex-m0", ARMV6Itineraries, [ARMv6m]>; +def : Processor<"cortex-m0plus", ARMV6Itineraries, [ARMv6m]>; +def : Processor<"cortex-m1", ARMV6Itineraries, [ARMv6m]>; +def : Processor<"sc000", ARMV6Itineraries, [ARMv6m]>; + +def : Processor<"arm1176jz-s", ARMV6Itineraries, [ARMv6kz]>; +def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ARMv6kz, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +def : Processor<"mpcorenovfp", ARMV6Itineraries, [ARMv6k]>; +def : Processor<"mpcore", ARMV6Itineraries, [ARMv6k, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; + +def : Processor<"arm1156t2-s", ARMV6Itineraries, [ARMv6t2]>; +def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2, + FeatureVFP2, + FeatureHasSlowFPVMLx]>; -// V4 Processors. -def : ProcNoItin<"generic", []>; -def : ProcNoItin<"arm8", []>; -def : ProcNoItin<"arm810", []>; -def : ProcNoItin<"strongarm", []>; -def : ProcNoItin<"strongarm110", []>; -def : ProcNoItin<"strongarm1100", []>; -def : ProcNoItin<"strongarm1110", []>; - -// V4T Processors. -def : ProcNoItin<"arm7tdmi", [HasV4TOps]>; -def : ProcNoItin<"arm7tdmi-s", [HasV4TOps]>; -def : ProcNoItin<"arm710t", [HasV4TOps]>; -def : ProcNoItin<"arm720t", [HasV4TOps]>; -def : ProcNoItin<"arm9", [HasV4TOps]>; -def : ProcNoItin<"arm9tdmi", [HasV4TOps]>; -def : ProcNoItin<"arm920", [HasV4TOps]>; -def : ProcNoItin<"arm920t", [HasV4TOps]>; -def : ProcNoItin<"arm922t", [HasV4TOps]>; -def : ProcNoItin<"arm940t", [HasV4TOps]>; -def : ProcNoItin<"ep9312", [HasV4TOps]>; - -// V5T Processors. -def : ProcNoItin<"arm10tdmi", [HasV5TOps]>; -def : ProcNoItin<"arm1020t", [HasV5TOps]>; - -// V5TE Processors. -def : ProcNoItin<"arm9e", [HasV5TEOps]>; -def : ProcNoItin<"arm926ej-s", [HasV5TEOps]>; -def : ProcNoItin<"arm946e-s", [HasV5TEOps]>; -def : ProcNoItin<"arm966e-s", [HasV5TEOps]>; -def : ProcNoItin<"arm968e-s", [HasV5TEOps]>; -def : ProcNoItin<"arm10e", [HasV5TEOps]>; -def : ProcNoItin<"arm1020e", [HasV5TEOps]>; -def : ProcNoItin<"arm1022e", [HasV5TEOps]>; -def : ProcNoItin<"xscale", [HasV5TEOps]>; -def : ProcNoItin<"iwmmxt", [HasV5TEOps]>; - -// V6 Processors. -def : Processor<"arm1136j-s", ARMV6Itineraries, [HasV6Ops]>; -def : Processor<"arm1136jf-s", ARMV6Itineraries, [HasV6Ops, FeatureVFP2, - FeatureHasSlowFPVMLx]>; - -// V6M Processors. -def : Processor<"cortex-m0", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, - FeatureDB, FeatureMClass]>; -def : Processor<"cortex-m0plus", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, - FeatureDB, FeatureMClass]>; -def : Processor<"cortex-m1", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, - FeatureDB, FeatureMClass]>; -def : Processor<"sc000", ARMV6Itineraries, [HasV6MOps, FeatureNoARM, - FeatureDB, FeatureMClass]>; - -// V6K Processors. 
-def : Processor<"arm1176jz-s", ARMV6Itineraries, [HasV6KOps]>; -def : Processor<"arm1176jzf-s", ARMV6Itineraries, [HasV6KOps, FeatureVFP2, - FeatureHasSlowFPVMLx]>; -def : Processor<"mpcorenovfp", ARMV6Itineraries, [HasV6KOps]>; -def : Processor<"mpcore", ARMV6Itineraries, [HasV6KOps, FeatureVFP2, - FeatureHasSlowFPVMLx]>; - -// V6T2 Processors. -def : Processor<"arm1156t2-s", ARMV6Itineraries, [HasV6T2Ops, - FeatureDSPThumb2]>; -def : Processor<"arm1156t2f-s", ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2, - FeatureHasSlowFPVMLx, - FeatureDSPThumb2]>; - -// V7a Processors. // FIXME: A5 has currently the same Schedule model as A8 -def : ProcessorModel<"cortex-a5", CortexA8Model, - [ProcA5, HasV7Ops, FeatureNEON, FeatureDB, - FeatureVFP4, FeatureDSPThumb2, - FeatureHasRAS, FeatureAClass]>; -def : ProcessorModel<"cortex-a7", CortexA8Model, - [ProcA7, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, - FeatureAClass]>; -def : ProcessorModel<"cortex-a8", CortexA8Model, - [ProcA8, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, - FeatureAClass]>; -def : ProcessorModel<"cortex-a9", CortexA9Model, - [ProcA9, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, FeatureMP, - FeatureAClass]>; +def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5, + FeatureHasRAS, + FeatureTrustZone, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureMP, + FeatureVFP4]>; + +def : ProcessorModel<"cortex-a7", CortexA8Model, [ARMv7a, ProcA7, + FeatureHasRAS, + FeatureTrustZone, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureMP, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM, + FeatureVirtualization]>; + +def : ProcessorModel<"cortex-a8", CortexA8Model, [ARMv7a, ProcA8, + FeatureHasRAS, + FeatureTrustZone, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVMLxForwarding, + FeatureT2XtPk]>; + +def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9, + FeatureHasRAS, + FeatureTrustZone, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureFP16, + FeatureAvoidPartialCPSR, + FeatureMP]>; // FIXME: A12 has currently the same Schedule model as A9 -def : ProcessorModel<"cortex-a12", CortexA9Model, - [ProcA12, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureMP, - FeatureHasRAS, FeatureAClass]>; - -// FIXME: A15 has currently the same ProcessorModel as A9. -def : ProcessorModel<"cortex-a15", CortexA9Model, - [ProcA15, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, - FeatureAClass]>; +def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, + FeatureHasRAS, + FeatureTrustZone, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureVirtualization, + FeatureMP]>; + +// FIXME: A15 has currently the same Schedule model as A9. 
+def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, + FeatureHasRAS, + FeatureTrustZone, + FeatureT2XtPk, + FeatureVFP4, + FeatureMP, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureVirtualization]>; // FIXME: A17 has currently the same Schedule model as A9 -def : ProcessorModel<"cortex-a17", CortexA9Model, - [ProcA17, HasV7Ops, FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureMP, - FeatureHasRAS, FeatureAClass]>; +def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, + FeatureHasRAS, + FeatureTrustZone, + FeatureMP, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureVirtualization]>; // FIXME: krait has currently the same Schedule model as A9 -def : ProcessorModel<"krait", CortexA9Model, - [ProcKrait, HasV7Ops, - FeatureNEON, FeatureDB, - FeatureDSPThumb2, FeatureHasRAS, - FeatureAClass]>; +// FIXME: krait has currently the same features as A9 plus VFP4 and hardware +// division features. +def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait, + FeatureHasRAS, + FeatureVMLxForwarding, + FeatureT2XtPk, + FeatureFP16, + FeatureAvoidPartialCPSR, + FeatureVFP4, + FeatureHWDiv, + FeatureHWDivARM]>; + +def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, + FeatureHasRAS, + FeatureNEONForFP, + FeatureT2XtPk, + FeatureVFP4, + FeatureMP, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureAvoidMOVsShOp, + FeatureHasSlowFPVMLx]>; // FIXME: R4 has currently the same ProcessorModel as A8. -def : ProcessorModel<"cortex-r4", CortexA8Model, - [ProcR4]>; +def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4, + FeatureHasRAS, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; // FIXME: R4F has currently the same ProcessorModel as A8. -def : ProcessorModel<"cortex-r4f", CortexA8Model, - [ProcR4, - FeatureSlowFPBrcc, FeatureHasSlowFPVMLx, - FeatureVFP3, FeatureD16]>; +def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, + FeatureHasRAS, + FeatureSlowFPBrcc, + FeatureHasSlowFPVMLx, + FeatureVFP3, + FeatureD16, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; // FIXME: R5 has currently the same ProcessorModel as A8. -def : ProcessorModel<"cortex-r5", CortexA8Model, - [ProcR5, HasV7Ops, FeatureDB, - FeatureVFP3, FeatureDSPThumb2, - FeatureHasRAS, - FeatureD16, FeatureRClass]>; +def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, + FeatureHasRAS, + FeatureVFP3, + FeatureD16, + FeatureSlowFPBrcc, + FeatureHWDivARM, + FeatureHasSlowFPVMLx, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; // FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5. -def : ProcessorModel<"cortex-r7", CortexA8Model, - [ProcR5, HasV7Ops, FeatureDB, - FeatureVFP3, FeatureDSPThumb2, - FeatureHasRAS, FeatureVFPOnlySP, - FeatureD16, FeatureMP, FeatureRClass]>; - -// V7M Processors. -def : ProcNoItin<"cortex-m3", [HasV7Ops, - FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, FeatureMClass]>; -def : ProcNoItin<"sc300", [HasV7Ops, - FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, FeatureMClass]>; - -// V7EM Processors. 
-def : ProcNoItin<"cortex-m4", [HasV7Ops, - FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, FeatureDSPThumb2, - FeatureT2XtPk, FeatureVFP4, - FeatureVFPOnlySP, FeatureD16, - FeatureMClass]>; -def : ProcNoItin<"cortex-m7", [HasV7Ops, - FeatureThumb2, FeatureNoARM, FeatureDB, - FeatureHWDiv, FeatureDSPThumb2, - FeatureT2XtPk, FeatureFPARMv8, - FeatureD16, FeatureMClass]>; - - -// Swift uArch Processors. -def : ProcessorModel<"swift", SwiftModel, - [ProcSwift, HasV7Ops, FeatureNEON, - FeatureDB, FeatureDSPThumb2, - FeatureHasRAS, FeatureAClass]>; - -// V8 Processors -def : ProcNoItin<"cortex-a53", [ProcA53, HasV8Ops, FeatureAClass, - FeatureDB, FeatureFPARMv8, - FeatureNEON, FeatureDSPThumb2]>; -def : ProcNoItin<"cortex-a57", [ProcA57, HasV8Ops, FeatureAClass, - FeatureDB, FeatureFPARMv8, - FeatureNEON, FeatureDSPThumb2]>; -// FIXME: Cortex-A72 is currently modelled as an Cortex-A57. -def : ProcNoItin<"cortex-a72", [ProcA57, HasV8Ops, FeatureAClass, - FeatureDB, FeatureFPARMv8, - FeatureNEON, FeatureDSPThumb2]>; +def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, + FeatureHasRAS, + FeatureVFP3, + FeatureVFPOnlySP, + FeatureD16, + FeatureFP16, + FeatureMP, + FeatureSlowFPBrcc, + FeatureHWDivARM, + FeatureHasSlowFPVMLx, + FeatureAvoidPartialCPSR, + FeatureT2XtPk]>; + +def : ProcNoItin<"cortex-m3", [ARMv7m]>; +def : ProcNoItin<"sc300", [ARMv7m]>; + +def : ProcNoItin<"cortex-m4", [ARMv7em, + FeatureVFP4, + FeatureVFPOnlySP, + FeatureD16]>; + +def : ProcNoItin<"cortex-m7", [ARMv7em, + FeatureFPARMv8, + FeatureD16]>; + + +def : ProcNoItin<"cortex-a35", [ARMv8a, ProcA35, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +def : ProcNoItin<"cortex-a57", [ARMv8a, ProcA57, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; + +def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, + FeatureHWDiv, + FeatureHWDivARM, + FeatureT2XtPk, + FeatureCrypto, + FeatureCRC]>; // Cyclone is very similar to swift -def : ProcessorModel<"cyclone", SwiftModel, - [ProcSwift, HasV8Ops, HasV7Ops, - FeatureCrypto, FeatureFPARMv8, - FeatureDB,FeatureDSPThumb2, - FeatureHasRAS, FeatureZCZeroing]>; +def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, + FeatureHasRAS, + FeatureNEONForFP, + FeatureT2XtPk, + FeatureVFP4, + FeatureMP, + FeatureHWDiv, + FeatureHWDivARM, + FeatureAvoidPartialCPSR, + FeatureAvoidMOVsShOp, + FeatureHasSlowFPVMLx, + FeatureCrypto, + FeatureZCZeroing]>; + //===----------------------------------------------------------------------===// // Register File Description @@ -504,8 +677,15 @@ def ARMAsmWriter : AsmWriter { bit isMCAsmWriter = 1; } +def ARMAsmParserVariant : AsmParserVariant { + int Variant = 0; + string Name = "ARM"; + string BreakCharacters = "."; +} + def ARM : Target { // Pull in Instruction Info: let InstructionSet = ARMInstrInfo; let AssemblyWriters = [ARMAsmWriter]; + let AssemblyParserVariants = [ARMAsmParserVariant]; } diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 738dded..206db96 100644 --- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -60,7 +60,7 @@ using namespace llvm; ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), AFI(nullptr), MCP(nullptr), - 
InConstantPool(false) {} + InConstantPool(false), OptimizationGoals(-1) {} void ARMAsmPrinter::EmitFunctionBodyEnd() { // Make sure to terminate any constant pools that were at the end @@ -80,8 +80,8 @@ void ARMAsmPrinter::EmitFunctionEntryLabel() { OutStreamer->EmitLabel(CurrentFnSym); } -void ARMAsmPrinter::EmitXXStructor(const Constant *CV) { - uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType()); +void ARMAsmPrinter::EmitXXStructor(const DataLayout &DL, const Constant *CV) { + uint64_t Size = getDataLayout().getTypeAllocSize(CV->getType()); assert(Size && "C++ constructor pointer had zero size!"); const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts()); @@ -106,9 +106,38 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { Subtarget = &MF.getSubtarget<ARMSubtarget>(); SetupMachineFunction(MF); + const Function* F = MF.getFunction(); + const TargetMachine& TM = MF.getTarget(); + + // Calculate this function's optimization goal. + unsigned OptimizationGoal; + if (F->hasFnAttribute(Attribute::OptimizeNone)) + // For best debugging illusion, speed and small size sacrificed + OptimizationGoal = 6; + else if (F->optForMinSize()) + // Aggressively for small size, speed and debug illusion sacrificed + OptimizationGoal = 4; + else if (F->optForSize()) + // For small size, but speed and debugging illusion preserved + OptimizationGoal = 3; + else if (TM.getOptLevel() == CodeGenOpt::Aggressive) + // Aggressively for speed, small size and debug illusion sacrificed + OptimizationGoal = 2; + else if (TM.getOptLevel() > CodeGenOpt::None) + // For speed, but small size and good debug illusion preserved + OptimizationGoal = 1; + else // TM.getOptLevel() == CodeGenOpt::None + // For good debugging, but speed and small size preserved + OptimizationGoal = 5; + + // Combine a new optimization goal with existing ones. + if (OptimizationGoals == -1) // uninitialized goals + OptimizationGoals = OptimizationGoal; + else if (OptimizationGoals != (int)OptimizationGoal) // conflicting goals + OptimizationGoals = 0; if (Subtarget->isTargetCOFF()) { - bool Internal = MF.getFunction()->hasInternalLinkage(); + bool Internal = F->hasInternalLinkage(); COFF::SymbolStorageClass Scl = Internal ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL; int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT; @@ -198,22 +227,13 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, MCSymbol *ARMAsmPrinter:: GetARMJTIPICJumpTableLabel(unsigned uid) const { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); SmallString<60> Name; - raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI" + raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_' << uid; return OutContext.getOrCreateSymbol(Name); } - -MCSymbol *ARMAsmPrinter::GetARMSJLJEHLabel() const { - const DataLayout *DL = TM.getDataLayout(); - SmallString<60> Name; - raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "SJLJEH" - << getFunctionNumber(); - return OutContext.getOrCreateSymbol(Name); -} - bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { @@ -515,6 +535,17 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { // generates code that does this, it is always safe to set. 
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); } + + // The last attribute to be emitted is ABI_optimization_goals + MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); + ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS); + + if (OptimizationGoals > 0 && + (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI())) + ATS.emitAttribute(ARMBuildAttrs::ABI_optimization_goals, OptimizationGoals); + OptimizationGoals = -1; + + ATS.finishAttributeSection(); } //===----------------------------------------------------------------------===// @@ -532,7 +563,7 @@ static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU, if (Subtarget->hasV8Ops()) return ARMBuildAttrs::v8; else if (Subtarget->hasV7Ops()) { - if (Subtarget->isMClass() && Subtarget->hasThumb2DSP()) + if (Subtarget->isMClass() && Subtarget->hasDSP()) return ARMBuildAttrs::v7E_M; return ARMBuildAttrs::v7; } else if (Subtarget->hasV6T2Ops()) @@ -587,7 +618,7 @@ void ARMAsmPrinter::emitAttributes() { // We consider krait as a "cortex-a9" + hwdiv CPU // Enable hwdiv through ".arch_extension idiv" if (STI.hasDivide() || STI.hasDivideInARMMode()) - ATS.emitArchExtension(ARM::AEK_HWDIV); + ATS.emitArchExtension(ARM::AEK_HWDIV | ARM::AEK_HWDIVARM); } else ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString); } @@ -807,8 +838,6 @@ void ARMAsmPrinter::emitAttributes() { else if (STI.hasVirtualization()) ATS.emitAttribute(ARMBuildAttrs::Virtualization_use, ARMBuildAttrs::AllowVirtualization); - - ATS.finishAttributeSection(); } //===----------------------------------------------------------------------===// @@ -828,8 +857,7 @@ getModifierVariantKind(ARMCP::ARMCPModifier Modifier) { case ARMCP::TLSGD: return MCSymbolRefExpr::VK_TLSGD; case ARMCP::TPOFF: return MCSymbolRefExpr::VK_TPOFF; case ARMCP::GOTTPOFF: return MCSymbolRefExpr::VK_GOTTPOFF; - case ARMCP::GOT: return MCSymbolRefExpr::VK_GOT; - case ARMCP::GOTOFF: return MCSymbolRefExpr::VK_GOTOFF; + case ARMCP::GOT_PREL: return MCSymbolRefExpr::VK_ARM_GOT_PREL; } llvm_unreachable("Invalid ARMCPModifier!"); } @@ -875,8 +903,8 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV, void ARMAsmPrinter:: EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { - const DataLayout *DL = TM.getDataLayout(); - int Size = TM.getDataLayout()->getTypeAllocSize(MCPV->getType()); + const DataLayout &DL = getDataLayout(); + int Size = DL.getTypeAllocSize(MCPV->getType()); ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV); @@ -909,10 +937,9 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { OutContext); if (ACPV->getPCAdjustment()) { - MCSymbol *PCLabel = getPICLabel(DL->getPrivateGlobalPrefix(), - getFunctionNumber(), - ACPV->getLabelId(), - OutContext); + MCSymbol *PCLabel = + getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + ACPV->getLabelId(), OutContext); const MCExpr *PCRelExpr = MCSymbolRefExpr::create(PCLabel, OutContext); PCRelExpr = MCBinaryExpr::createAdd(PCRelExpr, @@ -1136,6 +1163,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { Offset = 0; break; case ARM::ADDri: + case ARM::t2ADDri: Offset = -MI->getOperand(2).getImm(); break; case ARM::SUBri: @@ -1198,7 +1226,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { #include "ARMGenMCPseudoLowering.inc" void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // If we just ended a constant pool, mark it as 
such. if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) { @@ -1355,9 +1383,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCSymbol *GVSym = GetARMGVSymbol(GV, TF); const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext); - MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(), - getFunctionNumber(), - MI->getOperand(2).getImm(), OutContext); + MCSymbol *LabelSym = + getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + MI->getOperand(2).getImm(), OutContext); const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext); unsigned PCAdj = (Opc == ARM::MOVi16_ga_pcrel) ? 8 : 4; const MCExpr *PCRelExpr = @@ -1388,9 +1416,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCSymbol *GVSym = GetARMGVSymbol(GV, TF); const MCExpr *GVSymExpr = MCSymbolRefExpr::create(GVSym, OutContext); - MCSymbol *LabelSym = getPICLabel(DL->getPrivateGlobalPrefix(), - getFunctionNumber(), - MI->getOperand(3).getImm(), OutContext); + MCSymbol *LabelSym = + getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), + MI->getOperand(3).getImm(), OutContext); const MCExpr *LabelSymExpr= MCSymbolRefExpr::create(LabelSym, OutContext); unsigned PCAdj = (Opc == ARM::MOVTi16_ga_pcrel) ? 8 : 4; const MCExpr *PCRelExpr = @@ -1414,10 +1442,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // This adds the address of LPC0 to r0. // Emit the label. - OutStreamer->EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(), + OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), - MI->getOperand(2).getImm(), - OutContext)); + MI->getOperand(2).getImm(), OutContext)); // Form and emit the add. EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr) @@ -1436,10 +1463,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // This adds the address of LPC0 to r0. // Emit the label. - OutStreamer->EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(), + OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), - MI->getOperand(2).getImm(), - OutContext)); + MI->getOperand(2).getImm(), OutContext)); // Form and emit the add. EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::ADDrr) @@ -1468,10 +1494,9 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // a PC-relative address at the ldr instruction. // Emit the label. 
- OutStreamer->EmitLabel(getPICLabel(DL->getPrivateGlobalPrefix(), + OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(), getFunctionNumber(), - MI->getOperand(2).getImm(), - OutContext)); + MI->getOperand(2).getImm(), OutContext)); // Form and emit the load unsigned Opcode; @@ -1519,7 +1544,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (MCPE.isMachineConstantPoolEntry()) EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); else - EmitGlobalConstant(MCPE.Val.ConstVal); + EmitGlobalConstant(DL, MCPE.Val.ConstVal); return; } case ARM::JUMPTABLE_ADDRS: @@ -1653,12 +1678,12 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // adds $val, #7 // str $val, [$src, #4] // movs r0, #0 - // b 1f + // b LSJLJEH // movs r0, #1 - // 1: + // LSJLJEH: unsigned SrcReg = MI->getOperand(0).getReg(); unsigned ValReg = MI->getOperand(1).getReg(); - MCSymbol *Label = GetARMSJLJEHLabel(); + MCSymbol *Label = OutContext.createTempSymbol("SJLJEH", false, true); OutStreamer->AddComment("eh_setjmp begin"); EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr) .addReg(ValReg) diff --git a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h index 3d25121..ed7be2d 100644 --- a/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h +++ b/contrib/llvm/lib/Target/ARM/ARMAsmPrinter.h @@ -51,6 +51,11 @@ class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter { /// labels used for ARMv4t thumb code to make register indirect calls. SmallVector<std::pair<unsigned, MCSymbol*>, 4> ThumbIndirectPads; + /// OptimizationGoals - Maintain a combined optimization goal for all + /// functions in a module: one of Tag_ABI_optimization_goals values, + /// -1 if uninitialized, 0 if conflicting goals + int OptimizationGoals; + public: explicit ARMAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer); @@ -84,7 +89,7 @@ public: void EmitFunctionEntryLabel() override; void EmitStartOfAsmFile(Module &M) override; void EmitEndOfAsmFile(Module &M) override; - void EmitXXStructor(const Constant *CV) override; + void EmitXXStructor(const DataLayout &DL, const Constant *CV) override; // lowerOperand - Convert a MachineOperand into the equivalent MCOperand. 
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp); @@ -119,8 +124,6 @@ private: MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol); MCSymbol *GetARMJTIPICJumpTableLabel(unsigned uid) const; - MCSymbol *GetARMSJLJEHLabel() const; - MCSymbol *GetARMGVSymbol(const GlobalValue *GV, unsigned char TargetFlags); public: diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 9f43e73..49f3288 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -97,7 +97,7 @@ ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI) Subtarget(STI) { for (unsigned i = 0, e = array_lengthof(ARM_MLxTable); i != e; ++i) { if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second) - assert(false && "Duplicated entries?"); + llvm_unreachable("Duplicated entries?"); MLxHazardOpcodes.insert(ARM_MLxTable[i].AddSubOpc); MLxHazardOpcodes.insert(ARM_MLxTable[i].MulOpc); } @@ -440,7 +440,7 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { bool ARMBaseInstrInfo::isPredicated(const MachineInstr *MI) const { if (MI->isBundle()) { - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); while (++I != E && I->isInsideBundle()) { int PIdx = I->findFirstPredOperandIdx(); @@ -518,7 +518,7 @@ bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI, static bool isCPSRDefined(const MachineInstr *MI) { for (const auto &MO : MI->operands()) - if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef()) + if (MO.isReg() && MO.getReg() == ARM::CPSR && MO.isDef() && !MO.isDead()) return true; return false; } @@ -647,7 +647,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { unsigned ARMBaseInstrInfo::getInstBundleLength(const MachineInstr *MI) const { unsigned Size = 0; - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); while (++I != E && I->isInsideBundle()) { assert(!I->isBundle() && "No nested bundle!"); @@ -853,11 +853,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - Align); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), Align); switch (RC->getSize()) { case 4: @@ -1043,12 +1041,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - Align); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), Align); switch (RC->getSize()) { case 4: @@ -1224,6 +1219,60 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, return MI->mayLoad() && hasLoadFromStackSlot(MI, Dummy, 
FrameIndex);
 }
 
+/// \brief Expands MEMCPY to either LDMIA/STMIA or LDMIA_UPD/STMIA_UPD
+/// depending on whether the result is used.
+void ARMBaseInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MBBI) const {
+  bool isThumb1 = Subtarget.isThumb1Only();
+  bool isThumb2 = Subtarget.isThumb2();
+  const ARMBaseInstrInfo *TII = Subtarget.getInstrInfo();
+
+  MachineInstr *MI = MBBI;
+  DebugLoc dl = MI->getDebugLoc();
+  MachineBasicBlock *BB = MI->getParent();
+
+  MachineInstrBuilder LDM, STM;
+  if (isThumb1 || !MI->getOperand(1).isDead()) {
+    LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA_UPD
+                                                 : isThumb1 ? ARM::tLDMIA_UPD
+                                                            : ARM::LDMIA_UPD))
+             .addOperand(MI->getOperand(1));
+  } else {
+    LDM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2LDMIA : ARM::LDMIA));
+  }
+
+  if (isThumb1 || !MI->getOperand(0).isDead()) {
+    STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA_UPD
+                                                 : isThumb1 ? ARM::tSTMIA_UPD
+                                                            : ARM::STMIA_UPD))
+             .addOperand(MI->getOperand(0));
+  } else {
+    STM = BuildMI(*BB, MI, dl, TII->get(isThumb2 ? ARM::t2STMIA : ARM::STMIA));
+  }
+
+  AddDefaultPred(LDM.addOperand(MI->getOperand(3)));
+  AddDefaultPred(STM.addOperand(MI->getOperand(2)));
+
+  // Sort the scratch registers into ascending order.
+  const TargetRegisterInfo &TRI = getRegisterInfo();
+  llvm::SmallVector<unsigned, 6> ScratchRegs;
+  for(unsigned I = 5; I < MI->getNumOperands(); ++I)
+    ScratchRegs.push_back(MI->getOperand(I).getReg());
+  std::sort(ScratchRegs.begin(), ScratchRegs.end(),
+            [&TRI](const unsigned &Reg1,
+                   const unsigned &Reg2) -> bool {
+              return TRI.getEncodingValue(Reg1) <
+                     TRI.getEncodingValue(Reg2);
+            });
+
+  for (const auto &Reg : ScratchRegs) {
+    LDM.addReg(Reg, RegState::Define);
+    STM.addReg(Reg, RegState::Kill);
+  }
+
+  BB->erase(MBBI);
+}
+
+
 bool
 ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   MachineFunction &MF = *MI->getParent()->getParent();
@@ -1237,6 +1286,11 @@ ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
     return true;
   }
 
+  if (MI->getOpcode() == ARM::MEMCPY) {
+    expandMEMCPY(MI);
+    return true;
+  }
+
   // This hook gets to expand COPY instructions before they become
   // copyPhysReg() calls. Look for VMOVS instructions that can legally be
   // widened to VMOVD. We prefer the VMOVD when possible because it may be
@@ -1325,9 +1379,9 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
   // instructions, so that's probably OK, but is PIC always correct when
   // we get here?
   if (ACPV->isGlobalValue())
-    NewCPV = ARMConstantPoolConstant::
-      Create(cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId,
-             ARMCP::CPValue, 4);
+    NewCPV = ARMConstantPoolConstant::Create(
+        cast<ARMConstantPoolConstant>(ACPV)->getGV(), PCLabelId, ARMCP::CPValue,
+        4, ACPV->getModifier(), ACPV->mustAddCurrentAddress());
   else if (ACPV->isExtSymbol())
     NewCPV = ARMConstantPoolSymbol::
       Create(MF.getFunction()->getContext(),
@@ -1645,16 +1699,14 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
 bool ARMBaseInstrInfo::
 isProfitableToIfCvt(MachineBasicBlock &MBB,
                     unsigned NumCycles, unsigned ExtraPredCycles,
-                    const BranchProbability &Probability) const {
+                    BranchProbability Probability) const {
   if (!NumCycles)
     return false;
 
   // If we are optimizing for size, see if the branch in the predecessor can be
   // lowered to cbn?z by the constant island lowering pass, and return false if
   // so. This results in a shorter instruction sequence.
- const Function *F = MBB.getParent()->getFunction(); - if (F->hasFnAttribute(Attribute::OptimizeForSize) || - F->hasFnAttribute(Attribute::MinSize)) { + if (MBB.getParent()->getFunction()->optForSize()) { MachineBasicBlock *Pred = *MBB.pred_begin(); if (!Pred->empty()) { MachineInstr *LastMI = &*Pred->rbegin(); @@ -1677,12 +1729,14 @@ isProfitableToIfCvt(MachineBasicBlock &MBB, } // Attempt to estimate the relative costs of predication versus branching. - unsigned UnpredCost = Probability.getNumerator() * NumCycles; - UnpredCost /= Probability.getDenominator(); - UnpredCost += 1; // The branch itself - UnpredCost += Subtarget.getMispredictionPenalty() / 10; - - return (NumCycles + ExtraPredCycles) <= UnpredCost; + // Here we scale up each component of UnpredCost to avoid precision issue when + // scaling NumCycles by Probability. + const unsigned ScalingUpFactor = 1024; + unsigned UnpredCost = Probability.scale(NumCycles * ScalingUpFactor); + UnpredCost += ScalingUpFactor; // The branch itself + UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; + + return (NumCycles + ExtraPredCycles) * ScalingUpFactor <= UnpredCost; } bool ARMBaseInstrInfo:: @@ -1690,23 +1744,22 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned TCycles, unsigned TExtra, MachineBasicBlock &FMBB, unsigned FCycles, unsigned FExtra, - const BranchProbability &Probability) const { + BranchProbability Probability) const { if (!TCycles || !FCycles) return false; // Attempt to estimate the relative costs of predication versus branching. - unsigned TUnpredCost = Probability.getNumerator() * TCycles; - TUnpredCost /= Probability.getDenominator(); - - uint32_t Comp = Probability.getDenominator() - Probability.getNumerator(); - unsigned FUnpredCost = Comp * FCycles; - FUnpredCost /= Probability.getDenominator(); - + // Here we scale up each component of UnpredCost to avoid precision issue when + // scaling TCycles/FCycles by Probability. + const unsigned ScalingUpFactor = 1024; + unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor); + unsigned FUnpredCost = + Probability.getCompl().scale(FCycles * ScalingUpFactor); unsigned UnpredCost = TUnpredCost + FUnpredCost; - UnpredCost += 1; // The branch itself - UnpredCost += Subtarget.getMispredictionPenalty() / 10; + UnpredCost += 1 * ScalingUpFactor; // The branch itself + UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10; - return (TCycles + FCycles + TExtra + FExtra) <= UnpredCost; + return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost; } bool @@ -1744,9 +1797,10 @@ unsigned llvm::getMatchingCondBranchOpcode(unsigned Opc) { llvm_unreachable("Unknown unconditional branch opcode!"); } -/// commuteInstruction - Handle commutable instructions. -MachineInstr * -ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { switch (MI->getOpcode()) { case ARM::MOVCCr: case ARM::t2MOVCCr: { @@ -1756,7 +1810,7 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // MOVCC AL can't be inverted. Shouldn't happen. if (CC == ARMCC::AL || PredReg != ARM::CPSR) return nullptr; - MI = TargetInstrInfo::commuteInstruction(MI, NewMI); + MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); if (!MI) return nullptr; // After swapping the MOVCC operands, also invert the condition. 
@@ -1765,7 +1819,7 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { return MI; } } - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } /// Identify instructions that can be folded into a MOVCC instruction, and @@ -1975,21 +2029,12 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, } } -static bool isAnySubRegLive(unsigned Reg, const TargetRegisterInfo *TRI, - MachineInstr *MI) { - for (MCSubRegIterator Subreg(Reg, TRI, /* IncludeSelf */ true); - Subreg.isValid(); ++Subreg) - if (MI->getParent()->computeRegisterLiveness(TRI, *Subreg, MI) != - MachineBasicBlock::LQR_Dead) - return true; - return false; -} bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, MachineFunction &MF, MachineInstr *MI, unsigned NumBytes) { // This optimisation potentially adds lots of load and store // micro-operations, it's only really a great benefit to code-size. - if (!MF.getFunction()->hasFnAttribute(Attribute::MinSize)) + if (!MF.getFunction()->optForMinSize()) return false; // If only one register is pushed/popped, LLVM can use an LDR/STR @@ -2058,11 +2103,9 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget, // registers live within the function we might clobber a return value // register; the other way a register can be live here is if it's // callee-saved. - // TODO: Currently, computeRegisterLiveness() does not report "live" if a - // sub reg is live. When computeRegisterLiveness() works for sub reg, it - // can replace isAnySubRegLive(). if (isCalleeSavedRegister(CurReg, CSRegs) || - isAnySubRegLive(CurReg, TRI, MI)) { + MI->getParent()->computeRegisterLiveness(TRI, CurReg, MI) != + MachineBasicBlock::LQR_Dead) { // VFP pops don't allow holes in the register list, so any skip is fatal // for our transformation. GPR pops do, so we should just keep looking. if (IsVFPPushPop) @@ -3381,7 +3424,7 @@ static const MachineInstr *getBundledDefMI(const TargetRegisterInfo *TRI, assert(Idx != -1 && "Cannot find bundled definition!"); DefIdx = Idx; - return II; + return &*II; } static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, @@ -3389,7 +3432,7 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, unsigned &UseIdx, unsigned &Dist) { Dist = 0; - MachineBasicBlock::const_instr_iterator II = MI; ++II; + MachineBasicBlock::const_instr_iterator II = ++MI->getIterator(); assert(II->isInsideBundle() && "Empty bundle?"); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); @@ -3410,7 +3453,7 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI, } UseIdx = Idx; - return II; + return &*II; } /// Return the number of cycles to add to (or subtract from) the static @@ -3652,6 +3695,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, // instructions). if (Latency > 0 && Subtarget.isThumb2()) { const MachineFunction *MF = DefMI->getParent()->getParent(); + // FIXME: Use Function::optForSize(). if (MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize)) --Latency; } @@ -3931,11 +3975,11 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, // other passes may query the latency of a bundled instruction. 
if (MI->isBundle()) { unsigned Latency = 0; - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); while (++I != E && I->isInsideBundle()) { if (I->getOpcode() != ARM::t2IT) - Latency += getInstrLatency(ItinData, I, PredCost); + Latency += getInstrLatency(ItinData, &*I, PredCost); } return Latency; } @@ -4054,8 +4098,8 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI, MIB = BuildMI(MBB, MI, DL, get(LoadOpc), Reg); MIB.addReg(Reg, RegState::Kill).addImm(0); unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; - MachineMemOperand *MMO = MBB.getParent()-> - getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 4, 4); + MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4); MIB.addMemOperand(MMO); AddDefaultPred(MIB); } diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index b4706e3..d80c494 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -86,6 +86,18 @@ protected: RegSubRegPair &BaseReg, RegSubRegPairAndIdx &InsertedReg) const override; + /// Commutes the operands in the given instruction. + /// The commutable operands are specified by their indices OpIdx1 and OpIdx2. + /// + /// Do not call this method for a non-commutable instruction or for + /// non-commutable pair of operand indices OpIdx1 and OpIdx2. + /// Even though the instruction is commutable, the method may still + /// fail to commute the operands, null pointer is returned in such cases. + MachineInstr *commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const override; + public: // Return whether the target has an explicit NOP encoding. bool hasNOP() const; @@ -188,9 +200,6 @@ public: MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const override; - MachineInstr *commuteInstruction(MachineInstr*, - bool=false) const override; - const MachineInstrBuilder &AddDReg(MachineInstrBuilder &MIB, unsigned Reg, unsigned SubIdx, unsigned State, const TargetRegisterInfo *TRI) const; @@ -224,15 +233,15 @@ public: bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, unsigned ExtraT, MachineBasicBlock &FMBB, unsigned NumF, unsigned ExtraF, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, - const BranchProbability &Probability) const override { + BranchProbability Probability) const override { return NumCycles == 1; } @@ -343,6 +352,8 @@ private: virtual void expandLoadStackGuard(MachineBasicBlock::iterator MI, Reloc::Model RM) const = 0; + void expandMEMCPY(MachineBasicBlock::iterator) const; + private: /// Modeling special VFP / NEON fp MLA / MLS hazards. 
diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index e7d5be77..419717c 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -225,7 +225,8 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg, ArrayRef<MCPhysReg> Order, SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, - const VirtRegMap *VRM) const { + const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const { const MachineRegisterInfo &MRI = MF.getRegInfo(); std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg); @@ -338,7 +339,7 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { // 1. Dynamic stack realignment is explicitly disabled, // 2. This is a Thumb1 function (it's not useful, so we don't bother), or // 3. There are VLAs in the function and the base pointer is disabled. - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) + if (!TargetRegisterInfo::canRealignStack(MF)) return false; if (AFI->isThumb1OnlyFunction()) return false; @@ -356,18 +357,6 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { } bool ARMBaseRegisterInfo:: -needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const ARMFrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || - F->hasFnAttribute(Attribute::StackAlignment)); - - return requiresRealignment && canRealignStack(MF); -} - -bool ARMBaseRegisterInfo:: cannotEliminateFrame(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->adjustsStack()) diff --git a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index fdc1ef9..cea8b80 100644 --- a/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -94,7 +94,7 @@ public: const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; - const uint32_t *getNoPreservedMask() const; + const uint32_t *getNoPreservedMask() const override; /// getThisReturnPreservedMask - Returns a call preserved mask specific to the /// case that 'returned' is on an i32 first argument if the calling convention @@ -126,15 +126,15 @@ public: ArrayRef<MCPhysReg> Order, SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF, - const VirtRegMap *VRM) const override; + const VirtRegMap *VRM, + const LiveRegMatrix *Matrix) const override; void updateRegAllocHint(unsigned Reg, unsigned NewReg, MachineFunction &MF) const override; bool hasBasePointer(const MachineFunction &MF) const; - bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const override; + bool canRealignStack(const MachineFunction &MF) const override; int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const override; bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override; diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h index d687568..a731d00 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.h +++ 
b/contrib/llvm/lib/Target/ARM/ARMCallingConv.h @@ -160,15 +160,15 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT, State); } -static const uint16_t RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; +static const MCPhysReg RRegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; -static const uint16_t SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, - ARM::S4, ARM::S5, ARM::S6, ARM::S7, - ARM::S8, ARM::S9, ARM::S10, ARM::S11, - ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; -static const uint16_t DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, - ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; -static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; +static const MCPhysReg SRegList[] = { ARM::S0, ARM::S1, ARM::S2, ARM::S3, + ARM::S4, ARM::S5, ARM::S6, ARM::S7, + ARM::S8, ARM::S9, ARM::S10, ARM::S11, + ARM::S12, ARM::S13, ARM::S14, ARM::S15 }; +static const MCPhysReg DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3, + ARM::D4, ARM::D5, ARM::D6, ARM::D7 }; +static const MCPhysReg QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 }; // Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA @@ -199,9 +199,11 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT, // Try to allocate a contiguous block of registers, each of the correct // size to hold one member. - unsigned Align = std::min(PendingMembers[0].getExtraInfo(), 8U); + auto &DL = State.getMachineFunction().getDataLayout(); + unsigned StackAlign = DL.getStackAlignment(); + unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign); - ArrayRef<uint16_t> RegList; + ArrayRef<MCPhysReg> RegList; switch (LocVT.SimpleTy) { case MVT::i32: { RegList = RRegList; diff --git a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td index 27cf06b..2335164 100644 --- a/contrib/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/contrib/llvm/lib/Target/ARM/ARMCallingConv.td @@ -125,6 +125,8 @@ def CC_ARM_AAPCS_Common : CallingConv<[ CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>, CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>, CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>, + CCIfType<[v2f64], CCIfAlign<"16", + CCAssignToStackWithShadow<16, 16, [Q0, Q1, Q2, Q3]>>>, CCIfType<[v2f64], CCAssignToStackWithShadow<16, 8, [Q0, Q1, Q2, Q3]>> ]>; diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index f4ec8c6..e89757c 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -342,7 +342,7 @@ void ARMConstantIslands::verify() { #ifndef NDEBUG for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E; ++MBBI) { - MachineBasicBlock *MBB = MBBI; + MachineBasicBlock *MBB = &*MBBI; unsigned MBBId = MBB->getNumber(); assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset); } @@ -542,7 +542,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs) // identity mapping of CPI's to CPE's. 
const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants(); - const DataLayout &TD = *MF->getTarget().getDataLayout(); + const DataLayout &TD = MF->getDataLayout(); for (unsigned i = 0, e = CPs.size(); i != e; ++i) { unsigned Size = TD.getTypeAllocSize(CPs[i].getType()); assert(Size >= 4 && "Too small constant pool entry"); @@ -589,6 +589,8 @@ void ARMConstantIslands::doInitialJumpTablePlacement( MachineBasicBlock *LastCorrectlyNumberedBB = nullptr; for (MachineBasicBlock &MBB : *MF) { auto MI = MBB.getLastNonDebugInstr(); + if (MI == MBB.end()) + continue; unsigned JTOpcode; switch (MI->getOpcode()) { @@ -639,12 +641,12 @@ void ARMConstantIslands::doInitialJumpTablePlacement( /// into the block immediately after it. bool ARMConstantIslands::BBHasFallthrough(MachineBasicBlock *MBB) { // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; + MachineFunction::iterator MBBI = MBB->getIterator(); // Can't fall off end of function. if (std::next(MBBI) == MBB->getParent()->end()) return false; - MachineBasicBlock *NextBB = std::next(MBBI); + MachineBasicBlock *NextBB = &*std::next(MBBI); if (std::find(MBB->succ_begin(), MBB->succ_end(), NextBB) == MBB->succ_end()) return false; @@ -722,15 +724,15 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { // has any inline assembly in it. If so, we have to be conservative about // alignment assumptions, as we don't know for sure the size of any // instructions in the inline assembly. - for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - computeBlockSize(I); + for (MachineBasicBlock &MBB : *MF) + computeBlockSize(&MBB); // The known bits of the entry block offset are determined by the function // alignment. BBInfo.front().KnownBits = MF->getAlignment(); // Compute block offsets and known bits. - adjustBBOffsetsAfter(MF->begin()); + adjustBBOffsetsAfter(&MF->front()); // Now go back through the instructions and build up our data structures. for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); @@ -968,7 +970,7 @@ MachineBasicBlock *ARMConstantIslands::splitBlockBeforeInstr(MachineInstr *MI) { // Create a new MBB for the code after the OrigBB. MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; ++MBBI; + MachineFunction::iterator MBBI = ++OrigBB->getIterator(); MF->insert(MBBI, NewBB); // Splice the instructions starting with MI over to NewBB. @@ -1088,7 +1090,7 @@ bool ARMConstantIslands::isWaterInRange(unsigned UserOffset, unsigned CPELogAlign = getCPELogAlign(U.CPEMI); unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign); unsigned NextBlockOffset, NextBlockAlignment; - MachineFunction::const_iterator NextBlock = Water; + MachineFunction::const_iterator NextBlock = Water->getIterator(); if (++NextBlock == MF->end()) { NextBlockOffset = BBInfo[Water->getNumber()].postOffset(); NextBlockAlignment = 0; @@ -1350,7 +1352,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex, if (isOffsetInRange(UserOffset, CPEOffset, U)) { DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber() << format(", expected CPE offset %#x\n", CPEOffset)); - NewMBB = std::next(MachineFunction::iterator(UserMBB)); + NewMBB = &*++UserMBB->getIterator(); // Add an unconditional branch from UserMBB to fallthrough block. 
Record // it for branch lengthening; this new branch will not get out of range, // but if the preceding conditional branch is out of range, the targets @@ -1503,8 +1505,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { NewWaterList.insert(NewIsland); // The new CPE goes before the following block (NewMBB). - NewMBB = std::next(MachineFunction::iterator(WaterBB)); - + NewMBB = &*++WaterBB->getIterator(); } else { // No water found. DEBUG(dbgs() << "No water found\n"); @@ -1515,7 +1516,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { // next iteration for constant pools, but in this context, we don't want // it. Check for this so it will be removed from the WaterList. // Also remove any entry from NewWaterList. - MachineBasicBlock *WaterBB = std::prev(MachineFunction::iterator(NewMBB)); + MachineBasicBlock *WaterBB = &*--NewMBB->getIterator(); IP = std::find(WaterList.begin(), WaterList.end(), WaterBB); if (IP != WaterList.end()) NewWaterList.erase(WaterBB); @@ -1532,7 +1533,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { WaterList.erase(IP); // Okay, we know we can put an island before NewMBB now, do it! - MF->insert(NewMBB, NewIsland); + MF->insert(NewMBB->getIterator(), NewIsland); // Update internal data structures to account for the newly inserted MBB. updateForInsertedWaterBlock(NewIsland); @@ -1553,7 +1554,7 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { // Increase the size of the island block to account for the new entry. BBInfo[NewIsland->getNumber()].Size += Size; - adjustBBOffsetsAfter(std::prev(MachineFunction::iterator(NewIsland))); + adjustBBOffsetsAfter(&*--NewIsland->getIterator()); // Finally, change the CPI in the instruction operand to be ID. for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) @@ -1732,7 +1733,7 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) { MBB->back().eraseFromParent(); // BBInfo[SplitBB].Offset is wrong temporarily, fixed below } - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); + MachineBasicBlock *NextBB = &*++MBB->getIterator(); DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() << " also invert condition and change dest. to BB#" @@ -2058,9 +2059,9 @@ bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI, /// \brief Returns whether CPEMI is the first instruction in the block /// immediately following JTMI (assumed to be a TBB or TBH terminator). If so, /// we can switch the first register to PC and usually remove the address -/// calculation that preceeded it. +/// calculation that preceded it. static bool jumpTableFollowsTB(MachineInstr *JTMI, MachineInstr *CPEMI) { - MachineFunction::iterator MBB = JTMI->getParent(); + MachineFunction::iterator MBB = JTMI->getParent()->getIterator(); MachineFunction *MF = MBB->getParent(); ++MBB; @@ -2235,7 +2236,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector<MachineOperand, 4> Cond; SmallVector<MachineOperand, 4> CondPrior; - MachineFunction::iterator BBi = BB; + MachineFunction::iterator BBi = BB->getIterator(); MachineFunction::iterator OldPrior = std::prev(BBi); // If the block terminator isn't analyzable, don't try to move the block @@ -2258,7 +2259,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // Create a new MBB for the code after the jump BB. 
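Editor's note, not part of the patch: most of the ARMConstantIslandPass churn above is iterator hygiene, but the job of the pass is unchanged: constant pool entries ("islands") must be placed in "water" close enough to every user for a PC-relative load. A rough standalone model of that range test follows, assuming an ARM-mode pipeline offset of 8 bytes and a symmetric displacement limit; the real pass additionally accounts for alignment padding and for encodings without negative offsets.

#include <cstdint>
#include <cstdio>

// Returns true if a constant-pool entry placed at CPEOffset is reachable
// from a load at UserOffset. In ARM mode the PC reads 8 bytes ahead of the
// load, and ldr [pc, #imm] reaches +/-MaxDisp bytes from that point.
static bool offsetInRange(uint64_t UserOffset, uint64_t CPEOffset,
                          unsigned MaxDisp) {
  int64_t Delta = static_cast<int64_t>(CPEOffset) -
                  static_cast<int64_t>(UserOffset + 8);
  if (Delta < 0)
    Delta = -Delta;
  return Delta <= MaxDisp;
}

int main() {
  // A user at 0x100 reaches an island at 0x4F8 with a 4 KiB range,
  // but not one at 0x1200.
  std::printf("%d %d\n", offsetInRange(0x100, 0x4F8, 4092),
              offsetInRange(0x100, 0x1200, 4092));
  return 0;
}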
MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(JTBB->getBasicBlock()); - MachineFunction::iterator MBBI = JTBB; ++MBBI; + MachineFunction::iterator MBBI = ++JTBB->getIterator(); MF->insert(MBBI, NewBB); // Add an unconditional branch from NewBB to BB. @@ -2273,8 +2274,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) { // Update the CFG. NewBB->addSuccessor(BB); - JTBB->removeSuccessor(BB); - JTBB->addSuccessor(NewBB); + JTBB->replaceSuccessor(BB, NewBB); ++NumJTInserted; return NewBB; diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp index 7d41c69..c9849b2 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp @@ -52,8 +52,7 @@ const char *ARMConstantPoolValue::getModifierText() const { // strings if that's legal. case ARMCP::no_modifier: return "none"; case ARMCP::TLSGD: return "tlsgd"; - case ARMCP::GOT: return "GOT"; - case ARMCP::GOTOFF: return "GOTOFF"; + case ARMCP::GOT_PREL: return "GOT_PREL"; case ARMCP::GOTTPOFF: return "gottpoff"; case ARMCP::TPOFF: return "tpoff"; } diff --git a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h index 36f63e2..6b18a4e 100644 --- a/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h +++ b/contrib/llvm/lib/Target/ARM/ARMConstantPoolValue.h @@ -39,8 +39,7 @@ namespace ARMCP { enum ARMCPModifier { no_modifier, TLSGD, - GOT, - GOTOFF, + GOT_PREL, GOTTPOFF, TPOFF }; @@ -103,8 +102,6 @@ public: bool isLSDA() const { return Kind == ARMCP::CPLSDA; } bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; } - unsigned getRelocationInfo() const override { return 2; } - int getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) override; diff --git a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 4438f50..56f3498 100644 --- a/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -330,22 +330,19 @@ static const NEONLdStTableEntry NEONLdStTable[] = { /// LookupNEONLdSt - Search the NEONLdStTable for information about a NEON /// load or store pseudo instruction. static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) { - const unsigned NumEntries = array_lengthof(NEONLdStTable); - #ifndef NDEBUG // Make sure the table is sorted. 
static bool TableChecked = false; if (!TableChecked) { - for (unsigned i = 0; i != NumEntries-1; ++i) - assert(NEONLdStTable[i] < NEONLdStTable[i+1] && - "NEONLdStTable is not sorted!"); + assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) && + "NEONLdStTable is not sorted!"); TableChecked = true; } #endif - const NEONLdStTableEntry *I = - std::lower_bound(NEONLdStTable, NEONLdStTable + NumEntries, Opcode); - if (I != NEONLdStTable + NumEntries && I->PseudoOpc == Opcode) + auto I = std::lower_bound(std::begin(NEONLdStTable), + std::end(NEONLdStTable), Opcode); + if (I != std::end(NEONLdStTable) && I->PseudoOpc == Opcode) return I; return nullptr; } @@ -734,7 +731,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, HI16.addImm(Pred).addReg(PredReg); if (RequiresBundling) - finalizeBundle(MBB, &*LO16, &*MBBI); + finalizeBundle(MBB, LO16->getIterator(), MBBI->getIterator()); TransferImpOps(MI, LO16, HI16); MI.eraseFromParent(); @@ -747,6 +744,55 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, switch (Opcode) { default: return false; + + case ARM::TCRETURNdi: + case ARM::TCRETURNri: { + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->isReturn() && + "Can only insert epilog into returning blocks"); + unsigned RetOpcode = MBBI->getOpcode(); + DebugLoc dl = MBBI->getDebugLoc(); + const ARMBaseInstrInfo &TII = *static_cast<const ARMBaseInstrInfo *>( + MBB.getParent()->getSubtarget().getInstrInfo()); + + // Tail call return: adjust the stack pointer and jump to callee. + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + + // Jump to label or value in register. + if (RetOpcode == ARM::TCRETURNdi) { + unsigned TCOpcode = + STI->isThumb() + ? (STI->isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) + : ARM::TAILJMPd; + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); + if (JumpTarget.isGlobal()) + MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), + JumpTarget.getTargetFlags()); + else { + assert(JumpTarget.isSymbol()); + MIB.addExternalSymbol(JumpTarget.getSymbolName(), + JumpTarget.getTargetFlags()); + } + + // Add the default predicate in Thumb mode. + if (STI->isThumb()) + MIB.addImm(ARMCC::AL).addReg(0); + } else if (RetOpcode == ARM::TCRETURNri) { + BuildMI(MBB, MBBI, dl, + TII.get(STI->isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)) + .addReg(JumpTarget.getReg(), RegState::Kill); + } + + MachineInstr *NewMI = std::prev(MBBI); + for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) + NewMI->addOperand(MBBI->getOperand(i)); + + // Delete the pseudo instruction TCRETURN. + MBB.erase(MBBI); + MBBI = NewMI; + return true; + } case ARM::VMOVScc: case ARM::VMOVDcc: { unsigned newOpc = Opcode == ARM::VMOVScc ? 
ARM::VMOVS : ARM::VMOVD; diff --git a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp index fdd0763..9bdf823c 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -922,12 +922,9 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr, if (Addr.BaseType == Address::FrameIndexBase) { int FI = Addr.Base.FI; int Offset = Addr.Offset; - MachineMemOperand *MMO = - FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI, Offset), - Flags, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); // Now add the rest of the operands. MIB.addFrameIndex(FI); @@ -1278,8 +1275,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc)) .addMBB(TBB).addImm(ARMPred).addReg(ARM::CPSR); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { @@ -1303,8 +1299,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc)) .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (const ConstantInt *CI = @@ -1341,8 +1336,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) { unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BrOpc)) .addMBB(TBB).addImm(CCMode).addReg(ARM::CPSR); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } @@ -1355,8 +1349,8 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) { TII.get(Opc)).addReg(AddrReg)); const IndirectBrInst *IB = cast<IndirectBrInst>(I); - for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i) - FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]); + for (const BasicBlock *SuccBB : IB->successors()) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[SuccBB]); return true; } @@ -1860,8 +1854,9 @@ CCAssignFn *ARMFastISel::CCAssignFnForCall(CallingConv::ID CC, return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); else return (Return ? RetCC_ARM_AAPCS: CC_ARM_AAPCS); - } else - return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + } else { + return (Return ? RetCC_ARM_APCS: CC_ARM_APCS); + } case CallingConv::ARM_AAPCS_VFP: if (!isVarArg) return (Return ? RetCC_ARM_AAPCS_VFP: CC_ARM_AAPCS_VFP); @@ -2944,48 +2939,51 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT) { - bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); - ARMConstantPoolConstant *CPV = - ARMConstantPoolConstant::Create(GV, UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT); - unsigned Idx = MCP.getConstantPoolIndex(CPV, Align); + bool UseGOT_PREL = + !(GV->hasHiddenVisibility() || GV->hasLocalLinkage()); + + LLVMContext *Context = &MF->getFunction()->getContext(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + unsigned PCAdj = Subtarget->isThumb() ? 
4 : 8; + ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( + GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, + UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier, + /*AddCurrentAddress=*/UseGOT_PREL); + + unsigned ConstAlign = + MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context)); + unsigned Idx = MF->getConstantPool()->getConstantPoolIndex(CPV, ConstAlign); + + unsigned TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); + unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp; + MachineInstrBuilder MIB = + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), TempReg) + .addConstantPoolIndex(Idx); + if (Opc == ARM::LDRcp) + MIB.addImm(0); + AddDefaultPred(MIB); - unsigned Opc; - unsigned DestReg1 = createResultReg(TLI.getRegClassFor(VT)); - // Load value. - if (isThumb2) { - DestReg1 = constrainOperandRegClass(TII.get(ARM::t2LDRpci), DestReg1, 0); - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(ARM::t2LDRpci), DestReg1) - .addConstantPoolIndex(Idx)); - Opc = UseGOTOFF ? ARM::t2ADDrr : ARM::t2LDRs; - } else { - // The extra immediate is for addrmode2. - DestReg1 = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg1, 0); - AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(ARM::LDRcp), DestReg1) - .addConstantPoolIndex(Idx).addImm(0)); - Opc = UseGOTOFF ? ARM::ADDrr : ARM::LDRrs; - } + // Fix the address by adding pc. + unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); + Opc = Subtarget->isThumb() ? ARM::tPICADD : UseGOT_PREL ? ARM::PICLDR + : ARM::PICADD; + DestReg = constrainOperandRegClass(TII.get(Opc), DestReg, 0); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg) + .addReg(TempReg) + .addImm(ARMPCLabelIndex); + if (!Subtarget->isThumb()) + AddDefaultPred(MIB); - unsigned GlobalBaseReg = AFI->getGlobalBaseReg(); - if (GlobalBaseReg == 0) { - GlobalBaseReg = MRI.createVirtualRegister(TLI.getRegClassFor(VT)); - AFI->setGlobalBaseReg(GlobalBaseReg); + if (UseGOT_PREL && Subtarget->isThumb()) { + unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT)); + MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(ARM::t2LDRi12), NewDestReg) + .addReg(DestReg) + .addImm(0); + DestReg = NewDestReg; + AddOptionalDefs(MIB); } - - unsigned DestReg2 = createResultReg(TLI.getRegClassFor(VT)); - DestReg2 = constrainOperandRegClass(TII.get(Opc), DestReg2, 0); - DestReg1 = constrainOperandRegClass(TII.get(Opc), DestReg1, 1); - GlobalBaseReg = constrainOperandRegClass(TII.get(Opc), GlobalBaseReg, 2); - MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, - DbgLoc, TII.get(Opc), DestReg2) - .addReg(DestReg1) - .addReg(GlobalBaseReg); - if (!UseGOTOFF) - MIB.addImm(0); - AddOptionalDefs(MIB); - - return DestReg2; + return DestReg; } bool ARMFastISel::fastLowerArguments() { @@ -3038,7 +3036,7 @@ bool ARMFastISel::fastLowerArguments() { } - static const uint16_t GPRArgRegs[] = { + static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; @@ -3055,7 +3053,7 @@ bool ARMFastISel::fastLowerArguments() { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(DstReg, getKillRegState(true)); - updateValueMap(I, ResultReg); + updateValueMap(&*I, ResultReg); } return true; diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 6744000..c5990bb 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ 
b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCContext.h" @@ -58,7 +59,7 @@ bool ARMFrameLowering::hasFP(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); // iOS requires FP not to be clobbered for backtracing purpose. - if (STI.isTargetIOS()) + if (STI.isTargetIOS() || STI.isTargetWatchOS()) return true; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -288,7 +289,6 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI, void ARMFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - assert(&MBB == &MF.front() && "Shrink-wrapping not yet implemented"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -305,7 +305,11 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); unsigned NumBytes = MFI->getStackSize(); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; + unsigned FramePtr = RegInfo->getFrameRegister(MF); // Determine the sizes of each callee-save spill areas and record which frame @@ -489,7 +493,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, if (NumBytes) { // Adjust SP after all the callee-save spills. - if (tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes)) + if (AFI->getNumAlignedDPRCS2Regs() == 0 && + tryFoldSPUpdateIntoPushPop(STI, MF, LastPush, NumBytes)) DefCFAOffsetCandidates.addExtraBytes(LastPush, NumBytes); else { emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes, @@ -689,60 +694,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, AFI->setShouldRestoreSPFromFP(true); } -// Resolve TCReturn pseudo-instruction -void ARMFrameLowering::fixTCReturn(MachineFunction &MF, - MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); - unsigned RetOpcode = MBBI->getOpcode(); - DebugLoc dl = MBBI->getDebugLoc(); - const ARMBaseInstrInfo &TII = - *static_cast<const ARMBaseInstrInfo *>(MF.getSubtarget().getInstrInfo()); - - if (!(RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri)) - return; - - // Tail call return: adjust the stack pointer and jump to callee. - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - - // Jump to label or value in register. - if (RetOpcode == ARM::TCRETURNdi) { - unsigned TCOpcode = STI.isThumb() ? - (STI.isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND) : - ARM::TAILJMPd; - MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(TCOpcode)); - if (JumpTarget.isGlobal()) - MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), - JumpTarget.getTargetFlags()); - else { - assert(JumpTarget.isSymbol()); - MIB.addExternalSymbol(JumpTarget.getSymbolName(), - JumpTarget.getTargetFlags()); - } - - // Add the default predicate in Thumb mode. 
- if (STI.isThumb()) MIB.addImm(ARMCC::AL).addReg(0); - } else if (RetOpcode == ARM::TCRETURNri) { - BuildMI(MBB, MBBI, dl, - TII.get(STI.isThumb() ? ARM::tTAILJMPr : ARM::TAILJMPr)). - addReg(JumpTarget.getReg(), RegState::Kill); - } - - MachineInstr *NewMI = std::prev(MBBI); - for (unsigned i = 1, e = MBBI->getNumOperands(); i != e; ++i) - NewMI->addOperand(MBBI->getOperand(i)); - - // Delete the pseudo instruction TCRETURN. - MBB.erase(MBBI); - MBBI = NewMI; -} - void ARMFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->isReturn() && "Can only insert epilog into returning blocks"); - DebugLoc dl = MBBI->getDebugLoc(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); @@ -758,10 +711,12 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. - if (MF.getFunction()->getCallingConv() == CallingConv::GHC) { - fixTCReturn(MF, MBB); + if (MF.getFunction()->getCallingConv() == CallingConv::GHC) return; - } + + // First put ourselves on the first (from top) terminator instructions. + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); if (!AFI->hasStackFrame()) { if (NumBytes - ArgRegsSaveSize != 0) @@ -840,8 +795,6 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF, if (AFI->getGPRCalleeSavedArea1Size()) MBBI++; } - fixTCReturn(MF, MBB); - if (ArgRegsSaveSize) emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize); } @@ -932,12 +885,6 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF, return Offset; } -int ARMFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - unsigned FrameReg; - return getFrameIndexReference(MF, FI, FrameReg); -} - void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -950,7 +897,6 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); DebugLoc DL; - if (MI != MBB.end()) DL = MI->getDebugLoc(); SmallVector<std::pair<unsigned,bool>, 4> Regs; unsigned i = CSI.size(); @@ -1008,7 +954,8 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB, // Put any subsequent vpush instructions before this one: they will refer to // higher register numbers so need to be pushed first in order to preserve // monotonicity. 
- --MI; + if (MI != MBB.begin()) + --MI; } } @@ -1022,12 +969,20 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - DebugLoc DL = MI->getDebugLoc(); - unsigned RetOpcode = MI->getOpcode(); - bool isTailCall = (RetOpcode == ARM::TCRETURNdi || - RetOpcode == ARM::TCRETURNri); - bool isInterrupt = - RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR; + DebugLoc DL; + bool isTailCall = false; + bool isInterrupt = false; + bool isTrap = false; + if (MBB.end() != MI) { + DL = MI->getDebugLoc(); + unsigned RetOpcode = MI->getOpcode(); + isTailCall = (RetOpcode == ARM::TCRETURNdi || RetOpcode == ARM::TCRETURNri); + isInterrupt = + RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR; + isTrap = + RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl || + RetOpcode == ARM::tTRAP; + } SmallVector<unsigned, 4> Regs; unsigned i = CSI.size(); @@ -1043,11 +998,14 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, continue; if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt && - STI.hasV5TOps()) { - Reg = ARM::PC; - LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET; + !isTrap && STI.hasV5TOps()) { + if (MBB.succ_empty()) { + Reg = ARM::PC; + DeleteRet = true; + LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET; + } else + LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_UPD : ARM::LDMIA_UPD; // Fold the return instruction into the LDM. - DeleteRet = true; } // If NoGap is true, pop consecutive registers and then leave the rest @@ -1068,7 +1026,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, .addReg(ARM::SP)); for (unsigned i = 0, e = Regs.size(); i < e; ++i) MIB.addReg(Regs[i], getDefRegState(true)); - if (DeleteRet) { + if (DeleteRet && MI != MBB.end()) { MIB.copyImplicitOps(&*MI); MI->eraseFromParent(); } @@ -1095,7 +1053,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, // Put any subsequent vpop instructions after this one: they will refer to // higher register numbers so need to be popped afterwards. - ++MI; + if (MI != MBB.end()) + ++MI; } } @@ -1109,7 +1068,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) { MachineFunction &MF = *MBB.getParent(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineFrameInfo &MFI = *MF.getFrameInfo(); @@ -1118,7 +1077,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB, // slot offsets can be wrong. The offset for d8 will always be correct. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned DNum = CSI[i].getReg() - ARM::D8; - if (DNum >= 8) + if (DNum > NumAlignedDPRCS2Regs - 1) continue; int FI = CSI[i].getFrameIdx(); // The even-numbered registers will be 16-byte aligned, the odd-numbered @@ -1269,7 +1228,7 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB, const TargetRegisterInfo *TRI) { MachineFunction &MF = *MBB.getParent(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); // Find the frame index assigned to d8. 
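Editor's note, not part of the patch: with shrink wrapping enabled (see the ARMFrameLowering.h hunk further down), emitPopInst can now run in a block that does not end in a return, so the hunk above only folds LR into the pop as PC when the block has no successors and the instruction is not a tail call, interrupt return, or trap (the real code also checks for varargs). A compact standalone model of that decision, with invented names for the inputs:

#include <cstdio>

struct EpilogueCtx {
  bool BlockHasSuccessors; // shrink-wrapped epilogue, not a real return block
  bool IsTailCall;
  bool IsInterrupt;
  bool IsTrap;
  bool HasV5TOps;          // pop {..., pc} interworking needs ARMv5T
};

// Decide whether the saved LR can be restored straight into PC, folding the
// return into the pop, or must be popped back into LR.
static const char *popDestForLR(const EpilogueCtx &C) {
  if (C.IsTailCall || C.IsInterrupt || C.IsTrap || !C.HasV5TOps)
    return "lr";
  return C.BlockHasSuccessors ? "lr" : "pc";
}

int main() {
  EpilogueCtx PlainReturn{false, false, false, false, true};
  EpilogueCtx MidFunction{true, false, false, false, true};
  std::printf("plain return pops into %s, shrink-wrapped block pops into %s\n",
              popDestForLR(PlainReturn), popDestForLR(MidFunction));
  return 0;
}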
@@ -1654,13 +1613,11 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, // FIXME: We could add logic to be more precise about negative offsets // and which instructions will need a scratch register for them. Is it // worth the effort and added fragility? - bool BigStack = - (RS && - (MFI->estimateStackSize(MF) + - ((hasFP(MF) && AFI->hasStackFrame()) ? 4:0) >= - estimateRSStackSizeLimit(MF, this))) - || MFI->hasVarSizedObjects() - || (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF)); + bool BigStack = (RS && (MFI->estimateStackSize(MF) + + ((hasFP(MF) && AFI->hasStackFrame()) ? 4 : 0) >= + estimateRSStackSizeLimit(MF, this))) || + MFI->hasVarSizedObjects() || + (MFI->adjustsStack() && !canSimplifyCallFramePseudos(MF)); bool ExtraCSSpill = false; if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) { @@ -1698,8 +1655,10 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, if (CS1Spilled && !UnspilledCS1GPRs.empty()) { for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) { unsigned Reg = UnspilledCS1GPRs[i]; - // Don't spill high register if the function is thumb + // Don't spill high register if the function is thumb. In the case of + // Windows on ARM, accept R11 (frame pointer) if (!AFI->isThumbFunction() || + (STI.isTargetWindows() && Reg == ARM::R11) || isARMLowRegister(Reg) || Reg == ARM::LR) { SavedRegs.set(Reg); if (!MRI.isReserved(Reg)) @@ -1784,8 +1743,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. - unsigned Align = getStackAlignment(); - Amount = (Amount+Align-1)/Align*Align; + Amount = alignSPAdjust(Amount); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); assert(!AFI->isThumb1OnlyFunction() && @@ -1885,7 +1843,6 @@ void ARMFrameLowering::adjustForSegmentedStacks( if (!ST->isTargetAndroid() && !ST->isTargetLinux()) report_fatal_error("Segmented stacks not supported on this platform."); - assert(&PrologueMBB == &MF.front() && "Shrink-wrapping not yet implemented"); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); MCContext &Context = MMI.getContext(); @@ -1913,21 +1870,48 @@ void ARMFrameLowering::adjustForSegmentedStacks( MachineBasicBlock *GetMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *McrMBB = MF.CreateMachineBasicBlock(); - for (MachineBasicBlock::livein_iterator i = PrologueMBB.livein_begin(), - e = PrologueMBB.livein_end(); - i != e; ++i) { - AllocMBB->addLiveIn(*i); - GetMBB->addLiveIn(*i); - McrMBB->addLiveIn(*i); - PrevStackMBB->addLiveIn(*i); - PostStackMBB->addLiveIn(*i); + // Grab everything that reaches PrologueMBB to update there liveness as well. + SmallPtrSet<MachineBasicBlock *, 8> BeforePrologueRegion; + SmallVector<MachineBasicBlock *, 2> WalkList; + WalkList.push_back(&PrologueMBB); + + do { + MachineBasicBlock *CurMBB = WalkList.pop_back_val(); + for (MachineBasicBlock *PredBB : CurMBB->predecessors()) { + if (BeforePrologueRegion.insert(PredBB).second) + WalkList.push_back(PredBB); + } + } while (!WalkList.empty()); + + // The order in that list is important. + // The blocks will all be inserted before PrologueMBB using that order. + // Therefore the block that should appear first in the CFG should appear + // first in the list. 
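Editor's note, not part of the patch: the segmented-stacks change above no longer assumes the prologue block is the function entry; it walks predecessor edges from PrologueMBB to collect every block that can reach it, so their live-ins can be updated as well. A generic sketch of that reverse-CFG worklist over a plain adjacency map (the Block type and predecessor map are stand-ins, not MachineBasicBlock):

#include <cstdio>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

using Block = std::string;
using PredMap = std::unordered_map<Block, std::vector<Block>>;

// Collect every block from which `Start` is reachable, using an explicit
// worklist; each block is visited at most once.
static std::unordered_set<Block> blocksReaching(const PredMap &Preds,
                                                const Block &Start) {
  std::unordered_set<Block> Region;
  std::vector<Block> WalkList{Start};
  do {
    Block Cur = WalkList.back();
    WalkList.pop_back();
    auto It = Preds.find(Cur);
    if (It == Preds.end())
      continue;
    for (const Block &Pred : It->second)
      if (Region.insert(Pred).second)   // first visit only
        WalkList.push_back(Pred);
  } while (!WalkList.empty());
  return Region;
}

int main() {
  PredMap Preds = {{"prologue", {"entry", "retry"}}, {"retry", {"entry"}}};
  for (const Block &B : blocksReaching(Preds, "prologue"))
    std::printf("reaches prologue: %s\n", B.c_str());
  return 0;
}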
+ MachineBasicBlock *AddedBlocks[] = {PrevStackMBB, McrMBB, GetMBB, AllocMBB, + PostStackMBB}; + + for (MachineBasicBlock *B : AddedBlocks) + BeforePrologueRegion.insert(B); + + for (const auto &LI : PrologueMBB.liveins()) { + for (MachineBasicBlock *PredBB : BeforePrologueRegion) + PredBB->addLiveIn(LI); + } + + // Remove the newly added blocks from the list, since we know + // we do not have to do the following updates for them. + for (MachineBasicBlock *B : AddedBlocks) { + BeforePrologueRegion.erase(B); + MF.insert(PrologueMBB.getIterator(), B); } - MF.push_front(PostStackMBB); - MF.push_front(AllocMBB); - MF.push_front(GetMBB); - MF.push_front(McrMBB); - MF.push_front(PrevStackMBB); + for (MachineBasicBlock *MBB : BeforePrologueRegion) { + // Make sure the LiveIns are still sorted and unique. + MBB->sortUniqueLiveIns(); + // Replace the edges to PrologueMBB by edges to the sequences + // we are about to add. + MBB->ReplaceUsesOfBlockWith(&PrologueMBB, AddedBlocks[0]); + } // The required stack size that is aligned to ARM constant criterion. AlignedStackSize = alignToARMConstant(StackSize); @@ -1991,7 +1975,7 @@ void ARMFrameLowering::adjustForSegmentedStacks( ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::Create( MF.getFunction()->getContext(), "__STACK_LIMIT", PCLabelId, 0); MachineConstantPool *MCP = MF.getConstantPool(); - unsigned CPI = MCP->getConstantPoolIndex(NewCPV, MF.getAlignment()); + unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4); // ldr SR0, [pc, offset(STACK_LIMIT)] AddDefaultPred(BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0) diff --git a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h index 6fdc5ef..66f4dfb 100644 --- a/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -31,8 +31,6 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - void fixTCReturn(MachineFunction &MF, MachineBasicBlock &MBB) const; - bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -52,7 +50,6 @@ public: unsigned &FrameReg) const override; int ResolveFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg, int SPAdj) const; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; @@ -60,6 +57,11 @@ public: void adjustForSegmentedStacks(MachineFunction &MF, MachineBasicBlock &MBB) const override; + /// Returns true if the target will correctly handle shrink wrapping. 
+ bool enableShrinkWrapping(const MachineFunction &MF) const override { + return true; + } + private: void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc, diff --git a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index b110628..0242440 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -160,11 +160,6 @@ public: // Thumb Addressing Modes: bool SelectThumbAddrModeRR(SDValue N, SDValue &Base, SDValue &Offset); - bool SelectThumbAddrModeRI(SDValue N, SDValue &Base, SDValue &Offset, - unsigned Scale); - bool SelectThumbAddrModeRI5S1(SDValue N, SDValue &Base, SDValue &Offset); - bool SelectThumbAddrModeRI5S2(SDValue N, SDValue &Base, SDValue &Offset); - bool SelectThumbAddrModeRI5S4(SDValue N, SDValue &Base, SDValue &Offset); bool SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base, SDValue &OffImm); bool SelectThumbAddrModeImm5S1(SDValue N, SDValue &Base, @@ -176,8 +171,6 @@ public: bool SelectThumbAddrModeSP(SDValue N, SDValue &Base, SDValue &OffImm); // Thumb 2 Addressing Modes: - bool SelectT2ShifterOperandReg(SDValue N, - SDValue &BaseReg, SDValue &Opc); bool SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectT2AddrModeImm8(SDValue N, SDValue &Base, SDValue &OffImm); @@ -278,6 +271,22 @@ private: // Get the alignment operand for a NEON VLD or VST instruction. SDValue GetVLDSTAlign(SDValue Align, SDLoc dl, unsigned NumVecs, bool is64BitVector); + + /// Returns the number of instructions required to materialize the given + /// constant in a register, or 3 if a literal pool load is needed. + unsigned ConstantMaterializationCost(unsigned Val) const; + + /// Checks if N is a multiplication by a constant where we can extract out a + /// power of two from the constant so that it can be used in a shift, but only + /// if it simplifies the materialization of the constant. Returns true if it + /// is, and assigns to PowerOfTwo the power of two that should be extracted + /// out and to NewMulConst the new constant to be multiplied by. + bool canExtractShiftFromMul(const SDValue &N, unsigned MaxShift, + unsigned &PowerOfTwo, SDValue &NewMulConst) const; + + /// Replace N with M in CurDAG, in a way that also ensures that M gets + /// selected when N would have been selected. + void replaceDAGValue(const SDValue &N, SDValue M); }; } @@ -334,7 +343,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() { bool isThumb2 = Subtarget->isThumb(); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { - SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. + SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. 
if (N->getOpcode() != ISD::ADD) continue; @@ -388,7 +397,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() { SDValue CPTmp1; SDValue CPTmp2; if (isThumb2) { - if (SelectT2ShifterOperandReg(N0, CPTmp0, CPTmp1)) + if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1)) continue; } else { if (SelectImmShifterOperand(N0, CPTmp0, CPTmp1) || @@ -471,6 +480,61 @@ bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift, (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1)); } +unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const { + if (Subtarget->isThumb()) { + if (Val <= 255) return 1; // MOV + if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW + if (~Val <= 255) return 2; // MOV + MVN + if (ARM_AM::isThumbImmShiftedVal(Val)) return 2; // MOV + LSL + } else { + if (ARM_AM::getSOImmVal(Val) != -1) return 1; // MOV + if (ARM_AM::getSOImmVal(~Val) != -1) return 1; // MVN + if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW + if (ARM_AM::isSOImmTwoPartVal(Val)) return 2; // two instrs + } + if (Subtarget->useMovt(*MF)) return 2; // MOVW + MOVT + return 3; // Literal pool load +} + +bool ARMDAGToDAGISel::canExtractShiftFromMul(const SDValue &N, + unsigned MaxShift, + unsigned &PowerOfTwo, + SDValue &NewMulConst) const { + assert(N.getOpcode() == ISD::MUL); + assert(MaxShift > 0); + + // If the multiply is used in more than one place then changing the constant + // will make other uses incorrect, so don't. + if (!N.hasOneUse()) return false; + // Check if the multiply is by a constant + ConstantSDNode *MulConst = dyn_cast<ConstantSDNode>(N.getOperand(1)); + if (!MulConst) return false; + // If the constant is used in more than one place then modifying it will mean + // we need to materialize two constants instead of one, which is a bad idea. + if (!MulConst->hasOneUse()) return false; + unsigned MulConstVal = MulConst->getZExtValue(); + if (MulConstVal == 0) return false; + + // Find the largest power of 2 that MulConstVal is a multiple of + PowerOfTwo = MaxShift; + while ((MulConstVal % (1 << PowerOfTwo)) != 0) { + --PowerOfTwo; + if (PowerOfTwo == 0) return false; + } + + // Only optimise if the new cost is better + unsigned NewMulConstVal = MulConstVal / (1 << PowerOfTwo); + NewMulConst = CurDAG->getConstant(NewMulConstVal, SDLoc(N), MVT::i32); + unsigned OldCost = ConstantMaterializationCost(MulConstVal); + unsigned NewCost = ConstantMaterializationCost(NewMulConstVal); + return NewCost < OldCost; +} + +void ARMDAGToDAGISel::replaceDAGValue(const SDValue &N, SDValue M) { + CurDAG->RepositionNode(N.getNode()->getIterator(), M.getNode()); + CurDAG->ReplaceAllUsesWith(N, M); +} + bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, SDValue &BaseReg, SDValue &Opc, @@ -478,6 +542,24 @@ bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N, if (DisableShifterOp) return false; + // If N is a multiply-by-constant and it's profitable to extract a shift and + // use it in a shifted operand do so. + if (N.getOpcode() == ISD::MUL) { + unsigned PowerOfTwo = 0; + SDValue NewMulConst; + if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) { + BaseReg = SDValue(Select(CurDAG->getNode(ISD::MUL, SDLoc(N), MVT::i32, + N.getOperand(0), NewMulConst) + .getNode()), + 0); + replaceDAGValue(N.getOperand(1), NewMulConst); + Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl, + PowerOfTwo), + SDLoc(N), MVT::i32); + return true; + } + } + ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); // Don't match base register only case. 
That is matched to a separate @@ -662,6 +744,18 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, } } + // If Offset is a multiply-by-constant and it's profitable to extract a shift + // and use it in a shifted operand do so. + if (Offset.getOpcode() == ISD::MUL) { + unsigned PowerOfTwo = 0; + SDValue NewMulConst; + if (canExtractShiftFromMul(Offset, 31, PowerOfTwo, NewMulConst)) { + replaceDAGValue(Offset.getOperand(1), NewMulConst); + ShAmt = PowerOfTwo; + ShOpcVal = ARM_AM::lsl; + } + } + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal), SDLoc(N), MVT::i32); return true; @@ -1086,77 +1180,13 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDValue N, } bool -ARMDAGToDAGISel::SelectThumbAddrModeRI(SDValue N, SDValue &Base, - SDValue &Offset, unsigned Scale) { - if (Scale == 4) { - SDValue TmpBase, TmpOffImm; - if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm)) - return false; // We want to select tLDRspi / tSTRspi instead. - - if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() == ISD::TargetConstantPool) - return false; // We want to select tLDRpci instead. - } - - if (!CurDAG->isBaseWithConstantOffset(N)) - return false; - - // Thumb does not have [sp, r] address mode. - RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); - RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1)); - if ((LHSR && LHSR->getReg() == ARM::SP) || - (RHSR && RHSR->getReg() == ARM::SP)) - return false; - - // FIXME: Why do we explicitly check for a match here and then return false? - // Presumably to allow something else to match, but shouldn't this be - // documented? - int RHSC; - if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) - return false; - - Base = N.getOperand(0); - Offset = N.getOperand(1); - return true; -} - -bool -ARMDAGToDAGISel::SelectThumbAddrModeRI5S1(SDValue N, - SDValue &Base, - SDValue &Offset) { - return SelectThumbAddrModeRI(N, Base, Offset, 1); -} - -bool -ARMDAGToDAGISel::SelectThumbAddrModeRI5S2(SDValue N, - SDValue &Base, - SDValue &Offset) { - return SelectThumbAddrModeRI(N, Base, Offset, 2); -} - -bool -ARMDAGToDAGISel::SelectThumbAddrModeRI5S4(SDValue N, - SDValue &Base, - SDValue &Offset) { - return SelectThumbAddrModeRI(N, Base, Offset, 4); -} - -bool ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, SDValue &Base, SDValue &OffImm) { - if (Scale == 4) { - SDValue TmpBase, TmpOffImm; - if (SelectThumbAddrModeSP(N, TmpBase, TmpOffImm)) - return false; // We want to select tLDRspi / tSTRspi instead. - - if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() == ISD::TargetConstantPool) - return false; // We want to select tLDRpci instead. 
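Editor's note, not part of the patch: the canExtractShiftFromMul/replaceDAGValue helpers added earlier in this file implement a small strength reduction: a multiply by C = M << K can feed an ARM shifted operand as (x * M) << K, and the rewrite is only committed when the smaller constant M is strictly cheaper to materialize (per ConstantMaterializationCost). The standalone sketch below shows just the arithmetic split; it peels off every factor of two, whereas the real helper also caps the shift at MaxShift.

#include <cassert>
#include <cstdio>

// Split a multiplier into (NewMulConst << PowerOfTwo) with PowerOfTwo as
// large as possible, i.e. peel off every factor of two.
static void splitMultiplier(unsigned MulConst, unsigned &NewMulConst,
                            unsigned &PowerOfTwo) {
  assert(MulConst != 0 && "cannot split a multiply by zero");
  PowerOfTwo = 0;
  while ((MulConst & 1u) == 0) {
    MulConst >>= 1;
    ++PowerOfTwo;
  }
  NewMulConst = MulConst;
}

int main() {
  unsigned NewMulConst, PowerOfTwo;
  splitMultiplier(20, NewMulConst, PowerOfTwo);   // 20 == 5 << 2
  unsigned X = 7;
  // The DAG rewrite relies on exactly this identity:
  assert(X * 20 == (X * NewMulConst) << PowerOfTwo);
  std::printf("x*20 becomes (x*%u) << %u\n", NewMulConst, PowerOfTwo);
  return 0;
}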
- } - if (!CurDAG->isBaseWithConstantOffset(N)) { - if (N.getOpcode() == ARMISD::Wrapper && - N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { + if (N.getOpcode() == ISD::ADD) { + return false; // We want to select register offset instead + } else if (N.getOpcode() == ARMISD::Wrapper && + N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) { Base = N.getOperand(0); } else { Base = N; @@ -1166,23 +1196,6 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, return true; } - RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0)); - RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1)); - if ((LHSR && LHSR->getReg() == ARM::SP) || - (RHSR && RHSR->getReg() == ARM::SP)) { - ConstantSDNode *LHS = dyn_cast<ConstantSDNode>(N.getOperand(0)); - ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1)); - unsigned LHSC = LHS ? LHS->getZExtValue() : 0; - unsigned RHSC = RHS ? RHS->getZExtValue() : 0; - - // Thumb does not have [sp, #imm5] address mode for non-zero imm5. - if (LHSC != 0 || RHSC != 0) return false; - - Base = N; - OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); - return true; - } - // If the RHS is + imm5 * scale, fold into addr mode. int RHSC; if (isScaledConstantInRange(N.getOperand(1), Scale, 0, 32, RHSC)) { @@ -1191,9 +1204,8 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale, return true; } - Base = N.getOperand(0); - OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); - return true; + // Offset is too large, so use register offset instead. + return false; } bool @@ -1263,28 +1275,6 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N, //===----------------------------------------------------------------------===// -bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDValue N, SDValue &BaseReg, - SDValue &Opc) { - if (DisableShifterOp) - return false; - - ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); - - // Don't match base register only case. That is matched to a separate - // lower complexity pattern with explicit register operand. - if (ShOpcVal == ARM_AM::no_shift) return false; - - BaseReg = N.getOperand(0); - unsigned ShImmVal = 0; - if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { - ShImmVal = RHS->getZExtValue() & 31; - Opc = getI32Imm(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal), SDLoc(N)); - return true; - } - - return false; -} - bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm) { // Match simple R + imm12 operands. @@ -1425,6 +1415,17 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, } } + // If OffReg is a multiply-by-constant and it's profitable to extract a shift + // and use it in a shifted operand do so. + if (OffReg.getOpcode() == ISD::MUL) { + unsigned PowerOfTwo = 0; + SDValue NewMulConst; + if (canExtractShiftFromMul(OffReg, 3, PowerOfTwo, NewMulConst)) { + replaceDAGValue(OffReg.getOperand(1), NewMulConst); + ShAmt = PowerOfTwo; + } + } + ShImm = CurDAG->getTargetConstant(ShAmt, SDLoc(N), MVT::i32); return true; @@ -2503,25 +2504,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case ISD::Constant: { unsigned Val = cast<ConstantSDNode>(N)->getZExtValue(); - bool UseCP = true; - if (Subtarget->useMovt(*MF)) - // Thumb2-aware targets have the MOVT instruction, so all immediates can - // be done with MOV + MOVT, at worst. 
- UseCP = false; - else { - if (Subtarget->isThumb()) { - UseCP = (Val > 255 && // MOV - ~Val > 255 && // MOV + MVN - !ARM_AM::isThumbImmShiftedVal(Val) && // MOV + LSL - !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW - } else - UseCP = (ARM_AM::getSOImmVal(Val) == -1 && // MOV - ARM_AM::getSOImmVal(~Val) == -1 && // MVN - !ARM_AM::isSOImmTwoPartVal(Val) && // two instrs. - !(Subtarget->hasV6T2Ops() && Val <= 0xffff)); // MOVW - } - - if (UseCP) { + // If we can't materialize the constant we need to use a literal pool + if (ConstantMaterializationCost(Val) > 2) { SDValue CPIdx = CurDAG->getTargetConstantPool( ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val), TLI->getPointerTy(CurDAG->getDataLayout())); @@ -3376,7 +3360,7 @@ static void getIntOperandsFromRegisterString(StringRef RegString, SelectionDAG *CurDAG, SDLoc DL, std::vector<SDValue>& Ops) { SmallVector<StringRef, 5> Fields; - RegString.split(Fields, ":"); + RegString.split(Fields, ':'); if (Fields.size() > 1) { bool AllIntFields = true; @@ -3461,9 +3445,9 @@ static inline int getMClassRegisterSYSmValueMask(StringRef RegString) { // The flags here are common to those allowed for apsr in the A class cores and // those allowed for the special registers in the M class cores. Returns a // value representing which flags were present, -1 if invalid. -static inline int getMClassFlagsMask(StringRef Flags) { +static inline int getMClassFlagsMask(StringRef Flags, bool hasDSP) { if (Flags.empty()) - return 0x3; + return 0x2 | (int)hasDSP; return StringSwitch<int>(Flags) .Case("g", 0x1) @@ -3492,7 +3476,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead, } // We know we are now handling a write so need to get the mask for the flags. - int Mask = getMClassFlagsMask(Flags); + int Mask = getMClassFlagsMask(Flags, Subtarget->hasDSP()); // Only apsr, iapsr, eapsr, xpsr can have flags. The other register values // shouldn't have flags present. @@ -3501,7 +3485,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead, // The _g and _nzcvqg versions are only valid if the DSP extension is // available. - if (!Subtarget->hasThumb2DSP() && (Mask & 0x2)) + if (!Subtarget->hasDSP() && (Mask & 0x1)) return -1; // The register was valid so need to put the mask in the correct place @@ -3523,7 +3507,7 @@ static int getARClassRegisterMask(StringRef Reg, StringRef Flags) { // The flags permitted for apsr are the same flags that are allowed in // M class registers. We get the flag value and then shift the flags into // the correct place to combine with the mask. - Mask = getMClassFlagsMask(Flags); + Mask = getMClassFlagsMask(Flags, true); if (Mask == -1) return -1; return Mask << 2; @@ -3742,7 +3726,7 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){ } SmallVector<StringRef, 5> Fields; - StringRef(SpecialReg).split(Fields, "_", 1, false); + StringRef(SpecialReg).split(Fields, '_', 1, false); std::string Reg = Fields[0].str(); StringRef Flags = Fields.size() == 2 ? Fields[1] : ""; @@ -3943,6 +3927,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, // be an immediate and not a memory constraint. // Fallthrough. 
case InlineAsm::Constraint_m: + case InlineAsm::Constraint_o: case InlineAsm::Constraint_Q: case InlineAsm::Constraint_Um: case InlineAsm::Constraint_Un: diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp index 8cc06df..9cfb06b 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -142,6 +142,11 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); + + if (!VT.isFloatingPoint() && + VT != MVT::v2i64 && VT != MVT::v1i64) + for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) + setOperationAction(Opcode, VT, Legal); } void ARMTargetLowering::addDRTypeForNEON(MVT VT) { @@ -166,77 +171,78 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // Uses VFP for Thumb libfuncs if available. if (Subtarget->isThumb() && Subtarget->hasVFP2() && Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { - // Single-precision floating-point arithmetic. - setLibcallName(RTLIB::ADD_F32, "__addsf3vfp"); - setLibcallName(RTLIB::SUB_F32, "__subsf3vfp"); - setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp"); - setLibcallName(RTLIB::DIV_F32, "__divsf3vfp"); - - // Double-precision floating-point arithmetic. - setLibcallName(RTLIB::ADD_F64, "__adddf3vfp"); - setLibcallName(RTLIB::SUB_F64, "__subdf3vfp"); - setLibcallName(RTLIB::MUL_F64, "__muldf3vfp"); - setLibcallName(RTLIB::DIV_F64, "__divdf3vfp"); - - // Single-precision comparisons. - setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp"); - setLibcallName(RTLIB::UNE_F32, "__nesf2vfp"); - setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp"); - setLibcallName(RTLIB::OLE_F32, "__lesf2vfp"); - setLibcallName(RTLIB::OGE_F32, "__gesf2vfp"); - setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp"); - setLibcallName(RTLIB::UO_F32, "__unordsf2vfp"); - setLibcallName(RTLIB::O_F32, "__unordsf2vfp"); - - setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE); - setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ); - - // Double-precision comparisons. - setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp"); - setLibcallName(RTLIB::UNE_F64, "__nedf2vfp"); - setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp"); - setLibcallName(RTLIB::OLE_F64, "__ledf2vfp"); - setLibcallName(RTLIB::OGE_F64, "__gedf2vfp"); - setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp"); - setLibcallName(RTLIB::UO_F64, "__unorddf2vfp"); - setLibcallName(RTLIB::O_F64, "__unorddf2vfp"); - - setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::UO_F64, ISD::SETNE); - setCmpLibcallCC(RTLIB::O_F64, ISD::SETEQ); - - // Floating-point to integer conversions. - // i64 conversions are done via library routines even when generating VFP - // instructions, so use the same ones. 
- setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp"); - setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp"); - setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp"); - setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp"); - - // Conversions between floating types. - setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp"); - setLibcallName(RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp"); + static const struct { + const RTLIB::Libcall Op; + const char * const Name; + const ISD::CondCode Cond; + } LibraryCalls[] = { + // Single-precision floating-point arithmetic. + { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, + { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, + + // Double-precision floating-point arithmetic. + { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, + { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, + { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, + { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, + + // Single-precision comparisons. + { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, + { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, + { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, + { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, + { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, + { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, + { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, + { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ }, + + // Double-precision comparisons. + { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, + { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, + { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, + { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, + { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, + { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, + { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, + { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ }, + + // Floating-point to integer conversions. + // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, + + // Conversions between floating types. + { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, + { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, + + // Integer to floating-point conversions. + // i64 conversions are done via library routines even when generating VFP + // instructions, so use the same ones. + // FIXME: There appears to be some naming inconsistency in ARM libgcc: + // e.g., __floatunsidf vs. __floatunssidfvfp. + { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + if (LC.Cond != ISD::SETCC_INVALID) + setCmpLibcallCC(LC.Op, LC.Cond); + } + } - // Integer to floating-point conversions. - // i64 conversions are done via library routines even when generating VFP - // instructions, so use the same ones. - // FIXME: There appears to be some naming inconsistency in ARM libgcc: - // e.g., __floatunsidf vs. 
__floatunssidfvfp. - setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp"); - setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp"); - setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp"); - setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp"); + // Set the correct calling convention for ARMv7k WatchOS. It's just + // AAPCS_VFP for functions as simple as libcalls. + if (Subtarget->isTargetWatchOS()) { + for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) + setLibcallCallingConv((RTLIB::Libcall)i, CallingConv::ARM_AAPCS_VFP); } } @@ -245,8 +251,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); - if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetMachO() && - !Subtarget->isTargetWindows()) { + // RTLIB + if (Subtarget->isAAPCS_ABI() && + (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || + Subtarget->isTargetAndroid())) { static const struct { const RTLIB::Libcall Op; const char * const Name; @@ -334,12 +342,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - - // Memory operations - // RTABI chapter 4.3.4 - { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, - { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, }; for (const auto &LC : LibraryCalls) { @@ -348,6 +350,30 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (LC.Cond != ISD::SETCC_INVALID) setCmpLibcallCC(LC.Op, LC.Cond); } + + // EABI dependent RTLIB + if (TM.Options.EABIVersion == EABI::EABI4 || + TM.Options.EABIVersion == EABI::EABI5) { + static const struct { + const RTLIB::Libcall Op; + const char *const Name; + const CallingConv::ID CC; + const ISD::CondCode Cond; + } MemOpsLibraryCalls[] = { + // Memory operations + // RTABI chapter 4.3.4 + { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, + }; + + for (const auto &LC : MemOpsLibraryCalls) { + setLibcallName(LC.Op, LC.Name); + setLibcallCallingConv(LC.Op, LC.CC); + if (LC.Cond != ISD::SETCC_INVALID) + setCmpLibcallCC(LC.Op, LC.Cond); + } + } } if (Subtarget->isTargetWindows()) { @@ -364,6 +390,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SDIV_I32, "__rt_sdiv", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::UDIV_I32, "__rt_udiv", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::SDIV_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS_VFP }, + { RTLIB::UDIV_I64, "__rt_udiv64", CallingConv::ARM_AAPCS_VFP }, }; for (const auto &LC : LibraryCalls) { @@ -373,8 +403,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } // Use divmod compiler-rt calls for iOS 5.0 and later. 
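Editorial sketch, not part of the diff (the enums and the main routine are simplified stand-ins): the refactoring above replaces long runs of setLibcallName/setCmpLibcallCC calls with one table plus one loop. The pattern in isolation:

#include <cstdio>

enum Libcall { ADD_F32, SUB_F32, OEQ_F32, NUM_LIBCALLS };
enum CondCode { SETCC_INVALID, SETNE };

static const char *Names[NUM_LIBCALLS];
static CondCode CmpResults[NUM_LIBCALLS];

void setLibcallName(Libcall Op, const char *Name) { Names[Op] = Name; }
void setCmpLibcallCC(Libcall Op, CondCode CC) { CmpResults[Op] = CC; }

int main() {
  // One table describes every runtime routine and its comparison result.
  static const struct {
    Libcall Op;
    const char *const Name;
    CondCode Cond;
  } LibraryCalls[] = {
    { ADD_F32, "__addsf3vfp", SETCC_INVALID },
    { SUB_F32, "__subsf3vfp", SETCC_INVALID },
    { OEQ_F32, "__eqsf2vfp",  SETNE },
  };

  // A single loop applies the settings, exactly as in the hunk above.
  for (const auto &LC : LibraryCalls) {
    setLibcallName(LC.Op, LC.Name);
    if (LC.Cond != SETCC_INVALID)
      setCmpLibcallCC(LC.Op, LC.Cond);
  }

  std::printf("%s\n", Names[OEQ_F32]); // prints __eqsf2vfp
  return 0;
}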
- if (Subtarget->getTargetTriple().isiOS() && - !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) { + if (Subtarget->isTargetWatchOS() || + (Subtarget->isTargetIOS() && + !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); } @@ -392,6 +423,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); } + // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have + // a __gnu_ prefix (which is the default). + if (Subtarget->isTargetAEABI()) { + setLibcallName(RTLIB::FPROUND_F32_F16, "__aeabi_f2h"); + setLibcallName(RTLIB::FPROUND_F64_F16, "__aeabi_d2h"); + setLibcallName(RTLIB::FPEXT_F16_F32, "__aeabi_h2f"); + } + if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else @@ -579,7 +618,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); - setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); @@ -605,7 +643,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ADDC); if (Subtarget->isFPOnlySP()) { - // When targetting a floating-point unit with only single-precision + // When targeting a floating-point unit with only single-precision // operations, f64 is legal for the few double-precision instructions which // are present However, no double-precision operations other than moves, // loads and stores are provided by the hardware. @@ -689,7 +727,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); } if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() - || (Subtarget->isThumb2() && !Subtarget->hasThumb2DSP())) + || (Subtarget->isThumb2() && !Subtarget->hasDSP())) setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); @@ -706,8 +744,15 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::SUBE, MVT::i32, Custom); } + if (!Subtarget->isThumb1Only()) + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + // ARM does not have ROTL. - setOperationAction(ISD::ROTL, MVT::i32, Expand); + setOperationAction(ISD::ROTL, MVT::i32, Expand); + for (MVT VT : MVT::vector_valuetypes()) { + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); + } setOperationAction(ISD::CTTZ, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i32, Expand); if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) @@ -717,7 +762,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i32 , Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF , MVT::i32 , Expand); - setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); + // @llvm.readcyclecounter requires the Performance Monitors extension. + // Default to the 0 expansion on unsupported platforms. + // FIXME: Technically there are older ARM CPUs that have + // implementation-specific ways of obtaining this information. + if (Subtarget->hasPerfMon()) + setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); // Only ARMv6 has BSWAP. 
if (!Subtarget->hasV6Ops()) @@ -726,15 +776,17 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) && !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) { // These are expanded into libcalls if the cpu doesn't have HW divider. - setOperationAction(ISD::SDIV, MVT::i32, Expand); - setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::SDIV, MVT::i32, LibCall); + setOperationAction(ISD::UDIV, MVT::i32, LibCall); } - // FIXME: Also set divmod for SREM on EABI setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); // Register based DivRem for AEABI (RTABI 4.2) - if (Subtarget->isTargetAEABI()) { + if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) { + setOperationAction(ISD::SREM, MVT::i64, Custom); + setOperationAction(ISD::UREM, MVT::i64, Custom); + setLibcallName(RTLIB::SDIVREM_I8, "__aeabi_idivmod"); setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod"); setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod"); @@ -762,7 +814,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); - setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); @@ -776,13 +827,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - if (!Subtarget->isTargetMachO()) { - // Non-MachO platforms may return values in these registers via the - // personality function. - setExceptionPointerRegister(ARM::R0); - setExceptionSelectorRegister(ARM::R1); - } - if (Subtarget->getTargetTriple().isWindowsItaniumEnvironment()) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); else @@ -849,11 +893,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - if (Subtarget->isTargetDarwin()) { - setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); - setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); + setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); + if (Subtarget->useSjLjEH()) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); - } setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); @@ -912,7 +956,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, if (Subtarget->hasSinCos()) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); - if (Subtarget->getTargetTriple().isiOS()) { + if (Subtarget->isTargetWatchOS()) { + setLibcallCallingConv(RTLIB::SINCOS_F32, CallingConv::ARM_AAPCS_VFP); + setLibcallCallingConv(RTLIB::SINCOS_F64, CallingConv::ARM_AAPCS_VFP); + } + if (Subtarget->isTargetIOS() || Subtarget->isTargetWatchOS()) { // For iOS, we don't want to the normal expansion of a libcall to // sincos. We want to issue a libcall to __sincos_stret. 
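Editorial note, not part of the diff: the register-based DivRem handling above relies on the RTABI's combined routines (__aeabi_idivmod returns the quotient in r0 and the remainder in r1), so a division and a remainder of the same operands can share one runtime call. A minimal C++ illustration of the pattern the backend can now fold:

struct DivMod {
  int Quotient;
  int Remainder;
};

// Both results come from the same operands; on an AEABI target the pair can
// be lowered to a single __aeabi_idivmod call rather than two libcalls.
DivMod divmod(int Numerator, int Denominator) {
  return { Numerator / Denominator, Numerator % Denominator };
}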
setOperationAction(ISD::FSINCOS, MVT::f64, Custom); @@ -928,6 +976,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + if (!Subtarget->isFPOnlySP()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); @@ -935,8 +990,22 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); } } + + if (Subtarget->hasNEON()) { + // vmin and vmax aren't available in a scalar form, so we use + // a NEON instruction with an undef lane instead. + setOperationAction(ISD::FMINNAN, MVT::f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); + } + // We have target-specific dag combine patterns for the following nodes: // ARMISD::VMOVRRD - No need to call setTargetDAGCombine setTargetDAGCombine(ISD::ADD); @@ -959,11 +1028,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, //// temporary - rewrite interface to use type MaxStoresPerMemset = 8; - MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemsetOptSize = 4; MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores - MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 4 : 2; + MaxStoresPerMemcpyOptSize = 2; MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores - MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 4 : 2; + MaxStoresPerMemmoveOptSize = 2; // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. 
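Editorial sketch, standalone and not the backend code: the nodes made Legal above differ only in NaN handling, which is why both families exist. VMINNM/VMAXNM follow IEEE-754 minNum/maxNum, while the plain NEON VMIN/VMAX (modelled by FMINNAN/FMAXNAN) propagate a NaN operand:

#include <cmath>

// minNum flavour: a single NaN operand is ignored, the other value wins.
float fminnum_like(float A, float B) {
  if (std::isnan(A)) return B;
  if (std::isnan(B)) return A;
  return A < B ? A : B;
}

// NEON vmin flavour: any NaN operand makes the result NaN.
float fminnan_like(float A, float B) {
  if (std::isnan(A) || std::isnan(B)) return NAN;
  return A < B ? A : B;
}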
@@ -1054,8 +1123,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::CMOV: return "ARMISD::CMOV"; - case ARMISD::RBIT: return "ARMISD::RBIT"; - case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; @@ -1069,7 +1136,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; - case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; + case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH"; case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; @@ -1082,6 +1150,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; case ARMISD::WIN__CHKSTK: return "ARMISD:::WIN__CHKSTK"; + case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; case ARMISD::VCEQ: return "ARMISD::VCEQ"; case ARMISD::VCEQZ: return "ARMISD::VCEQZ"; @@ -1133,14 +1202,11 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; - case ARMISD::FMAX: return "ARMISD::FMAX"; - case ARMISD::FMIN: return "ARMISD::FMIN"; - case ARMISD::VMAXNM: return "ARMISD::VMAX"; - case ARMISD::VMINNM: return "ARMISD::VMIN"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; case ARMISD::VBSL: return "ARMISD::VBSL"; + case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; @@ -1449,9 +1515,10 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); - return DAG.getStore(Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(LocMemOffset), - false, false, 0); + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + false, false, 0); } void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG, @@ -1734,9 +1801,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get the address of the callee into a register SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), false, false, - false, 0); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *Sym = S->getSymbol(); @@ -1748,9 +1816,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get the address of the callee into a register SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), false, false, - false, 0); + Callee = DAG.getLoad( + 
PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); @@ -1768,7 +1837,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMISD::WrapperPIC, dl, PtrVt, DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(), false, false, true, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, true, 0); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); @@ -1781,7 +1851,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } else { // On ELF targets for PIC code, direct calls should go through the PLT unsigned OpFlags = 0; @@ -1804,9 +1875,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ARMPCLabelIndex, 4); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), false, false, - false, 0); + Callee = DAG.getLoad( + PtrVt, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); } else { @@ -1821,7 +1893,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // FIXME: handle tail calls differently. unsigned CallOpc; - bool HasMinSizeAttr = MF.getFunction()->hasFnAttribute(Attribute::MinSize); if (Subtarget->isThumb()) { if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; @@ -1831,8 +1902,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (!isDirect && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else if (doesNotRet && isDirect && Subtarget->hasRAS() && - // Emit regular call when code size is the priority - !HasMinSizeAttr) + // Emit regular call when code size is the priority + !MF.getFunction()->optForMinSize()) // "mov lr, pc; b _foo" to avoid confusing the RSP CallOpc = ARMISD::CALL_NOLINK; else @@ -2014,6 +2085,8 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; + assert(Subtarget->supportsTailCall()); + // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. @@ -2033,26 +2106,6 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (isCalleeStructRet || isCallerStructRet) return false; - // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: - // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as - // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation - // support in the assembler and linker to be used. This would need to be - // fixed to fully support tail calls in Thumb1. 
- // - // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take - // LR. This means if we need to reload LR, it takes an extra instructions, - // which outweighs the value of the tail call; but here we don't know yet - // whether LR is going to be used. Probably the right approach is to - // generate the tail call here and turn it back into CALL/RET in - // emitEpilogue if LR is used. - - // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, - // but we need to make sure there are enough registers; the only valid - // registers are the 4 used for parameters. We don't currently do this - // case. - if (Subtarget->isThumb1Only()) - return false; - // Externally-defined functions with weak linkage should not be // tail-called on ARM when the OS does not support dynamic // pre-emption of symbols, as the AAELF spec requires normal calls @@ -2400,7 +2453,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; - return !Subtarget->isThumb1Only(); + return true; } // Trying to write a 64 bit value so need to split into two 32 bit values first, @@ -2467,9 +2520,10 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); } CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + SDValue Result = + DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 0); if (RelocM == Reloc::Static) return Result; SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); @@ -2491,9 +2545,10 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); - Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Argument = + DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 0); SDValue Chain = Argument.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); @@ -2543,17 +2598,19 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, true); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); Chain = Offset.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } else { // local exec model assert(model == TLSModel::LocalExec); @@ -2561,9 +2618,10 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); 
Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } // The address of the thread local variable is the add of the thread @@ -2577,6 +2635,8 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "TLS not implemented for non-ELF targets"); GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); @@ -2597,22 +2657,31 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SDLoc dl(Op); const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { - bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); - ARMConstantPoolValue *CPV = - ARMConstantPoolConstant::Create(GV, - UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT); + bool UseGOT_PREL = + !(GV->hasHiddenVisibility() || GV->hasLocalLinkage()); + + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDLoc dl(Op); + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create( + GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, + UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier, + /*AddCurrentAddress=*/UseGOT_PREL); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), - CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + SDValue Result = DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); SDValue Chain = Result.getValue(1); - SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); - Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT); - if (!UseGOTOFF) + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); + Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); + if (UseGOT_PREL) Result = DAG.getLoad(PtrVT, dl, Chain, Result, - MachinePointerInfo::getGOT(), + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); return Result; } @@ -2628,9 +2697,10 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, } else { SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + return DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); } } @@ -2654,7 +2724,8 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, 
false, 0); return Result; } @@ -2680,32 +2751,11 @@ SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, TargetFlags)); if (GV->hasDLLImportStorageClass()) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); return Result; } -SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, - SelectionDAG &DAG) const { - assert(Subtarget->isTargetELF() && - "GLOBAL OFFSET TABLE not implemented for non-ELF targets"); - MachineFunction &MF = DAG.getMachineFunction(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - SDLoc dl(Op); - unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; - ARMConstantPoolValue *CPV = - ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_", - ARMPCLabelIndex, PCAdj); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); - CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); - return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); -} - SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -2722,6 +2772,13 @@ ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); } +SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, + SelectionDAG &DAG) const { + SDLoc dl(Op); + return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, + Op.getOperand(0)); +} + SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { @@ -2732,7 +2789,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, case Intrinsic::arm_rbit: { assert(Op.getOperand(1).getValueType() == MVT::i32 && "RBIT intrinsic must have i32 type!"); - return DAG.getNode(ARMISD::RBIT, dl, MVT::i32, Op.getOperand(1)); + return DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Op.getOperand(1)); } case Intrinsic::arm_thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2752,10 +2809,10 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, ARMCP::CPLSDA, PCAdj); CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = - DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + SDValue Result = DAG.getLoad( + PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); if (RelocM == Reloc::PIC_) { SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); @@ -2770,6 +2827,36 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } + case Intrinsic::arm_neon_vminnm: + case Intrinsic::arm_neon_vmaxnm: { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) + ? 
ISD::FMINNUM : ISD::FMAXNUM; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::arm_neon_vminu: + case Intrinsic::arm_neon_vmaxu: { + if (Op.getValueType().isFloatingPoint()) + return SDValue(); + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) + ? ISD::UMIN : ISD::UMAX; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + case Intrinsic::arm_neon_vmins: + case Intrinsic::arm_neon_vmaxs: { + // v{min,max}s is overloaded between signed integers and floats. + if (!Op.getValueType().isFloatingPoint()) { + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) + ? ISD::SMIN : ISD::SMAX; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } + unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) + ? ISD::FMINNAN : ISD::FMAXNAN; + return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + } } } @@ -2870,9 +2957,10 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, // Create load node to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + ArgValue2 = DAG.getLoad( + MVT::i32, dl, Root, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, false, 0); } else { Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); @@ -3056,9 +3144,10 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, if (VA.isMemLoc()) { int FI = MFI->CreateFixedObject(8, VA.getLocMemOffset(), true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + ArgValue2 = DAG.getLoad( + MVT::f64, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0); } else { ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); @@ -3139,9 +3228,9 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, "Byval arguments cannot be implicit"); unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); - int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, CurOrigArg, - CurByValIndex, VA.getLocMemOffset(), - Flags.getByValSize()); + int FrameIndex = StoreByValRegs( + CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, + VA.getLocMemOffset(), Flags.getByValSize()); InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); CCInfo.nextInRegsParam(); } else { @@ -3151,9 +3240,10 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain, // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, PtrVT); - InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0)); + InVals.push_back(DAG.getLoad( + VA.getValVT(), dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0)); } lastInsIndex = index; } @@ -3188,13 +3278,9 @@ static bool isFloatingPointZero(SDValue Op) { // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) // created by LowerConstantFP(). 
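Editorial restatement (the enum and helper are illustrative, not LLVM's): the intrinsic lowering above has to pick a generic node from the operand type because the NEON min/max intrinsics are overloaded. Roughly:

enum class GenericMinOp { SMin, UMin, FMinNum, FMinNaNPropagating };

// vminnm is always the IEEE minNum form, vminu is unsigned-integer only, and
// vmins is shared between signed integers and floats, so the element type
// decides which generic opcode is used.
GenericMinOp classifyVMin(bool IsNumVariant, bool IsUnsigned, bool IsFloat) {
  if (IsNumVariant) return GenericMinOp::FMinNum;
  if (IsUnsigned)   return GenericMinOp::UMin;
  return IsFloat ? GenericMinOp::FMinNaNPropagating : GenericMinOp::SMin;
}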
SDValue BitcastOp = Op->getOperand(0); - if (BitcastOp->getOpcode() == ARMISD::VMOVIMM) { - SDValue MoveOp = BitcastOp->getOperand(0); - if (MoveOp->getOpcode() == ISD::TargetConstant && - cast<ConstantSDNode>(MoveOp)->getZExtValue() == 0) { - return true; - } - } + if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && + isNullConstant(BitcastOp->getOperand(0))) + return true; } return false; } @@ -3559,113 +3645,6 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { // Try to generate VMAXNM/VMINNM on ARMv8. if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || TrueVal.getValueType() == MVT::f64)) { - // We can use VMAXNM/VMINNM for a compare followed by a select with the - // same operands, as follows: - // c = fcmp [?gt, ?ge, ?lt, ?le] a, b - // select c, a, b - // In NoNaNsFPMath the CC will have been changed from, e.g., 'ogt' to 'gt'. - bool swapSides = false; - if (!getTargetMachine().Options.NoNaNsFPMath) { - // transformability may depend on which way around we compare - switch (CC) { - default: - break; - case ISD::SETOGT: - case ISD::SETOGE: - case ISD::SETOLT: - case ISD::SETOLE: - // the non-NaN should be RHS - swapSides = DAG.isKnownNeverNaN(LHS) && !DAG.isKnownNeverNaN(RHS); - break; - case ISD::SETUGT: - case ISD::SETUGE: - case ISD::SETULT: - case ISD::SETULE: - // the non-NaN should be LHS - swapSides = DAG.isKnownNeverNaN(RHS) && !DAG.isKnownNeverNaN(LHS); - break; - } - } - swapSides = swapSides || (LHS == FalseVal && RHS == TrueVal); - if (swapSides) { - CC = ISD::getSetCCSwappedOperands(CC); - std::swap(LHS, RHS); - } - if (LHS == TrueVal && RHS == FalseVal) { - bool canTransform = true; - // FIXME: FastMathFlags::noSignedZeros() doesn't appear reachable from here - if (!getTargetMachine().Options.UnsafeFPMath && - !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { - const ConstantFPSDNode *Zero; - switch (CC) { - default: - break; - case ISD::SETOGT: - case ISD::SETUGT: - case ISD::SETGT: - // RHS must not be -0 - canTransform = (Zero = dyn_cast<ConstantFPSDNode>(RHS)) && - !Zero->isNegative(); - break; - case ISD::SETOGE: - case ISD::SETUGE: - case ISD::SETGE: - // LHS must not be -0 - canTransform = (Zero = dyn_cast<ConstantFPSDNode>(LHS)) && - !Zero->isNegative(); - break; - case ISD::SETOLT: - case ISD::SETULT: - case ISD::SETLT: - // RHS must not be +0 - canTransform = (Zero = dyn_cast<ConstantFPSDNode>(RHS)) && - Zero->isNegative(); - break; - case ISD::SETOLE: - case ISD::SETULE: - case ISD::SETLE: - // LHS must not be +0 - canTransform = (Zero = dyn_cast<ConstantFPSDNode>(LHS)) && - Zero->isNegative(); - break; - } - } - if (canTransform) { - // Note: If one of the elements in a pair is a number and the other - // element is NaN, the corresponding result element is the number. - // This is consistent with the IEEE 754-2008 standard. - // Therefore, a > b ? 
a : b <=> vmax(a,b), if b is constant and a is NaN - switch (CC) { - default: - break; - case ISD::SETOGT: - case ISD::SETOGE: - if (!DAG.isKnownNeverNaN(RHS)) - break; - return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS); - case ISD::SETUGT: - case ISD::SETUGE: - if (!DAG.isKnownNeverNaN(LHS)) - break; - case ISD::SETGT: - case ISD::SETGE: - return DAG.getNode(ARMISD::VMAXNM, dl, VT, LHS, RHS); - case ISD::SETOLT: - case ISD::SETOLE: - if (!DAG.isKnownNeverNaN(RHS)) - break; - return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS); - case ISD::SETULT: - case ISD::SETULE: - if (!DAG.isKnownNeverNaN(LHS)) - break; - case ISD::SETLT: - case ISD::SETLE: - return DAG.getNode(ARMISD::VMINNM, dl, VT, LHS, RHS); - } - } - } - bool swpCmpOps = false; bool swpVselOps = false; checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); @@ -3890,16 +3869,18 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { Addr, Op.getOperand(2), JTI); } if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { - Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, - MachinePointerInfo::getJumpTable(), - false, false, false, 0); + Addr = + DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), + false, false, false, 0); Chain = Addr.getValue(1); Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } else { - Addr = DAG.getLoad(PTy, dl, Chain, Addr, - MachinePointerInfo::getJumpTable(), - false, false, false, 0); + Addr = + DAG.getLoad(PTy, dl, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), + false, false, false, 0); Chain = Addr.getValue(1); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } @@ -3936,7 +3917,7 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); - return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1, + return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } @@ -3988,7 +3969,7 @@ SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); - return makeLibCall(DAG, LC, Op.getValueType(), &Op.getOperand(0), 1, + return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } @@ -4153,6 +4134,56 @@ static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results, Results.push_back(Read.getOperand(0)); } +/// \p BC is a bitcast that is about to be turned into a VMOVDRR. +/// When \p DstVT, the destination type of \p BC, is on the vector +/// register bank and the source of bitcast, \p Op, operates on the same bank, +/// it might be possible to combine them, such that everything stays on the +/// vector register bank. +/// \p return The node that would replace \p BT, if the combine +/// is possible. +static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, + SelectionDAG &DAG) { + SDValue Op = BC->getOperand(0); + EVT DstVT = BC->getValueType(0); + + // The only vector instruction that can produce a scalar (remember, + // since the bitcast was about to be turned into VMOVDRR, the source + // type is i64) from a vector is EXTRACT_VECTOR_ELT. + // Moreover, we can do this combine only if there is one use. 
+ // Finally, if the destination type is not a vector, there is not + // much point on forcing everything on the vector bank. + if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !Op.hasOneUse()) + return SDValue(); + + // If the index is not constant, we will introduce an additional + // multiply that will stick. + // Give up in that case. + ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!Index) + return SDValue(); + unsigned DstNumElt = DstVT.getVectorNumElements(); + + // Compute the new index. + const APInt &APIntIndex = Index->getAPIntValue(); + APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); + NewIndex *= APIntIndex; + // Check if the new constant index fits into i32. + if (NewIndex.getBitWidth() > 32) + return SDValue(); + + // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> + // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) + SDLoc dl(Op); + SDValue ExtractSrc = Op.getOperand(0); + EVT VecVT = EVT::getVectorVT( + *DAG.getContext(), DstVT.getScalarType(), + ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); + SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, + DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); +} + /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 @@ -4172,6 +4203,11 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) { // Turn i64->f64 into VMOVDRR. if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { + // Do not force values to GPRs (this is what VMOVDRR does for the inputs) + // if we can combine the bitcast with its source. + if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) + return Val; + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, @@ -4383,7 +4419,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, if (!ST->hasV6T2Ops()) return SDValue(); - SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0)); + SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); return DAG.getNode(ISD::CTLZ, dl, VT, rbit); } @@ -4544,8 +4580,7 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, "Unknown shift to lower!"); // We only lower SRA, SRL of 1 here, all others use generic lowering. - if (!isa<ConstantSDNode>(N->getOperand(1)) || - cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1) + if (!isOneConstant(N->getOperand(1))) return SDValue(); // If we are in thumb mode, we don't have RRX. @@ -5036,18 +5071,56 @@ static bool isVTBLMask(ArrayRef<int> M, EVT VT) { return VT == MVT::v8i8 && M.size() == 8; } +// Checks whether the shuffle mask represents a vector transpose (VTRN) by +// checking that pairs of elements in the shuffle mask represent the same index +// in each vector, incrementing the expected index by 2 at each step. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} +// v2={e,f,g,h} +// WhichResult gives the offset for each element in the mask based on which +// of the two results it belongs to. 
+// +// The transpose can be represented either as: +// result1 = shufflevector v1, v2, result1_shuffle_mask +// result2 = shufflevector v1, v2, result2_shuffle_mask +// where v1/v2 and the shuffle masks have the same number of elements +// (here WhichResult (see below) indicates which result is being checked) +// +// or as: +// results = shufflevector v1, v2, shuffle_mask +// where both results are returned in one vector and the shuffle mask has twice +// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we +// want to check the low half and high half of the shuffle mask as if it were +// the other case static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || - (M[i+1] >= 0 && (unsigned) M[i+1] != i + NumElts + WhichResult)) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + // If the mask is twice as long as the input vector then we need to check the + // upper and lower parts of the mask with a matching value for WhichResult + // FIXME: A mask with only even values will be rejected in case the first + // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only + // M[0] is used to determine WhichResult + for (unsigned i = 0; i < M.size(); i += NumElts) { + if (M.size() == NumElts * 2) + WhichResult = i / NumElts; + else + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) + return false; + } } + + if (M.size() == NumElts*2) + WhichResult = 0; + return true; } @@ -5060,28 +5133,55 @@ static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != i + WhichResult) || - (M[i+1] >= 0 && (unsigned) M[i+1] != i + WhichResult)) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + if (M.size() == NumElts * 2) + WhichResult = i / NumElts; + else + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) + return false; + } } + + if (M.size() == NumElts*2) + WhichResult = 0; + return true; } +// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking +// that the mask elements are either all even and in steps of size 2 or all odd +// and in steps of size 2. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} +// v2={e,f,g,h} +// Requires similar checks to that of isVTRNMask with +// respect the how results are returned. static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 
0 : 1); - for (unsigned i = 0; i != NumElts; ++i) { - if (M[i] < 0) continue; // ignore UNDEF indices - if ((unsigned) M[i] != 2 * i + WhichResult) - return false; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; ++j) { + if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) + return false; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5097,18 +5197,27 @@ static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ if (EltSz == 64) return false; - unsigned Half = VT.getVectorNumElements() / 2; - WhichResult = (M[0] == 0 ? 0 : 1); - for (unsigned j = 0; j != 2; ++j) { - unsigned Idx = WhichResult; - for (unsigned i = 0; i != Half; ++i) { - int MIdx = M[i + j * Half]; - if (MIdx >= 0 && (unsigned) MIdx != Idx) - return false; - Idx += 2; + unsigned NumElts = VT.getVectorNumElements(); + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + unsigned Half = NumElts / 2; + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + for (unsigned j = 0; j < NumElts; j += Half) { + unsigned Idx = WhichResult; + for (unsigned k = 0; k < Half; ++k) { + int MIdx = M[i + j + k]; + if (MIdx >= 0 && (unsigned) MIdx != Idx) + return false; + Idx += 2; + } } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5116,21 +5225,37 @@ static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return true; } +// Checks whether the shuffle mask represents a vector zip (VZIP) by checking +// that pairs of elements of the shufflemask represent the same index in each +// vector incrementing sequentially through the vectors. +// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] +// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} +// v2={e,f,g,h} +// Requires similar checks to that of isVTRNMask with respect the how results +// are returned. static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != Idx) || - (M[i+1] >= 0 && (unsigned) M[i+1] != Idx + NumElts)) - return false; - Idx += 1; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) + return false; + Idx += 1; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5147,15 +5272,23 @@ static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){ return false; unsigned NumElts = VT.getVectorNumElements(); - WhichResult = (M[0] == 0 ? 
0 : 1); - unsigned Idx = WhichResult * NumElts / 2; - for (unsigned i = 0; i != NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned) M[i] != Idx) || - (M[i+1] >= 0 && (unsigned) M[i+1] != Idx)) - return false; - Idx += 1; + if (M.size() != NumElts && M.size() != NumElts*2) + return false; + + for (unsigned i = 0; i < M.size(); i += NumElts) { + WhichResult = M[i] == 0 ? 0 : 1; + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned j = 0; j < NumElts; j += 2) { + if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || + (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) + return false; + Idx += 1; + } } + if (M.size() == NumElts*2) + WhichResult = 0; + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; @@ -5329,16 +5462,14 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // just use VDUPLANE. We can only do this if the lane being extracted // is at a constant index, as the VDUP from lane instructions only have // constant-index forms. + ConstantSDNode *constIndex; if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && - isa<ConstantSDNode>(Value->getOperand(1))) { + (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) { // We need to create a new undef vector to use for the VDUPLANE if the // size of the vector from which we get the value is different than the // size of the vector that we need to create. We will insert the element // such that the register coalescer will remove unnecessary copies. if (VT != Value->getOperand(0).getValueType()) { - ConstantSDNode *constIndex; - constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)); - assert(constIndex && "The index is not a constant!"); unsigned index = constIndex->getAPIntValue().getLimitedValue() % VT.getVectorNumElements(); N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, @@ -5437,14 +5568,35 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // shuffle in combination with VEXTs. SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); - SmallVector<SDValue, 2> SourceVecs; - SmallVector<unsigned, 2> MinElts; - SmallVector<unsigned, 2> MaxElts; + struct ShuffleSourceInfo { + SDValue Vec; + unsigned MinElt; + unsigned MaxElt; + + // We may insert some combination of BITCASTs and VEXT nodes to force Vec to + // be compatible with the shuffle we intend to construct. As a result + // ShuffleVec will be some sliding window into the original Vec. + SDValue ShuffleVec; + + // Code should guarantee that element i in Vec starts at element "WindowBase + // + i * WindowScale in ShuffleVec". + int WindowBase; + int WindowScale; + + bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } + ShuffleSourceInfo(SDValue Vec) + : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), + WindowScale(1) {} + }; + // First gather all vectors used as an immediate source for this BUILD_VECTOR + // node. + SmallVector<ShuffleSourceInfo, 2> Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) @@ -5453,127 +5605,166 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, // A shuffle can only come from building a vector from various // elements of other vectors. 
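Editorial note (standalone arithmetic, not part of the diff): the WindowBase/WindowScale fields introduced above encode where an original element ends up once a source has been padded, extracted, or bitcast, and the final shuffle mask is built from exactly this formula:

// Element EltNo of the original source vector is expected at this lane of
// the rewritten ShuffleVec. E.g. a v2i64 source bitcast to v4i32 gives
// WindowScale = 2, so element 1 maps to lane 2 when WindowBase is 0.
int shuffleLaneFor(int EltNo, int WindowBase, int WindowScale) {
  return WindowBase + EltNo * WindowScale;
}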
return SDValue(); - } else if (V.getOperand(0).getValueType().getVectorElementType() != - VT.getVectorElementType()) { - // This code doesn't know how to handle shuffles where the vector - // element types do not match (this happens because type legalization - // promotes the return type of EXTRACT_VECTOR_ELT). - // FIXME: It might be appropriate to extend this code to handle - // mismatched types. + } else if (!isa<ConstantSDNode>(V.getOperand(1))) { + // Furthermore, shuffles require a constant mask, whereas extractelts + // accept variable indices. return SDValue(); } - // Record this extraction against the appropriate vector if possible... + // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); - // If the element number isn't a constant, we can't effectively - // analyze what's going on. - if (!isa<ConstantSDNode>(V.getOperand(1))) - return SDValue(); - unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); - bool FoundSource = false; - for (unsigned j = 0; j < SourceVecs.size(); ++j) { - if (SourceVecs[j] == SourceVec) { - if (MinElts[j] > EltNo) - MinElts[j] = EltNo; - if (MaxElts[j] < EltNo) - MaxElts[j] = EltNo; - FoundSource = true; - break; - } - } + auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); + if (Source == Sources.end()) + Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); - // Or record a new source if not... - if (!FoundSource) { - SourceVecs.push_back(SourceVec); - MinElts.push_back(EltNo); - MaxElts.push_back(EltNo); - } + // Update the minimum and maximum lane number seen. + unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); + Source->MinElt = std::min(Source->MinElt, EltNo); + Source->MaxElt = std::max(Source->MaxElt, EltNo); } // Currently only do something sane when at most two source vectors - // involved. - if (SourceVecs.size() > 2) + // are involved. + if (Sources.size() > 2) return SDValue(); - SDValue ShuffleSrcs[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT) }; - int VEXTOffsets[2] = {0, 0}; + // Find out the smallest element size among result and two sources, and use + // it as element size to build the shuffle_vector. + EVT SmallestEltTy = VT.getVectorElementType(); + for (auto &Source : Sources) { + EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); + if (SrcEltTy.bitsLT(SmallestEltTy)) + SmallestEltTy = SrcEltTy; + } + unsigned ResMultiplier = + VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); + NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); + EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); + + // If the source vector is too wide or too narrow, we may nevertheless be able + // to construct a compatible shuffle either by concatenating it with UNDEF or + // extracting a suitable range of elements. + for (auto &Src : Sources) { + EVT SrcVT = Src.ShuffleVec.getValueType(); + + if (SrcVT.getSizeInBits() == VT.getSizeInBits()) + continue; + + // This stage of the search produces a source with the same element type as + // the original, but with a total width matching the BUILD_VECTOR output. + EVT EltVT = SrcVT.getVectorElementType(); + unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); + EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); - // This loop extracts the usage patterns of the source vectors - // and prepares appropriate SDValues for a shuffle if possible. 
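Editorial worked example (types chosen for illustration): with a v4i32 result built from lanes of a v8i16 source, SmallestEltTy is i16, so ResMultiplier is 32/16 = 2 and the shuffle itself is carried out as a v8i16 shuffle before the result is bitcast back to v4i32. The computation in isolation:

// How many lanes of the common (smallest-element) shuffle type one result
// element occupies.
unsigned resMultiplier(unsigned ResultEltBits, unsigned SmallestEltBits) {
  return ResultEltBits / SmallestEltBits; // 32 / 16 == 2 in the example
}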
- for (unsigned i = 0; i < SourceVecs.size(); ++i) { - if (SourceVecs[i].getValueType() == VT) { - // No VEXT necessary - ShuffleSrcs[i] = SourceVecs[i]; - VEXTOffsets[i] = 0; + if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { + if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) + return SDValue(); + // We can pad out the smaller vector for free, so if it's part of a + // shuffle... + Src.ShuffleVec = + DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, + DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; - } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) { - // It probably isn't worth padding out a smaller vector just to - // break it down again in a shuffle. - return SDValue(); } - // Since only 64-bit and 128-bit vectors are legal on ARM and - // we've eliminated the other cases... - assert(SourceVecs[i].getValueType().getVectorNumElements() == 2*NumElts && - "unexpected vector sizes in ReconstructShuffle"); + if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) + return SDValue(); - if (MaxElts[i] - MinElts[i] >= NumElts) { + if (Src.MaxElt - Src.MinElt >= NumSrcElts) { // Span too large for a VEXT to cope return SDValue(); } - if (MinElts[i] >= NumElts) { + if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half - VEXTOffsets[i] = NumElts; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(NumElts, dl)); - } else if (MaxElts[i] < NumElts) { + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i32)); + Src.WindowBase = -NumSrcElts; + } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half - VEXTOffsets[i] = 0; - ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(0, dl)); + Src.ShuffleVec = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i32)); } else { // An actual VEXT is needed - VEXTOffsets[i] = MinElts[i]; - SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(0, dl)); - SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - SourceVecs[i], - DAG.getIntPtrConstant(NumElts, dl)); - ShuffleSrcs[i] = DAG.getNode(ARMISD::VEXT, dl, VT, VEXTSrc1, VEXTSrc2, - DAG.getConstant(VEXTOffsets[i], dl, - MVT::i32)); - } - } - - SmallVector<int, 8> Mask; - - for (unsigned i = 0; i < NumElts; ++i) { + SDValue VEXTSrc1 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(0, dl, MVT::i32)); + SDValue VEXTSrc2 = + DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, + DAG.getConstant(NumSrcElts, dl, MVT::i32)); + + Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, + VEXTSrc2, + DAG.getConstant(Src.MinElt, dl, MVT::i32)); + Src.WindowBase = -Src.MinElt; + } + } + + // Another possible incompatibility occurs from the vector element types. We + // can fix this by bitcasting the source vectors to the same type we intend + // for the shuffle. 
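For a source that is twice as wide as the result, the code above picks either half of it or a VEXT window and records the WindowBase correction applied to later mask entries. A standalone model of that decision, with a hypothetical name:

// Returns false when the used lanes span more than NumSrcElts, i.e. no single
// window covers them; otherwise reports where the window starts and the
// WindowBase adjustment for the mask.
static bool pickExtractWindow(unsigned MinElt, unsigned MaxElt,
                              unsigned NumSrcElts, unsigned &WindowStart,
                              int &WindowBase) {
  if (MaxElt - MinElt >= NumSrcElts)
    return false;                       // span too large for a VEXT
  if (MinElt >= NumSrcElts) {
    WindowStart = NumSrcElts;           // second half only
    WindowBase = -(int)NumSrcElts;
  } else if (MaxElt < NumSrcElts) {
    WindowStart = 0;                    // first half only
    WindowBase = 0;
  } else {
    WindowStart = MinElt;               // a real VEXT straddling both halves
    WindowBase = -(int)MinElt;
  }
  return true;
}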
+ for (auto &Src : Sources) { + EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); + if (SrcEltTy == SmallestEltTy) + continue; + assert(ShuffleVT.getVectorElementType() == SmallestEltTy); + Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); + Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); + Src.WindowBase *= Src.WindowScale; + } + + // Final sanity check before we try to actually produce a shuffle. + DEBUG( + for (auto Src : Sources) + assert(Src.ShuffleVec.getValueType() == ShuffleVT); + ); + + // The stars all align, our next step is to produce the mask for the shuffle. + SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); + int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); + for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) { - Mask.push_back(-1); + if (Entry.getOpcode() == ISD::UNDEF) continue; - } - SDValue ExtractVec = Entry.getOperand(0); - int ExtractElt = cast<ConstantSDNode>(Op.getOperand(i) - .getOperand(1))->getSExtValue(); - if (ExtractVec == SourceVecs[0]) { - Mask.push_back(ExtractElt - VEXTOffsets[0]); - } else { - Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]); - } + auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); + int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); + + // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit + // trunc. So only std::min(SrcBits, DestBits) actually get defined in this + // segment. + EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); + int BitsDefined = std::min(OrigEltTy.getSizeInBits(), + VT.getVectorElementType().getSizeInBits()); + int LanesDefined = BitsDefined / BitsPerShuffleLane; + + // This source is expected to fill ResMultiplier lanes of the final shuffle, + // starting at the appropriate offset. + int *LaneMask = &Mask[i * ResMultiplier]; + + int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; + ExtractBase += NumElts * (Src - Sources.begin()); + for (int j = 0; j < LanesDefined; ++j) + LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... - if (isShuffleMaskLegal(Mask, VT)) - return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1], - &Mask[0]); + if (!isShuffleMaskLegal(Mask, ShuffleVT)) + return SDValue(); - return SDValue(); + // We can't handle more than two sources. This should have already + // been checked before this point. + assert(Sources.size() <= 2 && "Too many sources!"); + + SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; + for (unsigned i = 0; i < Sources.size(); ++i) + ShuffleOps[i] = Sources[i].ShuffleVec; + + SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], + ShuffleOps[1], &Mask[0]); + return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } /// isShuffleMaskLegal - Targets can use this to indicate that they only @@ -6235,6 +6426,8 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? 
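The mask construction at the end of the ReconstructShuffle hunk above maps each BUILD_VECTOR operand to a run of shuffle lanes via WindowScale and WindowBase. A standalone restatement of that index arithmetic, names invented for the sketch:

// Element EltNo taken from source SrcIdx (0 or 1) fills LanesDefined lanes of
// the mask starting at lane i * ResMultiplier, with values beginning at
// EltNo * WindowScale + WindowBase + SrcIdx * NumElts.
static void writeLaneMask(int *Mask, unsigned i, unsigned ResMultiplier,
                          unsigned SrcIdx, unsigned NumElts, int EltNo,
                          int WindowScale, int WindowBase, int LanesDefined) {
  int ExtractBase = EltNo * WindowScale + WindowBase + (int)(SrcIdx * NumElts);
  for (int j = 0; j < LanesDefined; ++j)
    Mask[i * ResMultiplier + j] = ExtractBase + j;
}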
+ // Convert to float // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); @@ -6265,6 +6458,8 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) { static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? + SDValue N2; // Convert to float. // float4 yf = vcvt_f32_s32(vmovl_s16(y)); @@ -6337,6 +6532,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { + // TODO: Should this propagate fast-math-flags? EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::UDIV"); @@ -6445,45 +6641,56 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { auto PtrVT = getPointerTy(DAG.getDataLayout()); MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. - StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); - - // Create stack object for sret. + Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr); auto &DL = DAG.getDataLayout(); - const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); - int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); - SDValue SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); ArgListTy Args; - ArgListEntry Entry; - - Entry.Node = SRet; - Entry.Ty = RetTy->getPointerTo(); - Entry.isSExt = false; - Entry.isZExt = false; - Entry.isSRet = true; - Args.push_back(Entry); + bool ShouldUseSRet = Subtarget->isAPCS_ABI(); + SDValue SRet; + if (ShouldUseSRet) { + // Create stack object for sret. + const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); + const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); + int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); + + ArgListEntry Entry; + Entry.Node = SRet; + Entry.Ty = RetTy->getPointerTo(); + Entry.isSExt = false; + Entry.isZExt = false; + Entry.isSRet = true; + Args.push_back(Entry); + RetTy = Type::getVoidTy(*DAG.getContext()); + } + ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.isSExt = false; Entry.isZExt = false; Args.push_back(Entry); - const char *LibcallName = (ArgVT == MVT::f64) - ? "__sincos_stret" : "__sincosf_stret"; + const char *LibcallName = + (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; + RTLIB::Libcall LC = + (ArgVT == MVT::f64) ? 
RTLIB::SINCOS_F64 : RTLIB::SINCOS_F32; + CallingConv::ID CC = getLibcallCallingConv(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), Callee, - std::move(Args), 0) - .setDiscardResult(); - + CLI.setDebugLoc(dl) + .setChain(DAG.getEntryNode()) + .setCallee(CC, RetTy, Callee, std::move(Args), 0) + .setDiscardResult(ShouldUseSRet); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + if (!ShouldUseSRet) + return CallResult.first; + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo(), false, false, false, 0); @@ -6498,6 +6705,85 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { LoadSin.getValue(0), LoadCos.getValue(0)); } +SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, + bool Signed, + SDValue &Chain) const { + EVT VT = Op.getValueType(); + assert((VT == MVT::i32 || VT == MVT::i64) && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + const auto &DL = DAG.getDataLayout(); + const auto &TLI = DAG.getTargetLoweringInfo(); + + const char *Name = nullptr; + if (Signed) + Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; + else + Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; + + SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); + + ARMTargetLowering::ArgListTy Args; + + for (auto AI : {1, 0}) { + ArgListEntry Arg; + Arg.Node = Op.getOperand(AI); + Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); + Args.push_back(Arg); + } + + CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), + ES, std::move(Args), 0); + + return LowerCallTo(CLI).first; +} + +SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, + bool Signed) const { + assert(Op.getValueType() == MVT::i32 && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, + DAG.getEntryNode(), Op.getOperand(1)); + + return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); +} + +void ARMTargetLowering::ExpandDIV_Windows( + SDValue Op, SelectionDAG &DAG, bool Signed, + SmallVectorImpl<SDValue> &Results) const { + const auto &DL = DAG.getDataLayout(); + const auto &TLI = DAG.getTargetLoweringInfo(); + + assert(Op.getValueType() == MVT::i64 && + "unexpected type for custom lowering DIV"); + SDLoc dl(Op); + + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), + DAG.getConstant(0, dl, MVT::i32)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op.getOperand(1), + DAG.getConstant(1, dl, MVT::i32)); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, Lo, Hi); + + SDValue DBZCHK = + DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Or); + + SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); + + SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); + SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, + DAG.getConstant(32, dl, TLI.getPointerTy(DL))); + Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); + + Results.push_back(Lower); + Results.push_back(Upper); +} + static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { // Monotonic load/store is legal for all targets if (cast<AtomicSDNode>(Op)->getOrdering() <= 
Monotonic) @@ -6513,36 +6799,22 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { SDLoc DL(N); - SDValue Cycles32, OutChain; - - if (Subtarget->hasPerfMon()) { - // Under Power Management extensions, the cycle-count is: - // mrc p15, #0, <Rt>, c9, c13, #0 - SDValue Ops[] = { N->getOperand(0), // Chain - DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), - DAG.getConstant(15, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32), - DAG.getConstant(9, DL, MVT::i32), - DAG.getConstant(13, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i32) - }; - - Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, - DAG.getVTList(MVT::i32, MVT::Other), Ops); - OutChain = Cycles32.getValue(1); - } else { - // Intrinsic is defined to return 0 on unsupported platforms. Technically - // there are older ARM CPUs that have implementation-specific ways of - // obtaining this information (FIXME!). - Cycles32 = DAG.getConstant(0, DL, MVT::i32); - OutChain = DAG.getEntryNode(); - } - + // Under Power Management extensions, the cycle-count is: + // mrc p15, #0, <Rt>, c9, c13, #0 + SDValue Ops[] = { N->getOperand(0), // Chain + DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), + DAG.getConstant(15, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(9, DL, MVT::i32), + DAG.getConstant(13, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i32) + }; - SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, - Cycles32, DAG.getConstant(0, DL, MVT::i32)); - Results.push_back(Cycles64); - Results.push_back(OutChain); + SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, + DAG.getVTList(MVT::i32, MVT::Other), Ops); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, + DAG.getConstant(0, DL, MVT::i32))); + Results.push_back(Cycles32.getValue(1)); } SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -6576,15 +6848,17 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); - case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); + case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); case ISD::SHL: case ISD::SRL: case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); + case ISD::SREM: return LowerREM(Op.getNode(), DAG); + case ISD::UREM: return LowerREM(Op.getNode(), DAG); case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); @@ -6622,13 +6896,14 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("Don't know how to custom lower this!"); case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ARMISD::WIN__DBZCHK: return SDValue(); } } /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. 
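At run time the simplified ReplaceREADCYCLECOUNTER above amounts to reading the 32-bit cycle counter (PMCCNTR) with the mrc shown in its comment and zero-extending it into the 64-bit readcyclecounter result. A GCC/Clang inline-asm sketch; it is only a sketch and only works on an ARMv7 target with user-mode access to the PMU enabled:

#include <cstdint>

static uint64_t readCycleCounter() {
  uint32_t Cycles32;
  __asm__ volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(Cycles32));
  return (uint64_t)Cycles32;            // BUILD_PAIR(Cycles32, 0)
}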
void ARMTargetLowering::ReplaceNodeResults(SDNode *N, - SmallVectorImpl<SDValue>&Results, + SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { SDValue Res; switch (N->getOpcode()) { @@ -6644,9 +6919,18 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N, case ISD::SRA: Res = Expand64BitShift(N, DAG, Subtarget); break; + case ISD::SREM: + case ISD::UREM: + Res = LowerREM(N, DAG); + break; case ISD::READCYCLECOUNTER: ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); return; + case ISD::UDIV: + case ISD::SDIV: + assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); + return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, + Results); } if (Res.getNode()) Results.push_back(Res); @@ -6683,12 +6967,12 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = - MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(), - MachineMemOperand::MOLoad, 4, 4); + MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), + MachineMemOperand::MOLoad, 4, 4); MachineMemOperand *FIMMOSt = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, 4, 4); + MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), + MachineMemOperand::MOStore, 4, 4); // Load the address of the dispatch MBB into the jump buffer. if (isThumb2) { @@ -6792,7 +7076,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, MachineModuleInfo &MMI = MF->getMMI(); for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; ++BB) { - if (!BB->isLandingPad()) continue; + if (!BB->isEHPad()) continue; // FIXME: We should assert that the EH_LABEL is the first MI in the landing // pad. @@ -6807,7 +7091,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, for (SmallVectorImpl<unsigned>::iterator CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); CSI != CSE; ++CSI) { - CallSiteNumToLPad[*CSI].push_back(BB); + CallSiteNumToLPad[*CSI].push_back(&*BB); MaxCSNum = std::max(MaxCSNum, *CSI); } break; @@ -6840,7 +7124,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // Shove the dispatch's address into the return slot in the function context. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); - DispatchBB->setIsLandingPad(); + DispatchBB->setIsEHPad(); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); unsigned trap_opcode; @@ -6864,10 +7148,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // context. 
SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); - MachineMemOperand *FIMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad | - MachineMemOperand::MOVolatile, 4, 4); + MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), + MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); MachineInstrBuilder MIB; MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); @@ -6982,9 +7265,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3)); - MachineMemOperand *JTMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) @@ -7066,9 +7348,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, AddDefaultPred(BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) .addJumpTableIndex(MJTI)); - MachineMemOperand *JTMMOLd = - MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(), - MachineMemOperand::MOLoad, 4, 4); + MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( + MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); AddDefaultPred( BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) @@ -7109,13 +7390,14 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, BB->succ_end()); while (!Successors.empty()) { MachineBasicBlock *SMBB = Successors.pop_back_val(); - if (SMBB->isLandingPad()) { + if (SMBB->isEHPad()) { BB->removeSuccessor(SMBB); MBBLPads.push_back(SMBB); } } - BB->addSuccessor(DispatchBB); + BB->addSuccessor(DispatchBB, BranchProbability::getZero()); + BB->normalizeSuccProbs(); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from @@ -7157,7 +7439,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr *MI, // landing pad now. for (SmallVectorImpl<MachineBasicBlock*>::iterator I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) - (*I)->setIsLandingPad(false); + (*I)->setIsEHPad(false); // The instruction is gone now. MI->eraseFromParent(); @@ -7280,8 +7562,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI, // Otherwise, we will generate unrolled scalar copies. 
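A rough, non-authoritative model of the dispatch logic the SjLj hunks above assemble: the call-site number stored in the function context indexes a jump table of landing-pad addresses, one 32-bit entry per call site.

#include <cstdint>

// Stand-in for the jump-table load emitted in DispContBB; the real code does
// this with LEApcrelJT plus an indexed load, not a C array access.
static uint32_t dispatchTarget(const uint32_t *JumpTable, uint32_t CallSiteNum) {
  return JumpTable[CallSiteNum];        // address of the selected landing pad
}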
const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI->getOperand(0).getReg(); unsigned src = MI->getOperand(1).getReg(); @@ -7574,6 +7855,32 @@ ARMTargetLowering::EmitLowered__chkstk(MachineInstr *MI, } MachineBasicBlock * +ARMTargetLowering::EmitLowered__dbzchk(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MBB->getParent(); + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + + MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); + MF->push_back(ContBB); + ContBB->splice(ContBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(MI)), MBB->end()); + MBB->addSuccessor(ContBB); + + MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); + MF->push_back(TrapBB); + BuildMI(TrapBB, DL, TII->get(ARM::t2UDF)).addImm(249); + MBB->addSuccessor(TrapBB); + + BuildMI(*MBB, MI, DL, TII->get(ARM::tCBZ)) + .addReg(MI->getOperand(0).getReg()) + .addMBB(TrapBB); + + MI->eraseFromParent(); + return ContBB; +} + +MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); @@ -7643,8 +7950,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -7741,6 +8047,9 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case ARM::tInt_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp_nofp: + return BB; + + case ARM::Int_eh_sjlj_setup_dispatch: EmitSjLjDispatchBlock(MI, BB); return BB; @@ -7759,8 +8068,7 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) // SinkBB: V1 = PHI(V2, V3) const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator BBI = BB; - ++BBI; + MachineFunction::iterator BBI = ++BB->getIterator(); MachineFunction *Fn = BB->getParent(); MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); @@ -7824,11 +8132,46 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return EmitStructByval(MI, BB); case ARM::WIN__CHKSTK: return EmitLowered__chkstk(MI, BB); + case ARM::WIN__DBZCHK: + return EmitLowered__dbzchk(MI, BB); + } +} + +/// \brief Attaches vregs to MEMCPY that it will use as scratch registers +/// when it is expanded into LDM/STM. This is done as a post-isel lowering +/// instead of as a custom inserter because we need the use list from the SDNode. +static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, + MachineInstr *MI, const SDNode *Node) { + bool isThumb1 = Subtarget->isThumb1Only(); + + DebugLoc DL = MI->getDebugLoc(); + MachineFunction *MF = MI->getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineInstrBuilder MIB(*MF, MI); + + // If the new dst/src is unused mark it as dead. 
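The EmitLowered__dbzchk block above expands WIN__DBZCHK into a compare-and-branch-on-zero to a trap block (udf #249), with the division continuing in the fall-through block. A plain C++ stand-in for that control flow, using the GCC/Clang __builtin_trap in place of the undefined-instruction trap:

#include <cstdint>

static int32_t checkedDiv(int32_t N, int32_t D) {
  if (D == 0)
    __builtin_trap();   // TrapBB reached via tCBZ on the divisor
  return N / D;         // ContBB: the actual __rt_sdiv-style division
}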
+ if (!Node->hasAnyUseOfValue(0)) { + MI->getOperand(0).setIsDead(true); + } + if (!Node->hasAnyUseOfValue(1)) { + MI->getOperand(1).setIsDead(true); + } + + // The MEMCPY both defines and kills the scratch registers. + for (unsigned I = 0; I != MI->getOperand(4).getImm(); ++I) { + unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass + : &ARM::GPRRegClass); + MIB.addReg(TmpReg, RegState::Define|RegState::Dead); } } void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, SDNode *Node) const { + if (MI->getOpcode() == ARM::MEMCPY) { + attachMEMCPYScratchRegs(Subtarget, MI, Node); + return; + } + const MCInstrDesc *MCID = &MI->getDesc(); // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, // RSC. Coming out of isel, they have an implicit CPSR def, but the optional @@ -7898,10 +8241,7 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI, // Helper function that checks if N is a null or all ones constant. static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N); - if (!C) - return false; - return AllOnes ? C->isAllOnesValue() : C->isNullValue(); + return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); } // Return true if N is conditionally 0 or all ones. @@ -8723,12 +9063,88 @@ static SDValue PerformXORCombine(SDNode *N, return SDValue(); } -/// PerformBFICombine - (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff -/// the bits being cleared by the AND are not demanded by the BFI. +// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, +// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and +// their position in "to" (Rd). +static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { + assert(N->getOpcode() == ARMISD::BFI); + + SDValue From = N->getOperand(1); + ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue(); + FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); + + // If the Base came from a SHR #C, we can deduce that it is really testing bit + // #C in the base of the SHR. + if (From->getOpcode() == ISD::SRL && + isa<ConstantSDNode>(From->getOperand(1))) { + APInt Shift = cast<ConstantSDNode>(From->getOperand(1))->getAPIntValue(); + assert(Shift.getLimitedValue() < 32 && "Shift too large!"); + FromMask <<= Shift.getLimitedValue(31); + From = From->getOperand(0); + } + + return From; +} + +// If A and B contain one contiguous set of bits, does A | B == A . B? +// +// Neither A nor B must be zero. +static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { + unsigned LastActiveBitInA = A.countTrailingZeros(); + unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; + return LastActiveBitInA - 1 == FirstActiveBitInB; +} + +static SDValue FindBFIToCombineWith(SDNode *N) { + // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, + // if one exists. + APInt ToMask, FromMask; + SDValue From = ParseBFI(N, ToMask, FromMask); + SDValue To = N->getOperand(0); + + // Now check for a compatible BFI to merge with. We can pass through BFIs that + // aren't compatible, but not if they set the same bit in their destination as + // we do (or that of any BFI we're going to combine with). 
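BitsProperlyConcatenate above asks whether two contiguous masks butt up against each other so that their union is still one contiguous run. A standalone 32-bit version using the GCC/Clang builtins that correspond to the APInt helpers; both masks must be non-zero, as the original comment notes:

#include <cstdint>

static bool properlyConcatenate(uint32_t A, uint32_t B) {
  unsigned LastActiveBitInA  = __builtin_ctz(A);       // countTrailingZeros
  unsigned FirstActiveBitInB = 31 - __builtin_clz(B);  // highest set bit
  return LastActiveBitInA - 1 == FirstActiveBitInB;
}

// e.g. A = 0b1100 and B = 0b0011 concatenate (A|B == 0b1111), while
// A = 0b1000 and B = 0b0011 leave a hole and do not.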
+ SDValue V = To; + APInt CombinedToMask = ToMask; + while (V.getOpcode() == ARMISD::BFI) { + APInt NewToMask, NewFromMask; + SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); + if (NewFrom != From) { + // This BFI has a different base. Keep going. + CombinedToMask |= NewToMask; + V = V.getOperand(0); + continue; + } + + // Do the written bits conflict with any we've seen so far? + if ((NewToMask & CombinedToMask).getBoolValue()) + // Conflicting bits - bail out because going further is unsafe. + return SDValue(); + + // Are the new bits contiguous when combined with the old bits? + if (BitsProperlyConcatenate(ToMask, NewToMask) && + BitsProperlyConcatenate(FromMask, NewFromMask)) + return V; + if (BitsProperlyConcatenate(NewToMask, ToMask) && + BitsProperlyConcatenate(NewFromMask, FromMask)) + return V; + + // We've seen a write to some bits, so track it. + CombinedToMask |= NewToMask; + // Keep going... + V = V.getOperand(0); + } + + return SDValue(); +} + static SDValue PerformBFICombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue N1 = N->getOperand(1); if (N1.getOpcode() == ISD::AND) { + // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff + // the bits being cleared by the AND are not demanded by the BFI. ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); if (!N11C) return SDValue(); @@ -8744,6 +9160,38 @@ static SDValue PerformBFICombine(SDNode *N, return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), N->getOperand(0), N1.getOperand(0), N->getOperand(2)); + } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { + // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. + // Keep track of any consecutive bits set that all come from the same base + // value. We can combine these together into a single BFI. + SDValue CombineBFI = FindBFIToCombineWith(N); + if (CombineBFI == SDValue()) + return SDValue(); + + // We've found a BFI. + APInt ToMask1, FromMask1; + SDValue From1 = ParseBFI(N, ToMask1, FromMask1); + + APInt ToMask2, FromMask2; + SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); + assert(From1 == From2); + (void)From2; + + // First, unlink CombineBFI. + DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); + // Then create a new BFI, combining the two together. + APInt NewFromMask = FromMask1 | FromMask2; + APInt NewToMask = ToMask1 | ToMask2; + + EVT VT = N->getValueType(0); + SDLoc dl(N); + + if (NewFromMask[0] == 0) + From1 = DCI.DAG.getNode( + ISD::SRL, dl, VT, From1, + DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); + return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, + DCI.DAG.getConstant(~NewToMask, dl, VT)); } return SDValue(); } @@ -9521,32 +9969,6 @@ static SDValue PerformSTORECombine(SDNode *N, return SDValue(); } -// isConstVecPow2 - Return true if each vector element is a power of 2, all -// elements are the same constant, C, and Log2(C) ranges from 1 to 32. -static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C) -{ - integerPart cN; - integerPart c0 = 0; - for (unsigned I = 0, E = ConstVec.getValueType().getVectorNumElements(); - I != E; I++) { - ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(ConstVec.getOperand(I)); - if (!C) - return false; - - bool isExact; - APFloat APF = C->getValueAPF(); - if (APF.convertToInteger(&cN, 64, isSigned, APFloat::rmTowardZero, &isExact) - != APFloat::opOK || !isExact) - return false; - - c0 = (I == 0) ? 
cN : c0; - if (!isPowerOf2_64(cN) || c0 != cN || Log2_64(c0) < 1 || Log2_64(c0) > 32) - return false; - } - C = c0; - return true; -} - /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) /// can replace combinations of VMUL and VCVT (floating-point to integer) /// when the VMUL has a constant operand that is a power of 2. @@ -9556,30 +9978,25 @@ static bool isConstVecPow2(SDValue ConstVec, bool isSigned, uint64_t &C) /// vcvt.s32.f32 d16, d16 /// becomes: /// vcvt.s32.f32 d16, d16, #3 -static SDValue PerformVCVTCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, +static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; - SDValue Op = N->getOperand(0); + if (!Subtarget->hasNEON()) + return SDValue(); - if (!Subtarget->hasNEON() || !Op.getValueType().isVector() || - Op.getOpcode() != ISD::FMUL) + SDValue Op = N->getOperand(0); + if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) return SDValue(); - uint64_t C; - SDValue N0 = Op->getOperand(0); SDValue ConstVec = Op->getOperand(1); - bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; - - if (ConstVec.getOpcode() != ISD::BUILD_VECTOR || - !isConstVecPow2(ConstVec, isSigned, C)) + if (!isa<BuildVectorSDNode>(ConstVec)) return SDValue(); MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t IntBits = IntTy.getSizeInBits(); unsigned NumLanes = Op.getValueType().getVectorNumElements(); - if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32 || - NumLanes > 4) { + if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { // These instructions only exist converting from f32 to i32. We can handle // smaller integers by generating an extra truncate, but larger ones would // be lossy. We also can't handle more then 4 lanes, since these intructions @@ -9587,16 +10004,22 @@ static SDValue PerformVCVTCombine(SDNode *N, return SDValue(); } + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); + if (C == -1 || C == 0 || C > 32) + return SDValue(); + SDLoc dl(N); + bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : Intrinsic::arm_neon_vcvtfp2fxu; - SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, - NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, - DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), - N0, - DAG.getConstant(Log2_64(C), dl, MVT::i32)); + SDValue FixConv = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? 
MVT::v2i32 : MVT::v4i32, + DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0), + DAG.getConstant(C, dl, MVT::i32)); - if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) + if (IntBits < FloatBits) FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv); return FixConv; @@ -9611,38 +10034,44 @@ static SDValue PerformVCVTCombine(SDNode *N, /// vdiv.f32 d16, d17, d16 /// becomes: /// vcvt.f32.s32 d16, d16, #3 -static SDValue PerformVDIVCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, +static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { - SelectionDAG &DAG = DCI.DAG; + if (!Subtarget->hasNEON()) + return SDValue(); + SDValue Op = N->getOperand(0); unsigned OpOpcode = Op.getNode()->getOpcode(); - - if (!Subtarget->hasNEON() || !N->getValueType(0).isVector() || + if (!N->getValueType(0).isVector() || (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) return SDValue(); - uint64_t C; SDValue ConstVec = N->getOperand(1); - bool isSigned = OpOpcode == ISD::SINT_TO_FP; - - if (ConstVec.getOpcode() != ISD::BUILD_VECTOR || - !isConstVecPow2(ConstVec, isSigned, C)) + if (!isa<BuildVectorSDNode>(ConstVec)) return SDValue(); MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); + uint32_t FloatBits = FloatTy.getSizeInBits(); MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); - if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) { + uint32_t IntBits = IntTy.getSizeInBits(); + unsigned NumLanes = Op.getValueType().getVectorNumElements(); + if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { // These instructions only exist converting from i32 to f32. We can handle // smaller integers by generating an extra extend, but larger ones would - // be lossy. + // be lossy. We also can't handle more then 4 lanes, since these intructions + // only support v2i32/v4i32 types. return SDValue(); } + BitVector UndefElements; + BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec); + int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); + if (C == -1 || C == 0 || C > 32) + return SDValue(); + SDLoc dl(N); + bool isSigned = OpOpcode == ISD::SINT_TO_FP; SDValue ConvInput = Op.getOperand(0); - unsigned NumLanes = Op.getValueType().getVectorNumElements(); - if (IntTy.getSizeInBits() < FloatTy.getSizeInBits()) + if (IntBits < FloatBits) ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput); @@ -9652,7 +10081,7 @@ static SDValue PerformVDIVCombine(SDNode *N, return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), - ConvInput, DAG.getConstant(Log2_64(C), dl, MVT::i32)); + ConvInput, DAG.getConstant(C, dl, MVT::i32)); } /// Getvshiftimm - Check if this is a valid build_vector for the immediate @@ -9680,7 +10109,7 @@ static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? 
Cnt-1 : Cnt) < ElementBits); @@ -9695,12 +10124,16 @@ static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); - unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); + int64_t ElementBits = VT.getVectorElementType().getSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; - if (isIntrinsic) + if (!isIntrinsic) + return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); + if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { Cnt = -Cnt; - return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); + return true; + } + return false; } /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. @@ -9939,89 +10372,123 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// PerformSELECT_CCCombine - Target-specific DAG combining for ISD::SELECT_CC -/// to match f32 max/min patterns to use NEON vmax/vmin instructions. -static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG, - const ARMSubtarget *ST) { - // If the target supports NEON, try to use vmax/vmin instructions for f32 - // selects like "x < y ? x : y". Unless the NoNaNsFPMath option is set, - // be careful about NaNs: NEON's vmax/vmin return NaN if either operand is - // a NaN; only do the transformation when it matches that behavior. - - // For now only do this when using NEON for FP operations; if using VFP, it - // is not obvious that the benefit outweighs the cost of switching to the - // NEON pipeline. - if (!ST->hasNEON() || !ST->useNEONForSinglePrecisionFP() || - N->getValueType(0) != MVT::f32) +static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero, + APInt &KnownOne) { + if (Op.getOpcode() == ARMISD::BFI) { + // Conservatively, we can recurse down the first operand + // and just mask out all affected bits. + computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne); + + // The operand to BFI is already a mask suitable for removing the bits it + // sets. + ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2)); + APInt Mask = CI->getAPIntValue(); + KnownZero &= Mask; + KnownOne &= Mask; + return; + } + if (Op.getOpcode() == ARMISD::CMOV) { + APInt KZ2(KnownZero.getBitWidth(), 0); + APInt KO2(KnownOne.getBitWidth(), 0); + computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne); + computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2); + + KnownZero &= KZ2; + KnownOne &= KO2; + return; + } + return DAG.computeKnownBits(Op, KnownZero, KnownOne); +} + +SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { + // If we have a CMOV, OR and AND combination such as: + // if (x & CN) + // y |= CM; + // + // And: + // * CN is a single bit; + // * All bits covered by CM are known zero in y + // + // Then we can convert this into a sequence of BFI instructions. This will + // always be a win if CM is a single bit, will always be no worse than the + // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is + // three bits (due to the extra IT instruction). + + SDValue Op0 = CMOV->getOperand(0); + SDValue Op1 = CMOV->getOperand(1); + auto CCNode = cast<ConstantSDNode>(CMOV->getOperand(2)); + auto CC = CCNode->getAPIntValue().getLimitedValue(); + SDValue CmpZ = CMOV->getOperand(4); + + // The compare must be against zero. 
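The comment opening PerformCMOVToBFICombine describes the source pattern it rewrites. A before/after sketch with illustrative constants (CN = 0x10, CM = 0x6000, chosen for the example, not taken from the code); the rewrite is only valid because the combine proves the CM bits of y are already zero:

#include <cstdint>

// Before: conditionally OR in CM when a single bit CN of x is set.
static uint32_t orUnderTest(uint32_t x, uint32_t y) {
  if (x & 0x10)
    y |= 0x6000;
  return y;
}

// After: the branch-free shape the combine produces; the tested bit is
// shifted down (the SRL) and inserted into each destination bit, which is
// what one BFI per set bit of CM does.
static uint32_t orUnderTestAsBFI(uint32_t x, uint32_t y) {
  uint32_t Bit = (x >> 4) & 1;
  y = (y & ~(1u << 13)) | (Bit << 13);  // BFI into bit 13
  y = (y & ~(1u << 14)) | (Bit << 14);  // BFI into bit 14
  return y;
}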
+ if (!isNullConstant(CmpZ->getOperand(1))) + return SDValue(); + + assert(CmpZ->getOpcode() == ARMISD::CMPZ); + SDValue And = CmpZ->getOperand(0); + if (And->getOpcode() != ISD::AND) + return SDValue(); + ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(And->getOperand(1)); + if (!AndC || !AndC->getAPIntValue().isPowerOf2()) return SDValue(); + SDValue X = And->getOperand(0); - SDValue CondLHS = N->getOperand(0); - SDValue CondRHS = N->getOperand(1); - SDValue LHS = N->getOperand(2); - SDValue RHS = N->getOperand(3); - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get(); - - unsigned Opcode = 0; - bool IsReversed; - if (DAG.isEqualTo(LHS, CondLHS) && DAG.isEqualTo(RHS, CondRHS)) { - IsReversed = false; // x CC y ? x : y - } else if (DAG.isEqualTo(LHS, CondRHS) && DAG.isEqualTo(RHS, CondLHS)) { - IsReversed = true ; // x CC y ? y : x + if (CC == ARMCC::EQ) { + // We're performing an "equal to zero" compare. Swap the operands so we + // canonicalize on a "not equal to zero" compare. + std::swap(Op0, Op1); } else { - return SDValue(); + assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); } + + if (Op1->getOpcode() != ISD::OR) + return SDValue(); - bool IsUnordered; - switch (CC) { - default: break; - case ISD::SETOLT: - case ISD::SETOLE: - case ISD::SETLT: - case ISD::SETLE: - case ISD::SETULT: - case ISD::SETULE: - // If LHS is NaN, an ordered comparison will be false and the result will - // be the RHS, but vmin(NaN, RHS) = NaN. Avoid this by checking that LHS - // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. - IsUnordered = (CC == ISD::SETULT || CC == ISD::SETULE); - if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) - break; - // For less-than-or-equal comparisons, "+0 <= -0" will be true but vmin - // will return -0, so vmin can only be used for unsafe math or if one of - // the operands is known to be nonzero. - if ((CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE) && - !DAG.getTarget().Options.UnsafeFPMath && - !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) - break; - Opcode = IsReversed ? ARMISD::FMAX : ARMISD::FMIN; - break; + ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1)); + if (!OrC) + return SDValue(); + SDValue Y = Op1->getOperand(0); - case ISD::SETOGT: - case ISD::SETOGE: - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETUGT: - case ISD::SETUGE: - // If LHS is NaN, an ordered comparison will be false and the result will - // be the RHS, but vmax(NaN, RHS) = NaN. Avoid this by checking that LHS - // != NaN. Likewise, for unordered comparisons, check for RHS != NaN. - IsUnordered = (CC == ISD::SETUGT || CC == ISD::SETUGE); - if (!DAG.isKnownNeverNaN(IsUnordered ? RHS : LHS)) - break; - // For greater-than-or-equal comparisons, "-0 >= +0" will be true but vmax - // will return +0, so vmax can only be used for unsafe math or if one of - // the operands is known to be nonzero. - if ((CC == ISD::SETGE || CC == ISD::SETOGE || CC == ISD::SETUGE) && - !DAG.getTarget().Options.UnsafeFPMath && - !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) - break; - Opcode = IsReversed ? ARMISD::FMIN : ARMISD::FMAX; - break; - } + if (Op0 != Y) + return SDValue(); + + // Now, is it profitable to continue? + APInt OrCI = OrC->getAPIntValue(); + unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; + if (OrCI.countPopulation() > Heuristic) + return SDValue(); - if (!Opcode) + // Lastly, can we determine that the bits defined by OrCI + // are zero in Y? 
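Two checks remain before the rewrite above is allowed: the popcount heuristic (three bits on Thumb, two on ARM, because of the extra IT instruction) and the requirement that every bit of CM is already known zero in y. A standalone model, with KnownZeroY standing for the conservative known-zero mask produced by the computeKnownBits helper earlier in this hunk:

#include <cstdint>

static bool profitableAndSafe(uint32_t CM, uint32_t KnownZeroY, bool IsThumb) {
  unsigned Heuristic = IsThumb ? 3u : 2u;
  if ((unsigned)__builtin_popcount(CM) > Heuristic)
    return false;                       // more BFIs than the TST/ORR sequence
  return (CM & KnownZeroY) == CM;       // every target bit provably zero in y
}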
+ APInt KnownZero, KnownOne; + computeKnownBits(DAG, Y, KnownZero, KnownOne); + if ((OrCI & KnownZero) != OrCI) return SDValue(); - return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS); + + // OK, we can do the combine. + SDValue V = Y; + SDLoc dl(X); + EVT VT = X.getValueType(); + unsigned BitInX = AndC->getAPIntValue().logBase2(); + + if (BitInX != 0) { + // We must shift X first. + X = DAG.getNode(ISD::SRL, dl, VT, X, + DAG.getConstant(BitInX, dl, VT)); + } + + for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); + BitInY < NumActiveBits; ++BitInY) { + if (OrCI[BitInY] == 0) + continue; + APInt Mask(VT.getSizeInBits(), 0); + Mask.setBit(BitInY); + V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, + // Confusingly, the operand is an *inverted* mask. + DAG.getConstant(~Mask, dl, VT)); + } + + return V; } /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. @@ -10042,6 +10509,13 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { ARMCC::CondCodes CC = (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue(); + // BFI is only available on V6T2+. + if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { + SDValue R = PerformCMOVToBFICombine(N, DAG); + if (R) + return R; + } + // Simplify // mov r1, r0 // cmp r1, x @@ -10108,8 +10582,10 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI, Subtarget); - case ISD::FDIV: return PerformVDIVCombine(N, DCI, Subtarget); + case ISD::FP_TO_UINT: + return PerformVCVTCombine(N, DCI.DAG, Subtarget); + case ISD::FDIV: + return PerformVDIVCombine(N, DCI.DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: case ISD::SRA: @@ -10117,7 +10593,6 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); - case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget); case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); case ISD::LOAD: return PerformLOADCombine(N, DCI); case ARMISD::VLD2DUP: @@ -11043,37 +11518,61 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { - assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only"); - unsigned Opcode = Op->getOpcode(); - assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && - "Invalid opcode for Div/Rem lowering"); - bool isSigned = (Opcode == ISD::SDIVREM); - EVT VT = Op->getValueType(0); - Type *Ty = VT.getTypeForEVT(*DAG.getContext()); - +static RTLIB::Libcall getDivRemLibcall( + const SDNode *N, MVT::SimpleValueType SVT) { + assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || + N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && + "Unhandled Opcode in getDivRemLibcall"); + bool isSigned = N->getOpcode() == ISD::SDIVREM || + N->getOpcode() == ISD::SREM; RTLIB::Libcall LC; - switch (VT.getSimpleVT().SimpleTy) { + switch (SVT) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; case MVT::i16: LC = isSigned ? 
RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; } + return LC; +} - SDValue InChain = DAG.getEntryNode(); - +static TargetLowering::ArgListTy getDivRemArgList( + const SDNode *N, LLVMContext *Context) { + assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || + N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && + "Unhandled Opcode in getDivRemArgList"); + bool isSigned = N->getOpcode() == ISD::SDIVREM || + N->getOpcode() == ISD::SREM; TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; - for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { - EVT ArgVT = Op->getOperand(i).getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - Entry.Node = Op->getOperand(i); + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + EVT ArgVT = N->getOperand(i).getValueType(); + Type *ArgTy = ArgVT.getTypeForEVT(*Context); + Entry.Node = N->getOperand(i); Entry.Ty = ArgTy; Entry.isSExt = isSigned; Entry.isZExt = !isSigned; Args.push_back(Entry); } + return Args; +} + +SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { + assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid()) && + "Register-based DivRem lowering only"); + unsigned Opcode = Op->getOpcode(); + assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && + "Invalid opcode for Div/Rem lowering"); + bool isSigned = (Opcode == ISD::SDIVREM); + EVT VT = Op->getValueType(0); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + + RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), + VT.getSimpleVT().SimpleTy); + SDValue InChain = DAG.getEntryNode(); + + TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), + DAG.getContext()); SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); @@ -11090,6 +11589,47 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { return CallInfo.first; } +// Lowers REM using divmod helpers +// see RTABI section 4.2/4.3 +SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { + // Build return types (div and rem) + std::vector<Type*> RetTyParams; + Type *RetTyElement; + + switch (N->getValueType(0).getSimpleVT().SimpleTy) { + default: llvm_unreachable("Unexpected request for libcall!"); + case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; + case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; + case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; + case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; + } + + RetTyParams.push_back(RetTyElement); + RetTyParams.push_back(RetTyElement); + ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams); + Type *RetTy = StructType::get(*DAG.getContext(), ret); + + RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). 
+ SimpleTy); + SDValue InChain = DAG.getEntryNode(); + TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext()); + bool isSigned = N->getOpcode() == ISD::SREM; + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), + getPointerTy(DAG.getDataLayout())); + + // Lower call + CallLoweringInfo CLI(DAG); + CLI.setChain(InChain) + .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args), 0) + .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + + // Return second (rem) result operand (first contains div) + SDNode *ResNode = CallResult.first.getNode(); + assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); + return ResNode->getOperand(1); +} + SDValue ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "unsupported target platform"); @@ -11124,8 +11664,8 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; } SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { @@ -11137,8 +11677,8 @@ SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); SDValue SrcVal = Op.getOperand(0); - return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, - /*isSigned*/ false, SDLoc(Op)).first; + return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, + SDLoc(Op)).first; } bool @@ -11186,7 +11726,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); - uint64_t NumElts = DL.getTypeAllocSize(I.getType()) / 8; + uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; @@ -11212,7 +11752,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; - NumElts += DL.getTypeAllocSize(ArgTy) / 8; + NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); @@ -11295,8 +11835,6 @@ bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } -bool ARMTargetLowering::hasLoadLinkedStoreConditional() const { return true; } - Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -11392,19 +11930,26 @@ bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. 
A15 has that // guarantee, see DDI0406C ARM architecture reference manual, // sections A8.8.72-74 LDRD) -bool ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); - return (Size == 64) && !Subtarget->isMClass(); + return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly + : AtomicExpansionKind::None; } // For the real atomic operations, we have ldrex/strex up to 32 bits, // and up to 64 bits on the non-M profiles -TargetLoweringBase::AtomicRMWExpansionKind +TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); return (Size <= (Subtarget->isMClass() ? 32U : 64U)) - ? AtomicRMWExpansionKind::LLSC - : AtomicRMWExpansionKind::None; + ? AtomicExpansionKind::LLSC + : AtomicExpansionKind::None; +} + +bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR( + AtomicCmpXchgInst *AI) const { + return true; } // This has so far only been implemented for MachO. @@ -11419,7 +11964,7 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; // Floating point values and vector values map to the same register file. - // Therefore, althought we could do a store extract of a vector type, this is + // Therefore, although we could do a store extract of a vector type, this is // better to leave at float as we have more freedom in the addressing mode for // those. if (VectorTy->isFPOrFPVectorTy()) @@ -11441,6 +11986,14 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, return false; } +bool ARMTargetLowering::isCheapToSpeculateCttz() const { + return Subtarget->hasV6T2Ops(); +} + +bool ARMTargetLowering::isCheapToSpeculateCtlz() const { + return Subtarget->hasV6T2Ops(); +} + Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); @@ -11477,6 +12030,14 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, cast<PointerType>(Addr->getType())->getElementType()); } +void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( + IRBuilder<> &Builder) const { + if (!Subtarget->hasV7Ops()) + return; + Module *M = Builder.GetInsertBlock()->getParent()->getParent(); + Builder.CreateCall(llvm::Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); +} + Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { @@ -11534,12 +12095,12 @@ bool ARMTargetLowering::lowerInterleavedLoad( Type *EltTy = VecTy->getVectorElementType(); const DataLayout &DL = LI->getModule()->getDataLayout(); - unsigned VecSize = DL.getTypeAllocSizeInBits(VecTy); - bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64; + unsigned VecSize = DL.getTypeSizeInBits(VecTy); + bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; - // Skip illegal vector types and vector types of i64/f64 element (vldN doesn't - // support i64/f64 element). - if ((VecSize != 64 && VecSize != 128) || EltIs64Bits) + // Skip if we do not have NEON and skip illegal vector types and vector types + // with i64/f64 elements (vldN doesn't support i64/f64 elements). 
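A source-level illustration of what the interleaved-load path above targets: a factor-2 interleaved f32 load becomes a single vld2 rather than a wide load feeding two shufflevectors. Intrinsics sketch only; it needs a NEON-enabled ARM compile:

#include <arm_neon.h>

static void deinterleave2xf32(const float *Buf, float32x4_t *Even,
                              float32x4_t *Odd) {
  float32x4x2_t V = vld2q_f32(Buf);  // lanes 0,2,4,6 -> val[0]; 1,3,5,7 -> val[1]
  *Even = V.val[0];
  *Odd  = V.val[1];
}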
+ if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits) return false; // A pointer vector can not be the return type of the ldN intrinsics. Need to @@ -11552,9 +12113,6 @@ bool ARMTargetLowering::lowerInterleavedLoad( Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld4}; - Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], VecTy); - IRBuilder<> Builder(LI); SmallVector<Value *, 2> Ops; @@ -11562,6 +12120,9 @@ bool ARMTargetLowering::lowerInterleavedLoad( Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr)); Ops.push_back(Builder.getInt32(LI->getAlignment())); + Type *Tys[] = { VecTy, Int8Ptr }; + Function *VldnFunc = + Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); // Replace uses of each shufflevector with the corresponding vector loaded @@ -11624,12 +12185,13 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, VectorType *SubVecTy = VectorType::get(EltTy, NumSubElts); const DataLayout &DL = SI->getModule()->getDataLayout(); - unsigned SubVecSize = DL.getTypeAllocSizeInBits(SubVecTy); - bool EltIs64Bits = DL.getTypeAllocSizeInBits(EltTy) == 64; + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); + bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64; - // Skip illegal sub vector types and vector types of i64/f64 element (vstN - // doesn't support i64/f64 element). - if ((SubVecSize != 64 && SubVecSize != 128) || EltIs64Bits) + // Skip if we do not have NEON and skip illegal vector types and vector types + // with i64/f64 elements (vstN doesn't support i64/f64 elements). + if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) || + EltIs64Bits) return false; Value *Op0 = SVI->getOperand(0); @@ -11650,17 +12212,18 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, SubVecTy = VectorType::get(IntTy, NumSubElts); } - static Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, - Intrinsic::arm_neon_vst3, - Intrinsic::arm_neon_vst4}; - Function *VstNFunc = Intrinsic::getDeclaration( - SI->getModule(), StoreInts[Factor - 2], SubVecTy); - + static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, + Intrinsic::arm_neon_vst3, + Intrinsic::arm_neon_vst4}; SmallVector<Value *, 6> Ops; Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr)); + Type *Tys[] = { Int8Ptr, SubVecTy }; + Function *VstNFunc = Intrinsic::getDeclaration( + SI->getModule(), StoreInts[Factor - 2], Tys); + // Split the shufflevector operands into sub vectors for the new vstN call. 
for (unsigned i = 0; i < Factor; i++) Ops.push_back(Builder.CreateShuffleVector( @@ -11681,14 +12244,14 @@ enum HABaseType { static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members) { - if (const StructType *ST = dyn_cast<StructType>(Ty)) { + if (auto *ST = dyn_cast<StructType>(Ty)) { for (unsigned i = 0; i < ST->getNumElements(); ++i) { uint64_t SubMembers = 0; if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) return false; Members += SubMembers; } - } else if (const ArrayType *AT = dyn_cast<ArrayType>(Ty)) { + } else if (auto *AT = dyn_cast<ArrayType>(Ty)) { uint64_t SubMembers = 0; if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) return false; @@ -11703,7 +12266,7 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, return false; Members = 1; Base = HA_DOUBLE; - } else if (const VectorType *VT = dyn_cast<VectorType>(Ty)) { + } else if (auto *VT = dyn_cast<VectorType>(Ty)) { Members = 1; switch (Base) { case HA_FLOAT: @@ -11747,3 +12310,17 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); return IsHA || IsIntArray; } + +unsigned ARMTargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + // Platforms which do not use SjLj EH may return values in these registers + // via the personality function. + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; +} + +unsigned ARMTargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + // Platforms which do not use SjLj EH may return values in these registers + // via the personality function. + return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; +} diff --git a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h index efc9020..b764624 100644 --- a/contrib/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/contrib/llvm/lib/Target/ARM/ARMISelLowering.h @@ -63,8 +63,6 @@ namespace llvm { BCC_i64, - RBIT, // ARM bitreverse instruction - SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out. SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out. RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag. @@ -79,6 +77,7 @@ namespace llvm { EH_SJLJ_SETJMP, // SjLj exception handling setjmp. EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. + EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch. TC_RETURN, // Tail call return pseudo. @@ -91,6 +90,7 @@ namespace llvm { PRELOAD, // Preload WIN__CHKSTK, // Windows' __chkstk call to do stack probing. + WIN__DBZCHK, // Windows' divide by zero check VCEQ, // Vector compare equal. VCEQZ, // Vector compare equal to zero. @@ -172,12 +172,6 @@ namespace llvm { // BUILD_VECTOR for this purpose. BUILD_VECTOR, - // Floating-point max and min: - FMAX, - FMIN, - VMAXNM, - VMINNM, - // Bit-field insert BFI, @@ -189,6 +183,10 @@ namespace llvm { // Vector bitwise select VBSL, + // Pseudo-instruction representing a memory copy using ldm/stm + // instructions. 
+ MEMCPY, + // Vector load N-element structure to all lanes: VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD3DUP, @@ -260,6 +258,7 @@ namespace llvm { SDNode *Node) const override; SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const; + SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override; @@ -348,6 +347,8 @@ namespace llvm { getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "Q") return InlineAsm::Constraint_Q; + else if (ConstraintCode == "o") + return InlineAsm::Constraint_o; else if (ConstraintCode.size() == 2) { if (ConstraintCode[0] == 'U') { switch(ConstraintCode[1]) { @@ -420,13 +421,24 @@ namespace llvm { bool functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; - bool hasLoadLinkedStoreConditional() const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; + void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; + Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, bool IsStore, bool IsLoad) const override; Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, @@ -441,16 +453,21 @@ namespace llvm { bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - TargetLoweringBase::AtomicRMWExpansionKind + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; bool useLoadStackGuardNode() const override; bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override; + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + protected: std::pair<const TargetRegisterClass *, uint8_t> findRepresentativeClass(const TargetRegisterInfo *TRI, @@ -496,6 +513,7 @@ namespace llvm { ISD::ArgFlagsTy Flags) const; SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; @@ -508,7 +526,6 @@ namespace llvm { SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG, TLSModel::Model model) const; - SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, 
SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -526,6 +543,12 @@ namespace llvm { const ARMSubtarget *ST) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; + void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, + SmallVectorImpl<SDValue> &Results) const; + SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, + SDValue &Chain) const; + SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; @@ -635,6 +658,8 @@ namespace llvm { MachineBasicBlock *EmitLowered__chkstk(MachineInstr *MI, MachineBasicBlock *MBB) const; + MachineBasicBlock *EmitLowered__dbzchk(MachineInstr *MI, + MachineBasicBlock *MBB) const; }; enum NEONModImmType { diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp index 84f95be..cf973d6 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.cpp @@ -51,7 +51,8 @@ void ARMInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const { switch (Opc) { - default: break; + default: + break; case ARM::LDR_PRE_IMM: case ARM::LDR_PRE_REG: case ARM::LDR_POST_IMM: @@ -124,82 +125,10 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI, .addGlobalAddress(GV, 0, ARMII::MO_NONLAZY); unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( - MachinePointerInfo::getGOT(), Flag, 4, 4); + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 4, 4); MIB.addMemOperand(MMO); MIB = BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg); MIB.addReg(Reg, RegState::Kill).addImm(0); MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); AddDefaultPred(MIB); } - -namespace { - /// ARMCGBR - Create Global Base Reg pass. This initializes the PIC - /// global base register for ARM ELF. - struct ARMCGBR : public MachineFunctionPass { - static char ID; - ARMCGBR() : MachineFunctionPass(ID) {} - - bool runOnMachineFunction(MachineFunction &MF) override { - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - if (AFI->getGlobalBaseReg() == 0) - return false; - const ARMSubtarget &STI = - static_cast<const ARMSubtarget &>(MF.getSubtarget()); - // Don't do this for Thumb1. - if (STI.isThumb1Only()) - return false; - - const TargetMachine &TM = MF.getTarget(); - if (TM.getRelocationModel() != Reloc::PIC_) - return false; - - LLVMContext *Context = &MF.getFunction()->getContext(); - unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - unsigned PCAdj = STI.isThumb() ? 
4 : 8; - ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create( - *Context, "_GLOBAL_OFFSET_TABLE_", ARMPCLabelIndex, PCAdj); - - unsigned Align = TM.getDataLayout()->getPrefTypeAlignment( - Type::getInt32PtrTy(*Context)); - unsigned Idx = MF.getConstantPool()->getConstantPoolIndex(CPV, Align); - - MachineBasicBlock &FirstMBB = MF.front(); - MachineBasicBlock::iterator MBBI = FirstMBB.begin(); - DebugLoc DL = FirstMBB.findDebugLoc(MBBI); - unsigned TempReg = - MF.getRegInfo().createVirtualRegister(&ARM::rGPRRegClass); - unsigned Opc = STI.isThumb2() ? ARM::t2LDRpci : ARM::LDRcp; - const TargetInstrInfo &TII = *STI.getInstrInfo(); - MachineInstrBuilder MIB = BuildMI(FirstMBB, MBBI, DL, - TII.get(Opc), TempReg) - .addConstantPoolIndex(Idx); - if (Opc == ARM::LDRcp) - MIB.addImm(0); - AddDefaultPred(MIB); - - // Fix the GOT address by adding pc. - unsigned GlobalBaseReg = AFI->getGlobalBaseReg(); - Opc = STI.isThumb2() ? ARM::tPICADD : ARM::PICADD; - MIB = BuildMI(FirstMBB, MBBI, DL, TII.get(Opc), GlobalBaseReg) - .addReg(TempReg) - .addImm(ARMPCLabelIndex); - if (Opc == ARM::PICADD) - AddDefaultPred(MIB); - - return true; - } - - const char *getPassName() const override { - return "ARM PIC Global Base Reg Initialization"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - }; -} - -char ARMCGBR::ID = 0; -FunctionPass* -llvm::createARMGlobalBaseRegPass() { return new ARMCGBR(); } diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td index 9f5bde3..b9de83b 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -59,6 +59,7 @@ def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>; def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>; +def SDT_ARMEH_SJLJ_SetupDispatch: SDTypeProfile<0, 0, []>; def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>; @@ -70,8 +71,11 @@ def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>; def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>; -def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; -def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>; +def SDT_WIN__DBZCHK : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; + +def SDT_ARMMEMCPY : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, + SDTCisVT<2, i32>, SDTCisVT<3, i32>, + SDTCisVT<4, i32>]>; def SDTBinaryArithWithFlags : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, @@ -163,21 +167,23 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP", SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain, SDNPSideEffect]>; +def ARMeh_sjlj_setup_dispatch: SDNode<"ARMISD::EH_SJLJ_SETUP_DISPATCH", + SDT_ARMEH_SJLJ_SetupDispatch, + [SDNPHasChain, SDNPSideEffect]>; def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER, [SDNPHasChain, SDNPSideEffect]>; def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH, [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>; -def ARMrbit : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>; - def ARMtcret : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def ARMbfi : SDNode<"ARMISD::BFI", SDT_ARMBFI>; -def ARMvmaxnm : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>; 
-def ARMvminnm : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>; +def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY, + [SDNPHasChain, SDNPInGlue, SDNPOutGlue, + SDNPMayStore, SDNPMayLoad]>; //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. @@ -209,6 +215,8 @@ def PreV8 : Predicate<"!Subtarget->hasV8Ops()">, AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">; def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">, AssemblerPredicate<"HasV8_1aOps", "armv8.1a">; +def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">, + AssemblerPredicate<"HasV8_2aOps", "armv8.2a">; def NoVFP : Predicate<"!Subtarget->hasVFP2()">; def HasVFP2 : Predicate<"Subtarget->hasVFP2()">, AssemblerPredicate<"FeatureVFP2", "VFP2">; @@ -228,7 +236,9 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, - AssemblerPredicate<"FeatureFP16","half-float">; + AssemblerPredicate<"FeatureFP16","half-float conversions">; +def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, + AssemblerPredicate<"FeatureFullFP16","full half-float">; def HasDivide : Predicate<"Subtarget->hasDivide()">, AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">; def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, @@ -236,9 +246,8 @@ def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">, AssemblerPredicate<"FeatureT2XtPk", "pack/extract">; -def HasThumb2DSP : Predicate<"Subtarget->hasThumb2DSP()">, - AssemblerPredicate<"FeatureDSPThumb2", - "thumb2-dsp">; +def HasDSP : Predicate<"Subtarget->hasDSP()">, + AssemblerPredicate<"FeatureDSP", "dsp">; def HasDB : Predicate<"Subtarget->hasDataBarrier()">, AssemblerPredicate<"FeatureDB", "data-barriers">; @@ -2322,6 +2331,7 @@ def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", let Inst{23-4} = 0b01100000000000000111; let Inst{3-0} = opt; } +def : MnemonicAlias<"smi", "smc">; // Supervisor Call (Software Interrupt) let isCall = 1, Uses = [SP] in { @@ -3671,10 +3681,10 @@ def USAT16 : AI<(outs GPRnopc:$Rd), let Inst{3-0} = Rn; } -def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm:$pos), - (SSAT imm:$pos, GPRnopc:$a, 0)>; -def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm:$pos), - (USAT imm:$pos, GPRnopc:$a, 0)>; +def : ARMV6Pat<(int_arm_ssat GPRnopc:$a, imm1_32:$pos), + (SSAT imm1_32:$pos, GPRnopc:$a, 0)>; +def : ARMV6Pat<(int_arm_usat GPRnopc:$a, imm0_31:$pos), + (USAT imm0_31:$pos, GPRnopc:$a, 0)>; //===----------------------------------------------------------------------===// // Bitwise Instructions. @@ -4186,7 +4196,7 @@ def CLZ : AMiscA1I<0b00010110, 0b0001, (outs GPR:$Rd), (ins GPR:$Rm), def RBIT : AMiscA1I<0b01101111, 0b0011, (outs GPR:$Rd), (ins GPR:$Rm), IIC_iUNAr, "rbit", "\t$Rd, $Rm", - [(set GPR:$Rd, (ARMrbit GPR:$Rm))]>, + [(set GPR:$Rd, (bitreverse GPR:$Rm))]>, Requires<[IsARM, HasV6T2]>, Sched<[WriteALU]>; @@ -4578,6 +4588,19 @@ let usesCustomInserter = 1 in { [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>; } +let hasPostISelHook = 1, Constraints = "$newdst = $dst, $newsrc = $src" in { + // %newsrc, %newdst = MEMCPY %dst, %src, N, ...N scratch regs... + // Copies N registers worth of memory from address %src to address %dst + // and returns the incremented addresses. N scratch register will + // be attached for the copy to use. 
+ def MEMCPY : PseudoInst< + (outs GPR:$newdst, GPR:$newsrc), + (ins GPR:$dst, GPR:$src, i32imm:$nreg, variable_ops), + NoItinerary, + [(set GPR:$newdst, GPR:$newsrc, + (ARMmemcopy GPR:$dst, GPR:$src, imm:$nreg))]>; +} + def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; }]>; @@ -4705,7 +4728,7 @@ def STLEXD : AIstlex<0b01, (outs GPR:$Rd), def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", [(int_arm_clrex)]>, - Requires<[IsARM, HasV6]> { + Requires<[IsARM, HasV6K]> { let Inst{31-0} = 0b11110101011111111111000000011111; } @@ -5242,6 +5265,12 @@ def win__chkstk : SDNode<"ARMISD::WIN__CHKSTK", SDTNone, let usesCustomInserter = 1, Uses = [R4], Defs = [R4, SP] in def WIN__CHKSTK : PseudoInst<(outs), (ins), NoItinerary, [(win__chkstk)]>; +def win__dbzchk : SDNode<"ARMISD::WIN__DBZCHK", SDT_WIN__DBZCHK, + [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; +let usesCustomInserter = 1, Defs = [CPSR] in + def WIN__DBZCHK : PseudoInst<(outs), (ins GPR:$divisor), NoItinerary, + [(win__dbzchk GPR:$divisor)]>; + //===----------------------------------------------------------------------===// // TLS Instructions // @@ -5301,6 +5330,10 @@ def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch), Requires<[IsARM]>; } +let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1 in +def Int_eh_sjlj_setup_dispatch : PseudoInst<(outs), (ins), NoItinerary, + [(ARMeh_sjlj_setup_dispatch)]>; + // eh.sjlj.dispatchsetup pseudo-instruction. // This pseudo is used for both ARM and Thumb. Any differences are handled when // the pseudo is expanded (which happens before any passes that need the @@ -5622,16 +5655,16 @@ def : ARMInstAlias<"mvn${s}${p} $Rd, $imm", (MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>; // Same for AND <--> BIC def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm", - (ANDri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm, + (ANDri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"bic${s}${p} $Rdn, $imm", - (ANDri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm, + (ANDri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm", - (BICri rGPR:$Rd, rGPR:$Rn, mod_imm_not:$imm, + (BICri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; def : ARMInstAlias<"and${s}${p} $Rdn, $imm", - (BICri rGPR:$Rdn, rGPR:$Rdn, mod_imm_not:$imm, + (BICri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>; // Likewise, "add Rd, mod_imm_neg" -> sub diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td index f035d61..7020ffb 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -587,11 +587,6 @@ def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>; def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>; -def SDTARMFMAX : SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisSameAs<0, 1>, - SDTCisSameAs<0, 2>]>; -def NEONfmax : SDNode<"ARMISD::FMAX", SDTARMFMAX>; -def NEONfmin : SDNode<"ARMISD::FMIN", SDTARMFMAX>; - def NEONimmAllZerosV: PatLeaf<(NEONvmovImm (i32 timm)), [{ ConstantSDNode *ConstVal = cast<ConstantSDNode>(N->getOperand(0)); unsigned EltBits = 0; @@ -2465,17 +2460,17 @@ class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; // Same as above, but not predicated. 
-class N2VDIntnp<bits<2> op17_16, bits<3> op10_8, bit op7, +class N2VDIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> - : N2Vnp<0b10, op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm), + : N2Vnp<op19_18, op17_16, op10_8, op7, 0, (outs DPR:$Vd), (ins DPR:$Vm), itin, OpcodeStr, Dt, [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>; -class N2VQIntnp<bits<2> op17_16, bits<3> op10_8, bit op7, +class N2VQIntnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp> - : N2Vnp<0b10, op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm), + : N2Vnp<op19_18, op17_16, op10_8, op7, 1, (outs QPR:$Vd), (ins QPR:$Vm), itin, OpcodeStr, Dt, [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>; @@ -3255,6 +3250,13 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, [(set DPR:$Vd, (v2i32 (OpNode (v2f32 DPR:$Vm))))]> { let Inst{10} = 1; // overwrite F = 1 } + def v4f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 0, op4, + (outs DPR:$Vd), (ins DPR:$Vm), NoItinerary, + opc, "f16", asm, "", + [(set DPR:$Vd, (v4i16 (OpNode (v4f16 DPR:$Vm))))]>, + Requires<[HasNEON,HasFullFP16]> { + let Inst{10} = 1; // overwrite F = 1 + } // 128-bit vector types. def v16i8 : N2V<op24_23, op21_20, 0b00, op17_16, op11_7, 1, op4, @@ -3275,6 +3277,13 @@ multiclass N2V_QHS_cmp<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16, [(set QPR:$Vd, (v4i32 (OpNode (v4f32 QPR:$Vm))))]> { let Inst{10} = 1; // overwrite F = 1 } + def v8f16 : N2V<op24_23, op21_20, 0b01, op17_16, op11_7, 1, op4, + (outs QPR:$Vd), (ins QPR:$Vm), NoItinerary, + opc, "f16", asm, "", + [(set QPR:$Vd, (v8i16 (OpNode (v8f16 QPR:$Vm))))]>, + Requires<[HasNEON,HasFullFP16]> { + let Inst{10} = 1; // overwrite F = 1 + } } @@ -4110,6 +4119,12 @@ def VADDfd : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32", v2f32, v2f32, fadd, 1>; def VADDfq : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32", v4f32, v4f32, fadd, 1>; +def VADDhd : N3VD<0, 0, 0b01, 0b1101, 0, IIC_VBIND, "vadd", "f16", + v4f16, v4f16, fadd, 1>, + Requires<[HasNEON,HasFullFP16]>; +def VADDhq : N3VQ<0, 0, 0b01, 0b1101, 0, IIC_VBINQ, "vadd", "f16", + v8f16, v8f16, fadd, 1>, + Requires<[HasNEON,HasFullFP16]>; // VADDL : Vector Add Long (Q = D + D) defm VADDLs : N3VLExt_QHS<0,1,0b0000,0, IIC_VSHLiD, IIC_VSHLiD, "vaddl", "s", add, sext, 1>; @@ -4165,10 +4180,21 @@ def VMULfd : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VFMULD, "vmul", "f32", v2f32, v2f32, fmul, 1>; def VMULfq : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VFMULQ, "vmul", "f32", v4f32, v4f32, fmul, 1>; +def VMULhd : N3VD<1, 0, 0b01, 0b1101, 1, IIC_VFMULD, "vmul", "f16", + v4f16, v4f16, fmul, 1>, + Requires<[HasNEON,HasFullFP16]>; +def VMULhq : N3VQ<1, 0, 0b01, 0b1101, 1, IIC_VFMULQ, "vmul", "f16", + v8f16, v8f16, fmul, 1>, + Requires<[HasNEON,HasFullFP16]>; defm VMULsl : N3VSL_HS<0b1000, "vmul", mul>; def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>; def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32, v2f32, fmul>; +def VMULslhd : N3VDSL16<0b01, 0b1001, "vmul", "f16", v4f16, fmul>, + Requires<[HasNEON,HasFullFP16]>; +def VMULslhq : N3VQSL16<0b01, 0b1001, "vmul", "f16", v8f16, + v4f16, fmul>, + Requires<[HasNEON,HasFullFP16]>; def : Pat<(v8i16 (mul (v8i16 QPR:$src1), (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))), @@ -4277,6 +4303,12 @@ def VMLAfd : 
N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32", v4f32, fmul_su, fadd_mlx>, Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; +def VMLAhd : N3VDMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACD, "vmla", "f16", + v4f16, fmul_su, fadd_mlx>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; +def VMLAhq : N3VQMulOp<0, 0, 0b01, 0b1101, 1, IIC_VMACQ, "vmla", "f16", + v8f16, fmul_su, fadd_mlx>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>; def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", @@ -4285,6 +4317,12 @@ def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32", def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32", v4f32, v2f32, fmul_su, fadd_mlx>, Requires<[HasNEON, UseFPVMLx]>; +def VMLAslhd : N3VDMulOpSL16<0b01, 0b0001, IIC_VMACD, "vmla", "f16", + v4f16, fmul, fadd>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; +def VMLAslhq : N3VQMulOpSL16<0b01, 0b0001, IIC_VMACQ, "vmla", "f16", + v8f16, v4f16, fmul, fadd>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; def : Pat<(v8i16 (add (v8i16 QPR:$src1), (mul (v8i16 QPR:$src2), @@ -4495,6 +4533,12 @@ def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32", v4f32, fmul_su, fsub_mlx>, Requires<[HasNEON, UseFPVMLx, DontUseFusedMAC]>; +def VMLShd : N3VDMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACD, "vmls", "f16", + v4f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; +def VMLShq : N3VQMulOp<0, 0, 0b11, 0b1101, 1, IIC_VMACQ, "vmls", "f16", + v8f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx, DontUseFusedMAC]>; defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>; def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", @@ -4503,6 +4547,12 @@ def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32", def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32", v4f32, v2f32, fmul_su, fsub_mlx>, Requires<[HasNEON, UseFPVMLx]>; +def VMLSslhd : N3VDMulOpSL16<0b01, 0b0101, IIC_VMACD, "vmls", "f16", + v4f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; +def VMLSslhq : N3VQMulOpSL16<0b01, 0b0101, IIC_VMACQ, "vmls", "f16", + v8f16, v4f16, fmul, fsub>, + Requires<[HasNEON, HasFullFP16, UseFPVMLx]>; def : Pat<(v8i16 (sub (v8i16 QPR:$src1), (mul (v8i16 QPR:$src2), @@ -4570,6 +4620,13 @@ def VFMAfd : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32", def VFMAfq : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32", v4f32, fmul_su, fadd_mlx>, Requires<[HasNEON,HasVFP4,UseFusedMAC]>; +def VFMAhd : N3VDMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACD, "vfma", "f16", + v4f16, fmul, fadd>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; + +def VFMAhq : N3VQMulOp<0, 0, 0b01, 0b1100, 1, IIC_VFMACQ, "vfma", "f16", + v8f16, fmul, fadd>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; // Fused Vector Multiply Subtract (floating-point) def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32", @@ -4578,6 +4635,12 @@ def VFMSfd : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32", def VFMSfq : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32", v4f32, fmul_su, fsub_mlx>, Requires<[HasNEON,HasVFP4,UseFusedMAC]>; +def VFMShd : N3VDMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACD, "vfms", 
"f16", + v4f16, fmul, fsub>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; +def VFMShq : N3VQMulOp<0, 0, 0b11, 0b1100, 1, IIC_VFMACQ, "vfms", "f16", + v8f16, fmul, fsub>, + Requires<[HasNEON,HasFullFP16,UseFusedMAC]>; // Match @llvm.fma.* intrinsics def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)), @@ -4602,6 +4665,12 @@ def VSUBfd : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32", v2f32, v2f32, fsub, 0>; def VSUBfq : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32", v4f32, v4f32, fsub, 0>; +def VSUBhd : N3VD<0, 0, 0b11, 0b1101, 0, IIC_VBIND, "vsub", "f16", + v4f16, v4f16, fsub, 0>, + Requires<[HasNEON,HasFullFP16]>; +def VSUBhq : N3VQ<0, 0, 0b11, 0b1101, 0, IIC_VBINQ, "vsub", "f16", + v8f16, v8f16, fsub, 0>, + Requires<[HasNEON,HasFullFP16]>; // VSUBL : Vector Subtract Long (Q = D - D) defm VSUBLs : N3VLExt_QHS<0,1,0b0010,0, IIC_VSHLiD, IIC_VSHLiD, "vsubl", "s", sub, sext, 0>; @@ -4646,6 +4715,12 @@ def VCEQfd : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, NEONvceq, 1>; def VCEQfq : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, NEONvceq, 1>; +def VCEQhd : N3VD<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16, + NEONvceq, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VCEQhq : N3VQ<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16, + NEONvceq, 1>, + Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", @@ -4660,6 +4735,12 @@ def VCGEfd : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, NEONvcge, 0>; def VCGEfq : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, NEONvcge, 0>; +def VCGEhd : N3VD<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16, + NEONvcge, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VCGEhq : N3VQ<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16, + NEONvcge, 0>, + Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s", @@ -4677,6 +4758,12 @@ def VCGTfd : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, NEONvcgt, 0>; def VCGTfq : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, NEONvcgt, 0>; +def VCGThd : N3VD<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16, + NEONvcgt, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VCGThq : N3VQ<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16, + NEONvcgt, 0>, + Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s", @@ -4686,36 +4773,68 @@ defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", } // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) -def VACGEd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", +def VACGEfd : N3VDInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", "f32", v2i32, v2f32, int_arm_neon_vacge, 0>; -def VACGEq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", +def VACGEfq : N3VQInt<1, 0, 0b00, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", "f32", v4i32, v4f32, int_arm_neon_vacge, 0>; +def VACGEhd : N3VDInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacge", + "f16", v4i16, v4f16, int_arm_neon_vacge, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VACGEhq : N3VQInt<1, 0, 0b01, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacge", + "f16", v8i16, v8f16, int_arm_neon_vacge, 0>, + Requires<[HasNEON, HasFullFP16]>; // VACGT : Vector Absolute 
Compare Greater Than (aka VCAGT) -def VACGTd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", +def VACGTfd : N3VDInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", "f32", v2i32, v2f32, int_arm_neon_vacgt, 0>; -def VACGTq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", +def VACGTfq : N3VQInt<1, 0, 0b10, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", "f32", v4i32, v4f32, int_arm_neon_vacgt, 0>; +def VACGThd : N3VDInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBIND, "vacgt", + "f16", v4i16, v4f16, int_arm_neon_vacgt, 0>, + Requires<[HasNEON, HasFullFP16]>; +def VACGThq : N3VQInt<1, 0, 0b11, 0b1110, 1, N3RegFrm, IIC_VBINQ, "vacgt", + "f16", v8f16, v8f16, int_arm_neon_vacgt, 0>, + Requires<[HasNEON, HasFullFP16]>; // VTST : Vector Test Bits defm VTST : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, "vtst", "", NEONvtst, 1>; def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm", - (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; + (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vn, $Vm", - (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; + (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm", - (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; + (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vn, $Vm", - (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; + (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in { +def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm", + (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; +def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vn, $Vm", + (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm", + (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vn, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vn, $Vm", + (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vn, pred:$p)>; +} def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm", - (VACGTd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; + (VACGTfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; def: NEONInstAlias<"vaclt${p}.f32 $Vd, $Vm", - (VACGTq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; + (VACGTfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm", - (VACGEd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; + (VACGEfd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; def: NEONInstAlias<"vacle${p}.f32 $Vd, $Vm", - (VACGEq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; + (VACGEfq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in { +def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm", + (VACGThd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; +def: NEONInstAlias<"vaclt${p}.f16 $Vd, $Vm", + (VACGThq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm", + (VACGEhd DPR:$Vd, DPR:$Vm, DPR:$Vd, pred:$p)>; +def: NEONInstAlias<"vacle${p}.f16 $Vd, $Vm", + (VACGEhq QPR:$Vd, QPR:$Vm, QPR:$Vd, pred:$p)>; +} // Vector Bitwise Operations. 
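For context on the VACGE/VACGT changes just above: these instructions compare operand magnitudes lane by lane, and the vacle/vaclt mnemonics are pure assembler aliases that reuse the same encodings with the source operands swapped. A minimal scalar sketch of the per-lane semantics, in C++ with ad hoc helper names, illustrative only and not part of the patch (f32 lane shown; the new f16 variants behave the same with 16-bit masks):

    #include <cmath>
    #include <cstdint>

    // VACGT per f32 lane: all-ones mask when |a| > |b|, otherwise zero.
    // A NaN in either operand compares false and yields a zero mask.
    static uint32_t vacgt_lane(float a, float b) {
      return std::fabs(a) > std::fabs(b) ? 0xFFFFFFFFu : 0u;
    }

    // VACGE per lane: |a| >= |b|.
    static uint32_t vacge_lane(float a, float b) {
      return std::fabs(a) >= std::fabs(b) ? 0xFFFFFFFFu : 0u;
    }

    // vaclt/vacle are aliases only: |a| < |b| is exactly |b| > |a|, so the
    // assembler swaps the operands and emits VACGT/VACGE, as the
    // NEONInstAlias definitions above do.
    static uint32_t vaclt_lane(float a, float b) { return vacgt_lane(b, a); }
    static uint32_t vacle_lane(float a, float b) { return vacge_lane(b, a); }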
@@ -5007,6 +5126,12 @@ def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBIND, "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 1>; def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, N3RegFrm, IIC_VBINQ, "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 1>; +def VABDhd : N3VDInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBIND, + "vabd", "f16", v4f16, v4f16, int_arm_neon_vabds, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VABDhq : N3VQInt<1, 0, 0b11, 0b1101, 0, N3RegFrm, IIC_VBINQ, + "vabd", "f16", v8f16, v8f16, int_arm_neon_vabds, 1>, + Requires<[HasNEON, HasFullFP16]>; // VABDL : Vector Absolute Difference Long (Q = | D - D |) defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, @@ -5014,6 +5139,29 @@ defm VABDLs : N3VLIntExt_QHS<0,1,0b0111,0, IIC_VSUBi4Q, defm VABDLu : N3VLIntExt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, "vabdl", "u", int_arm_neon_vabdu, zext, 1>; +def abd_shr : + PatFrag<(ops node:$in1, node:$in2, node:$shift), + (NEONvshrs (sub (zext node:$in1), + (zext node:$in2)), (i32 $shift))>; + +def : Pat<(xor (v4i32 (bitconvert (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15)))), + (v4i32 (bitconvert (v8i16 (add (sub (zext (v8i8 DPR:$opA)), + (zext (v8i8 DPR:$opB))), + (v8i16 (abd_shr (v8i8 DPR:$opA), (v8i8 DPR:$opB), 15))))))), + (VABDLuv8i16 DPR:$opA, DPR:$opB)>; + +def : Pat<(xor (v4i32 (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)), + (v4i32 (add (sub (zext (v4i16 DPR:$opA)), + (zext (v4i16 DPR:$opB))), + (abd_shr (v4i16 DPR:$opA), (v4i16 DPR:$opB), 31)))), + (VABDLuv4i32 DPR:$opA, DPR:$opB)>; + +def : Pat<(xor (v4i32 (bitconvert (v2i64 (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))), + (v4i32 (bitconvert (v2i64 (add (sub (zext (v2i32 DPR:$opA)), + (zext (v2i32 DPR:$opB))), + (abd_shr (v2i32 DPR:$opA), (v2i32 DPR:$opB), 63)))))), + (VABDLuv2i64 DPR:$opA, DPR:$opB)>; + // VABA : Vector Absolute Difference and Accumulate defm VABAs : N3VIntOp_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ, "vaba", "s", int_arm_neon_vabds, add>; @@ -5031,53 +5179,85 @@ defm VABALu : N3VLIntExtOp_QHS<1,1,0b0101,0, IIC_VABAD, // VMAX : Vector Maximum defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vmax", "s", int_arm_neon_vmaxs, 1>; + "vmax", "s", smax, 1>; defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vmax", "u", int_arm_neon_vmaxu, 1>; + "vmax", "u", umax, 1>; def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmax", "f32", - v2f32, v2f32, int_arm_neon_vmaxs, 1>; + v2f32, v2f32, fmaxnan, 1>; def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmax", "f32", - v4f32, v4f32, int_arm_neon_vmaxs, 1>; + v4f32, v4f32, fmaxnan, 1>; +def VMAXhd : N3VDInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmax", "f16", + v4f16, v4f16, fmaxnan, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VMAXhq : N3VQInt<0, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmax", "f16", + v8f16, v8f16, fmaxnan, 1>, + Requires<[HasNEON, HasFullFP16]>; // VMAXNM let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def VMAXNMND : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1, + def VMAXNMNDf : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1, N3RegFrm, NoItinerary, "vmaxnm", "f32", - v2f32, v2f32, int_arm_neon_vmaxnm, 1>, + v2f32, v2f32, fmaxnum, 1>, Requires<[HasV8, HasNEON]>; - def VMAXNMNQ : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1, + def VMAXNMNQf : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1, N3RegFrm, NoItinerary, "vmaxnm", "f32", - v4f32, v4f32, int_arm_neon_vmaxnm, 1>, + v4f32, v4f32, 
fmaxnum, 1>, Requires<[HasV8, HasNEON]>; + def VMAXNMNDh : N3VDIntnp<0b00110, 0b01, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f16", + v4f16, v4f16, fmaxnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def VMAXNMNQh : N3VQIntnp<0b00110, 0b01, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vmaxnm", "f16", + v8f16, v8f16, fmaxnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; } // VMIN : Vector Minimum defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vmin", "s", int_arm_neon_vmins, 1>; + "vmin", "s", smin, 1>; defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vmin", "u", int_arm_neon_vminu, 1>; + "vmin", "u", umin, 1>; def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBIND, "vmin", "f32", - v2f32, v2f32, int_arm_neon_vmins, 1>; + v2f32, v2f32, fminnan, 1>; def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ, "vmin", "f32", - v4f32, v4f32, int_arm_neon_vmins, 1>; + v4f32, v4f32, fminnan, 1>; +def VMINhd : N3VDInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBIND, + "vmin", "f16", + v4f16, v4f16, fminnan, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VMINhq : N3VQInt<0, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VBINQ, + "vmin", "f16", + v8f16, v8f16, fminnan, 1>, + Requires<[HasNEON, HasFullFP16]>; // VMINNM let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def VMINNMND : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1, + def VMINNMNDf : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1, N3RegFrm, NoItinerary, "vminnm", "f32", - v2f32, v2f32, int_arm_neon_vminnm, 1>, + v2f32, v2f32, fminnum, 1>, Requires<[HasV8, HasNEON]>; - def VMINNMNQ : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1, + def VMINNMNQf : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1, N3RegFrm, NoItinerary, "vminnm", "f32", - v4f32, v4f32, int_arm_neon_vminnm, 1>, + v4f32, v4f32, fminnum, 1>, Requires<[HasV8, HasNEON]>; + def VMINNMNDh : N3VDIntnp<0b00110, 0b11, 0b1111, 0, 1, + N3RegFrm, NoItinerary, "vminnm", "f16", + v4f16, v4f16, fminnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def VMINNMNQh : N3VQIntnp<0b00110, 0b11, 0b1111, 1, 1, + N3RegFrm, NoItinerary, "vminnm", "f16", + v8f16, v8f16, fminnum, 1>, + Requires<[HasV8, HasNEON, HasFullFP16]>; } // Vector Pairwise Operations. 
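The VMAX/VMIN and VMAXNM/VMINNM selectors above encode two different floating-point max/min semantics: the base NEON VMAX/VMIN (now matched from fmaxnan/fminnan) propagate NaNs, while the ARMv8 VMAXNM/VMINNM (fmaxnum/fminnum) follow IEEE-754 maxNum/minNum and prefer the numeric operand when exactly one input is a quiet NaN. A rough scalar C++ sketch of the distinction, with ad hoc names and not taken from the patch (signed-zero and signalling-NaN details omitted):

    #include <cmath>
    #include <limits>

    // NaN-propagating maximum, the behaviour fmaxnan (VMAX.F32) matches:
    // a NaN in either operand produces a NaN result.
    static float max_propagating(float a, float b) {
      if (std::isnan(a) || std::isnan(b))
        return std::numeric_limits<float>::quiet_NaN();
      return a > b ? a : b;
    }

    // IEEE-754 maxNum, the behaviour fmaxnum (VMAXNM) matches: a quiet NaN
    // in one operand is dropped in favour of the numeric operand.
    static float max_num(float a, float b) {
      if (std::isnan(a)) return b;
      if (std::isnan(b)) return a;
      return a > b ? a : b;
    }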
@@ -5095,6 +5275,10 @@ def VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, N3RegFrm, IIC_VSHLiD, def VPADDf : N3VDInt<1, 0, 0b00, 0b1101, 0, N3RegFrm, IIC_VPBIND, "vpadd", "f32", v2f32, v2f32, int_arm_neon_vpadd, 0>; +def VPADDh : N3VDInt<1, 0, 0b01, 0b1101, 0, N3RegFrm, + IIC_VPBIND, "vpadd", "f16", + v4f16, v4f16, int_arm_neon_vpadd, 0>, + Requires<[HasNEON, HasFullFP16]>; // VPADDL : Vector Pairwise Add Long defm VPADDLs : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s", @@ -5123,6 +5307,9 @@ def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, N3RegFrm, IIC_VSUBi4D, "vpmax", "u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>; def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax", "f32", v2f32, v2f32, int_arm_neon_vpmaxs, 0>; +def VPMAXh : N3VDInt<1, 0, 0b01, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmax", + "f16", v4f16, v4f16, int_arm_neon_vpmaxs, 0>, + Requires<[HasNEON, HasFullFP16]>; // VPMIN : Vector Pairwise Minimum def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", @@ -5139,6 +5326,9 @@ def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, N3RegFrm, IIC_VSUBi4D, "vpmin", "u32", v2i32, v2i32, int_arm_neon_vpminu, 0>; def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin", "f32", v2f32, v2f32, int_arm_neon_vpmins, 0>; +def VPMINh : N3VDInt<1, 0, 0b11, 0b1111, 0, N3RegFrm, IIC_VPBIND, "vpmin", + "f16", v4f16, v4f16, int_arm_neon_vpmins, 0>, + Requires<[HasNEON, HasFullFP16]>; // Vector Reciprocal and Reciprocal Square Root Estimate and Step. @@ -5155,6 +5345,14 @@ def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, IIC_VUNAQ, "vrecpe", "f32", v4f32, v4f32, int_arm_neon_vrecpe>; +def VRECPEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0, + IIC_VUNAD, "vrecpe", "f16", + v4f16, v4f16, int_arm_neon_vrecpe>, + Requires<[HasNEON, HasFullFP16]>; +def VRECPEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01010, 0, + IIC_VUNAQ, "vrecpe", "f16", + v8f16, v8f16, int_arm_neon_vrecpe>, + Requires<[HasNEON, HasFullFP16]>; // VRECPS : Vector Reciprocal Step def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, @@ -5163,6 +5361,14 @@ def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, def VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1, N3RegFrm, IIC_VRECSQ, "vrecps", "f32", v4f32, v4f32, int_arm_neon_vrecps, 1>; +def VRECPShd : N3VDInt<0, 0, 0b01, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrecps", "f16", + v4f16, v4f16, int_arm_neon_vrecps, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VRECPShq : N3VQInt<0, 0, 0b01, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrecps", "f16", + v8f16, v8f16, int_arm_neon_vrecps, 1>, + Requires<[HasNEON, HasFullFP16]>; // VRSQRTE : Vector Reciprocal Square Root Estimate def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, @@ -5177,6 +5383,14 @@ def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, IIC_VUNAQ, "vrsqrte", "f32", v4f32, v4f32, int_arm_neon_vrsqrte>; +def VRSQRTEhd : N2VDInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0, + IIC_VUNAD, "vrsqrte", "f16", + v4f16, v4f16, int_arm_neon_vrsqrte>, + Requires<[HasNEON, HasFullFP16]>; +def VRSQRTEhq : N2VQInt<0b11, 0b11, 0b01, 0b11, 0b01011, 0, + IIC_VUNAQ, "vrsqrte", "f16", + v8f16, v8f16, int_arm_neon_vrsqrte>, + Requires<[HasNEON, HasFullFP16]>; // VRSQRTS : Vector Reciprocal Square Root Step def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, @@ -5185,6 +5399,14 @@ def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, N3RegFrm, def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 
1, N3RegFrm, IIC_VRECSQ, "vrsqrts", "f32", v4f32, v4f32, int_arm_neon_vrsqrts, 1>; +def VRSQRTShd : N3VDInt<0, 0, 0b11, 0b1111, 1, N3RegFrm, + IIC_VRECSD, "vrsqrts", "f16", + v4f16, v4f16, int_arm_neon_vrsqrts, 1>, + Requires<[HasNEON, HasFullFP16]>; +def VRSQRTShq : N3VQInt<0, 0, 0b11, 0b1111, 1, N3RegFrm, + IIC_VRECSQ, "vrsqrts", "f16", + v8f16, v8f16, int_arm_neon_vrsqrts, 1>, + Requires<[HasNEON, HasFullFP16]>; // Vector Shifts. @@ -5336,6 +5558,14 @@ def VABSfd : N2VD<0b11, 0b11, 0b10, 0b01, 0b01110, 0, def VABSfq : N2VQ<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs", "f32", v4f32, v4f32, fabs>; +def VABShd : N2VD<0b11, 0b11, 0b01, 0b01, 0b01110, 0, + "vabs", "f16", + v4f16, v4f16, fabs>, + Requires<[HasNEON, HasFullFP16]>; +def VABShq : N2VQ<0b11, 0b11, 0b01, 0b01, 0b01110, 0, + "vabs", "f16", + v8f16, v8f16, fabs>, + Requires<[HasNEON, HasFullFP16]>; def : Pat<(xor (v2i32 (bitconvert (v8i8 (NEONvshrs DPR:$src, (i32 7))))), (v2i32 (bitconvert (v8i8 (add DPR:$src, @@ -5398,6 +5628,16 @@ def VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0, (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ, "vneg", "f32", "$Vd, $Vm", "", [(set QPR:$Vd, (v4f32 (fneg QPR:$Vm)))]>; +def VNEGhd : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 0, 0, + (outs DPR:$Vd), (ins DPR:$Vm), IIC_VUNAD, + "vneg", "f16", "$Vd, $Vm", "", + [(set DPR:$Vd, (v4f16 (fneg DPR:$Vm)))]>, + Requires<[HasNEON, HasFullFP16]>; +def VNEGhq : N2V<0b11, 0b11, 0b01, 0b01, 0b01111, 1, 0, + (outs QPR:$Vd), (ins QPR:$Vm), IIC_VUNAQ, + "vneg", "f16", "$Vd, $Vm", "", + [(set QPR:$Vd, (v8f16 (fneg QPR:$Vm)))]>, + Requires<[HasNEON, HasFullFP16]>; def : Pat<(v8i8 (vnegd DPR:$src)), (VNEGs8d DPR:$src)>; def : Pat<(v4i16 (vnegd DPR:$src)), (VNEGs16d DPR:$src)>; @@ -5868,18 +6108,56 @@ def VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32", def VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", v4f32, v4i32, uint_to_fp>; +def VCVTh2sd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16", + v4i16, v4f16, fp_to_sint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTh2ud : N2VD<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16", + v4i16, v4f16, fp_to_uint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTs2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16", + v4f16, v4i16, sint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTu2hd : N2VD<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16", + v4f16, v4i16, uint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; + +def VCVTh2sq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01110, 0, "vcvt", "s16.f16", + v8i16, v8f16, fp_to_sint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTh2uq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01111, 0, "vcvt", "u16.f16", + v8i16, v8f16, fp_to_uint>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTs2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01100, 0, "vcvt", "f16.s16", + v8f16, v8i16, sint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; +def VCVTu2hq : N2VQ<0b11, 0b11, 0b01, 0b11, 0b01101, 0, "vcvt", "f16.u16", + v8f16, v8i16, uint_to_fp>, + Requires<[HasNEON, HasFullFP16]>; + // VCVT{A, N, P, M} multiclass VCVT_FPI<string op, bits<3> op10_8, SDPatternOperator IntS, SDPatternOperator IntU> { let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def SD : N2VDIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + def SDf : N2VDIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), "s32.f32", v2i32, v2f32, IntS>, Requires<[HasV8, HasNEON]>; - def SQ : N2VQIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + def SQf 
: N2VQIntnp<0b10, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), "s32.f32", v4i32, v4f32, IntS>, Requires<[HasV8, HasNEON]>; - def UD : N2VDIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + def UDf : N2VDIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), "u32.f32", v2i32, v2f32, IntU>, Requires<[HasV8, HasNEON]>; - def UQ : N2VQIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + def UQf : N2VQIntnp<0b10, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), "u32.f32", v4i32, v4f32, IntU>, Requires<[HasV8, HasNEON]>; + def SDh : N2VDIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + "s16.f16", v4i16, v4f16, IntS>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def SQh : N2VQIntnp<0b01, 0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op), + "s16.f16", v8i16, v8f16, IntS>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def UDh : N2VDIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + "u16.f16", v4i16, v4f16, IntU>, + Requires<[HasV8, HasNEON, HasFullFP16]>; + def UQh : N2VQIntnp<0b01, 0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op), + "u16.f16", v8i16, v8f16, IntU>, + Requires<[HasV8, HasNEON, HasFullFP16]>; } } @@ -5898,6 +6176,16 @@ def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", v2f32, v2i32, int_arm_neon_vcvtfxs2fp>; def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", v2f32, v2i32, int_arm_neon_vcvtfxu2fp>; +let Predicates = [HasNEON, HasFullFP16] in { +def VCVTh2xsd : N2VCvtD<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16", + v4i16, v4f16, int_arm_neon_vcvtfp2fxs>; +def VCVTh2xud : N2VCvtD<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16", + v4i16, v4f16, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2hd : N2VCvtD<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16", + v4f16, v4i16, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2hd : N2VCvtD<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16", + v4f16, v4i16, int_arm_neon_vcvtfxu2fp>; +} // Predicates = [HasNEON, HasFullFP16] } let DecoderMethod = "DecodeVCVTQ" in { @@ -5909,6 +6197,16 @@ def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", v4f32, v4i32, int_arm_neon_vcvtfxs2fp>; def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", v4f32, v4i32, int_arm_neon_vcvtfxu2fp>; +let Predicates = [HasNEON, HasFullFP16] in { +def VCVTh2xsq : N2VCvtQ<0, 1, 0b1101, 0, 1, "vcvt", "s16.f16", + v8i16, v8f16, int_arm_neon_vcvtfp2fxs>; +def VCVTh2xuq : N2VCvtQ<1, 1, 0b1101, 0, 1, "vcvt", "u16.f16", + v8i16, v8f16, int_arm_neon_vcvtfp2fxu>; +def VCVTxs2hq : N2VCvtQ<0, 1, 0b1100, 0, 1, "vcvt", "f16.s16", + v8f16, v8i16, int_arm_neon_vcvtfxs2fp>; +def VCVTxu2hq : N2VCvtQ<1, 1, 0b1100, 0, 1, "vcvt", "f16.u16", + v8f16, v8i16, int_arm_neon_vcvtfxu2fp>; +} // Predicates = [HasNEON, HasFullFP16] } def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0", @@ -5929,6 +6227,24 @@ def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0", def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0", (VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.s16.f16 $Dd, $Dm, #0", + (VCVTh2sd DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.u16.f16 $Dd, $Dm, #0", + (VCVTh2ud DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.s16 $Dd, $Dm, #0", + (VCVTs2hd DPR:$Dd, DPR:$Dm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.u16 $Dd, $Dm, #0", + (VCVTu2hd DPR:$Dd, DPR:$Dm, pred:$p)>; + +def : NEONInstAlias<"vcvt${p}.s16.f16 $Qd, $Qm, #0", + (VCVTh2sq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.u16.f16 $Qd, $Qm, #0", + (VCVTh2uq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : 
NEONInstAlias<"vcvt${p}.f16.s16 $Qd, $Qm, #0", + (VCVTs2hq QPR:$Qd, QPR:$Qm, pred:$p)>; +def : NEONInstAlias<"vcvt${p}.f16.u16 $Qd, $Qm, #0", + (VCVTu2hq QPR:$Qd, QPR:$Qm, pred:$p)>; + // VCVT : Vector Convert Between Half-Precision and Single-Precision. def VCVTf2h : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0, @@ -6182,22 +6498,40 @@ def VTBX4Pseudo // VRINT : Vector Rounding multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> { let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in { - def D : N2VDIntnp<0b10, 0b100, 0, NoItinerary, + def Df : N2VDIntnp<0b10, 0b10, 0b100, 0, NoItinerary, !strconcat("vrint", op), "f32", v2f32, v2f32, Int>, Requires<[HasV8, HasNEON]> { let Inst{9-7} = op9_7; } - def Q : N2VQIntnp<0b10, 0b100, 0, NoItinerary, + def Qf : N2VQIntnp<0b10, 0b10, 0b100, 0, NoItinerary, !strconcat("vrint", op), "f32", v4f32, v4f32, Int>, Requires<[HasV8, HasNEON]> { let Inst{9-7} = op9_7; } + def Dh : N2VDIntnp<0b01, 0b10, 0b100, 0, NoItinerary, + !strconcat("vrint", op), "f16", + v4f16, v4f16, Int>, + Requires<[HasV8, HasNEON, HasFullFP16]> { + let Inst{9-7} = op9_7; + } + def Qh : N2VQIntnp<0b01, 0b10, 0b100, 0, NoItinerary, + !strconcat("vrint", op), "f16", + v8f16, v8f16, Int>, + Requires<[HasV8, HasNEON, HasFullFP16]> { + let Inst{9-7} = op9_7; + } } def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"), - (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>; + (!cast<Instruction>(NAME#"Df") DPR:$Dd, DPR:$Dm)>; def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"), - (!cast<Instruction>(NAME#"Q") QPR:$Qd, QPR:$Qm)>; + (!cast<Instruction>(NAME#"Qf") QPR:$Qd, QPR:$Qm)>; + let Predicates = [HasNEON, HasFullFP16] in { + def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Dd, $Dm"), + (!cast<Instruction>(NAME#"Dh") DPR:$Dd, DPR:$Dm)>; + def : NEONInstAlias<!strconcat("vrint", op, ".f16.f16\t$Qd, $Qm"), + (!cast<Instruction>(NAME#"Qh") QPR:$Qd, QPR:$Qm)>; + } } defm VRINTNN : VRINT_FPI<"n", 0b000, int_arm_neon_vrintn>; @@ -6343,8 +6677,8 @@ def : N3VSMulOpPat<fmul, fsub, VFMSfd>, Requires<[HasVFP4, UseNEONForFP, UseFusedMAC]>; def : N2VSPat<fabs, VABSfd>; def : N2VSPat<fneg, VNEGfd>; -def : N3VSPat<NEONfmax, VMAXfd>; -def : N3VSPat<NEONfmin, VMINfd>; +def : N3VSPat<fmaxnan, VMAXfd>, Requires<[HasNEON]>; +def : N3VSPat<fminnan, VMINfd>, Requires<[HasNEON]>; def : NVCVTFIPat<fp_to_sint, VCVTf2sd>; def : NVCVTFIPat<fp_to_uint, VCVTf2ud>; def : NVCVTIFPat<sint_to_fp, VCVTs2fd>; @@ -7704,6 +8038,9 @@ def : NEONInstAlias<"vcle${p}.u32 $Dd, $Dn, $Dm", (VCGEuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; def : NEONInstAlias<"vcle${p}.f32 $Dd, $Dn, $Dm", (VCGEfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vcle${p}.f16 $Dd, $Dn, $Dm", + (VCGEhd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; // Q-register versions. def : NEONInstAlias<"vcle${p}.s8 $Qd, $Qn, $Qm", (VCGEsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; @@ -7719,6 +8056,9 @@ def : NEONInstAlias<"vcle${p}.u32 $Qd, $Qn, $Qm", (VCGEuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; def : NEONInstAlias<"vcle${p}.f32 $Qd, $Qn, $Qm", (VCGEfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vcle${p}.f16 $Qd, $Qn, $Qm", + (VCGEhq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; // VCLT (register) is an assembler alias for VCGT w/ the operands reversed. // D-register versions. 
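A short note on the VRINT_FPI multiclass reworked above (it now instantiates both f32 and f16 variants): the suffix letter of each vrint mnemonic selects the per-lane rounding behaviour. A scalar C++ sketch of the usual correspondences, with ad hoc helper names, illustrative only and not part of the patch:

    #include <cmath>

    // Per-lane rounding of the VRINT family instantiated by VRINT_FPI:
    //   vrintn -> nearest, ties to even
    //   vrinta -> nearest, ties away from zero (std::round)
    //   vrintp -> toward +infinity (std::ceil)
    //   vrintm -> toward -infinity (std::floor)
    //   vrintz -> toward zero (std::trunc)
    static float vrinta_lane(float x) { return std::round(x); }
    static float vrintp_lane(float x) { return std::ceil(x); }
    static float vrintm_lane(float x) { return std::floor(x); }
    static float vrintz_lane(float x) { return std::trunc(x); }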
@@ -7736,6 +8076,9 @@ def : NEONInstAlias<"vclt${p}.u32 $Dd, $Dn, $Dm", (VCGTuv2i32 DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; def : NEONInstAlias<"vclt${p}.f32 $Dd, $Dn, $Dm", (VCGTfd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vclt${p}.f16 $Dd, $Dn, $Dm", + (VCGThd DPR:$Dd, DPR:$Dm, DPR:$Dn, pred:$p)>; // Q-register versions. def : NEONInstAlias<"vclt${p}.s8 $Qd, $Qn, $Qm", (VCGTsv16i8 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; @@ -7751,6 +8094,9 @@ def : NEONInstAlias<"vclt${p}.u32 $Qd, $Qn, $Qm", (VCGTuv4i32 QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; def : NEONInstAlias<"vclt${p}.f32 $Qd, $Qn, $Qm", (VCGTfq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; +let Predicates = [HasNEON, HasFullFP16] in +def : NEONInstAlias<"vclt${p}.f16 $Qd, $Qn, $Qm", + (VCGThq QPR:$Qd, QPR:$Qm, QPR:$Qn, pred:$p)>; // VSWP allows, but does not require, a type suffix. defm : NEONDTAnyInstAlias<"vswp${p}", "$Vd, $Vm", diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td index 40414da..df6f243 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb.td @@ -591,6 +591,34 @@ def tTRAP : TI<(outs), (ins), IIC_Br, // Load Store Instructions. // +// PC-relative loads need to be matched first as constant pool accesses need to +// always be PC-relative. We do this using AddedComplexity, as the pattern is +// simpler than the patterns of the other load instructions. +let canFoldAsLoad = 1, isReMaterializable = 1, AddedComplexity = 10 in +def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>, + T1Encoding<{0,1,0,0,1,?}> { + // A6.2 & A8.6.59 + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + +// SP-relative loads should be matched before standard immediate-offset loads as +// it means we avoid having to move SP to another register. +let canFoldAsLoad = 1 in +def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i, + "ldr", "\t$Rt, $addr", + [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>, + T1LdStSP<{1,?,?}> { + bits<3> Rt; + bits<8> addr; + let Inst{10-8} = Rt; + let Inst{7-0} = addr; +} + // Loads: reg/reg and reg/imm5 let canFoldAsLoad = 1, isReMaterializable = 1 in multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, @@ -598,16 +626,20 @@ multiclass thumb_ld_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, AddrMode am, InstrItinClass itin_r, InstrItinClass itin_i, string asm, PatFrag opnode> { - def r : // reg/reg - T1pILdStEncode<reg_opc, - (outs tGPR:$Rt), (ins AddrMode_r:$addr), - am, itin_r, asm, "\t$Rt, $addr", - [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>; + // Immediate-offset loads should be matched before register-offset loads as + // when the offset is a constant it's simpler to first check if it fits in the + // immediate offset field then fall back to register-offset if it doesn't. def i : // reg/imm5 T1pILdStEncodeImm<imm_opc, 1 /* Load */, (outs tGPR:$Rt), (ins AddrMode_i:$addr), am, itin_i, asm, "\t$Rt, $addr", [(set tGPR:$Rt, (opnode AddrMode_i:$addr))]>; + // Register-offset loads are matched last. 
+ def r : // reg/reg + T1pILdStEncode<reg_opc, + (outs tGPR:$Rt), (ins AddrMode_r:$addr), + am, itin_r, asm, "\t$Rt, $addr", + [(set tGPR:$Rt, (opnode AddrMode_r:$addr))]>; } // Stores: reg/reg and reg/imm5 multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, @@ -615,32 +647,32 @@ multiclass thumb_st_rr_ri_enc<bits<3> reg_opc, bits<4> imm_opc, AddrMode am, InstrItinClass itin_r, InstrItinClass itin_i, string asm, PatFrag opnode> { - def r : // reg/reg - T1pILdStEncode<reg_opc, - (outs), (ins tGPR:$Rt, AddrMode_r:$addr), - am, itin_r, asm, "\t$Rt, $addr", - [(opnode tGPR:$Rt, AddrMode_r:$addr)]>; def i : // reg/imm5 T1pILdStEncodeImm<imm_opc, 0 /* Store */, (outs), (ins tGPR:$Rt, AddrMode_i:$addr), am, itin_i, asm, "\t$Rt, $addr", [(opnode tGPR:$Rt, AddrMode_i:$addr)]>; + def r : // reg/reg + T1pILdStEncode<reg_opc, + (outs), (ins tGPR:$Rt, AddrMode_r:$addr), + am, itin_r, asm, "\t$Rt, $addr", + [(opnode tGPR:$Rt, AddrMode_r:$addr)]>; } // A8.6.57 & A8.6.60 -defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rrs4, +defm tLDR : thumb_ld_rr_ri_enc<0b100, 0b0110, t_addrmode_rr, t_addrmode_is4, AddrModeT1_4, IIC_iLoad_r, IIC_iLoad_i, "ldr", UnOpFrag<(load node:$Src)>>; // A8.6.64 & A8.6.61 -defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rrs1, +defm tLDRB : thumb_ld_rr_ri_enc<0b110, 0b0111, t_addrmode_rr, t_addrmode_is1, AddrModeT1_1, IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrb", UnOpFrag<(zextloadi8 node:$Src)>>; // A8.6.76 & A8.6.73 -defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rrs2, +defm tLDRH : thumb_ld_rr_ri_enc<0b101, 0b1000, t_addrmode_rr, t_addrmode_is2, AddrModeT1_2, IIC_iLoad_bh_r, IIC_iLoad_bh_i, "ldrh", UnOpFrag<(zextloadi16 node:$Src)>>; @@ -659,58 +691,36 @@ def tLDRSH : // A8.6.84 "ldrsh", "\t$Rt, $addr", [(set tGPR:$Rt, (sextloadi16 t_addrmode_rr:$addr))]>; -let canFoldAsLoad = 1 in -def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i, - "ldr", "\t$Rt, $addr", - [(set tGPR:$Rt, (load t_addrmode_sp:$addr))]>, - T1LdStSP<{1,?,?}> { - bits<3> Rt; - bits<8> addr; - let Inst{10-8} = Rt; - let Inst{7-0} = addr; -} -let canFoldAsLoad = 1, isReMaterializable = 1 in -def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i, - "ldr", "\t$Rt, $addr", - [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>, - T1Encoding<{0,1,0,0,1,?}> { - // A6.2 & A8.6.59 +def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i, + "str", "\t$Rt, $addr", + [(store tGPR:$Rt, t_addrmode_sp:$addr)]>, + T1LdStSP<{0,?,?}> { bits<3> Rt; bits<8> addr; let Inst{10-8} = Rt; - let Inst{7-0} = addr; + let Inst{7-0} = addr; } // A8.6.194 & A8.6.192 -defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4, +defm tSTR : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rr, t_addrmode_is4, AddrModeT1_4, IIC_iStore_r, IIC_iStore_i, "str", BinOpFrag<(store node:$LHS, node:$RHS)>>; // A8.6.197 & A8.6.195 -defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rrs1, +defm tSTRB : thumb_st_rr_ri_enc<0b010, 0b0111, t_addrmode_rr, t_addrmode_is1, AddrModeT1_1, IIC_iStore_bh_r, IIC_iStore_bh_i, "strb", BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>; // A8.6.207 & A8.6.205 -defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rrs2, +defm tSTRH : thumb_st_rr_ri_enc<0b001, 0b1000, t_addrmode_rr, t_addrmode_is2, AddrModeT1_2, IIC_iStore_bh_r, IIC_iStore_bh_i, "strh", BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>; -def tSTRspi : T1pIs<(outs), (ins tGPR:$Rt, t_addrmode_sp:$addr), IIC_iStore_i, - "str", "\t$Rt, $addr", - 
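The comments introduced in the load/store multiclasses above explain the new matching order: try the reg/imm5 form first, fall back to reg/reg. A rough, self-contained C++ sketch of that decision follows; the helper names here are invented for illustration (the real check lives in the t_addrmode_is1/is2/is4 ComplexPatterns), and the limits reflect the 5-bit scaled offset field of Thumb1 LDR/STR.

// Illustration only: a Thumb1 immediate offset is usable when it is
// non-negative, a multiple of the access size, and at most 31 * scale.
#include <cassert>
#include <cstdint>

static bool fitsThumb1ImmOffset(int64_t Offset, unsigned Scale) {
  assert(Scale == 1 || Scale == 2 || Scale == 4);   // byte / half / word
  return Offset >= 0 && Offset % Scale == 0 && Offset / Scale <= 31;
}

// Hypothetical selector: prefer reg/imm5, otherwise use reg/reg.
enum class AddrForm { Imm5, RegReg };

static AddrForm pickThumb1AddrForm(int64_t Offset, unsigned Scale) {
  return fitsThumb1ImmOffset(Offset, Scale) ? AddrForm::Imm5 : AddrForm::RegReg;
}

int main() {
  assert(pickThumb1AddrForm(124, 4) == AddrForm::Imm5);   // 31 * 4 still fits
  assert(pickThumb1AddrForm(128, 4) == AddrForm::RegReg); // too large
  assert(pickThumb1AddrForm(3, 2) == AddrForm::RegReg);   // not a multiple of 2
}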
[(store tGPR:$Rt, t_addrmode_sp:$addr)]>, - T1LdStSP<{0,?,?}> { - bits<3> Rt; - bits<8> addr; - let Inst{10-8} = Rt; - let Inst{7-0} = addr; -} - //===----------------------------------------------------------------------===// // Load / store multiple Instructions. // @@ -730,6 +740,7 @@ def tLDMIA : T1I<(outs), (ins tGPR:$Rn, pred:$p, reglist:$regs, variable_ops), // Writeback version is just a pseudo, as there's no encoding difference. // Writeback happens iff the base register is not in the destination register // list. +let mayLoad = 1, hasExtraDefRegAllocReq = 1 in def tLDMIA_UPD : InstTemplate<AddrModeNone, 0, IndexModeNone, Pseudo, GenericDomain, "$Rn = $wb", IIC_iLoad_mu>, @@ -1328,16 +1339,16 @@ def : T1Pat<(subc tGPR:$lhs, tGPR:$rhs), (tSUBrr tGPR:$lhs, tGPR:$rhs)>; // Bswap 16 with load/store -def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rrs2:$addr)), (i32 16)), - (tREV16 (tLDRHr t_addrmode_rrs2:$addr))>; def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)), (tREV16 (tLDRHi t_addrmode_is2:$addr))>; -def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)), - t_addrmode_rrs2:$addr), - (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rrs2:$addr)>; +def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rr:$addr)), (i32 16)), + (tREV16 (tLDRHr t_addrmode_rr:$addr))>; def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)), t_addrmode_is2:$addr), (tSTRHi(tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>; +def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)), + t_addrmode_rr:$addr), + (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rr:$addr)>; // ConstantPool def : T1Pat<(ARMWrapper tconstpool :$dst), (tLEApcrel tconstpool :$dst)>; @@ -1372,10 +1383,10 @@ def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>, Requires<[IsThumb, HasV5T]>; // zextload i1 -> zextload i8 -def : T1Pat<(zextloadi1 t_addrmode_rrs1:$addr), - (tLDRBr t_addrmode_rrs1:$addr)>; def : T1Pat<(zextloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(zextloadi1 t_addrmode_rr:$addr), + (tLDRBr t_addrmode_rr:$addr)>; // extload from the stack -> word load from the stack, as it avoids having to // materialize the base in a separate register. This only works when a word @@ -1389,61 +1400,61 @@ def : T1Pat<(extloadi16 t_addrmode_sp:$addr), (tLDRspi t_addrmode_sp:$addr)>, Requires<[IsThumb, IsThumb1Only, IsLE]>; // extload -> zextload -def : T1Pat<(extloadi1 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>; -def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; -def : T1Pat<(extloadi8 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>; -def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; -def : T1Pat<(extloadi16 t_addrmode_rrs2:$addr), (tLDRHr t_addrmode_rrs2:$addr)>; -def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>; +def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(extloadi1 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>; +def : T1Pat<(extloadi8 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>; +def : T1Pat<(extloadi8 t_addrmode_rr:$addr), (tLDRBr t_addrmode_rr:$addr)>; +def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>; +def : T1Pat<(extloadi16 t_addrmode_rr:$addr), (tLDRHr t_addrmode_rr:$addr)>; // If it's impossible to use [r,r] address mode for sextload, select to // ldr{b|h} + sxt{b|h} instead. 
def : T1Pat<(sextloadi8 t_addrmode_is1:$addr), (tSXTB (tLDRBi t_addrmode_is1:$addr))>, Requires<[IsThumb, IsThumb1Only, HasV6]>; -def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr), - (tSXTB (tLDRBr t_addrmode_rrs1:$addr))>, +def : T1Pat<(sextloadi8 t_addrmode_rr:$addr), + (tSXTB (tLDRBr t_addrmode_rr:$addr))>, Requires<[IsThumb, IsThumb1Only, HasV6]>; def : T1Pat<(sextloadi16 t_addrmode_is2:$addr), (tSXTH (tLDRHi t_addrmode_is2:$addr))>, Requires<[IsThumb, IsThumb1Only, HasV6]>; -def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr), - (tSXTH (tLDRHr t_addrmode_rrs2:$addr))>, +def : T1Pat<(sextloadi16 t_addrmode_rr:$addr), + (tSXTH (tLDRHr t_addrmode_rr:$addr))>, Requires<[IsThumb, IsThumb1Only, HasV6]>; -def : T1Pat<(sextloadi8 t_addrmode_rrs1:$addr), - (tASRri (tLSLri (tLDRBr t_addrmode_rrs1:$addr), 24), 24)>; def : T1Pat<(sextloadi8 t_addrmode_is1:$addr), (tASRri (tLSLri (tLDRBi t_addrmode_is1:$addr), 24), 24)>; -def : T1Pat<(sextloadi16 t_addrmode_rrs2:$addr), - (tASRri (tLSLri (tLDRHr t_addrmode_rrs2:$addr), 16), 16)>; +def : T1Pat<(sextloadi8 t_addrmode_rr:$addr), + (tASRri (tLSLri (tLDRBr t_addrmode_rr:$addr), 24), 24)>; def : T1Pat<(sextloadi16 t_addrmode_is2:$addr), (tASRri (tLSLri (tLDRHi t_addrmode_is2:$addr), 16), 16)>; +def : T1Pat<(sextloadi16 t_addrmode_rr:$addr), + (tASRri (tLSLri (tLDRHr t_addrmode_rr:$addr), 16), 16)>; def : T1Pat<(atomic_load_8 t_addrmode_is1:$src), (tLDRBi t_addrmode_is1:$src)>; -def : T1Pat<(atomic_load_8 t_addrmode_rrs1:$src), - (tLDRBr t_addrmode_rrs1:$src)>; +def : T1Pat<(atomic_load_8 t_addrmode_rr:$src), + (tLDRBr t_addrmode_rr:$src)>; def : T1Pat<(atomic_load_16 t_addrmode_is2:$src), (tLDRHi t_addrmode_is2:$src)>; -def : T1Pat<(atomic_load_16 t_addrmode_rrs2:$src), - (tLDRHr t_addrmode_rrs2:$src)>; +def : T1Pat<(atomic_load_16 t_addrmode_rr:$src), + (tLDRHr t_addrmode_rr:$src)>; def : T1Pat<(atomic_load_32 t_addrmode_is4:$src), (tLDRi t_addrmode_is4:$src)>; -def : T1Pat<(atomic_load_32 t_addrmode_rrs4:$src), - (tLDRr t_addrmode_rrs4:$src)>; +def : T1Pat<(atomic_load_32 t_addrmode_rr:$src), + (tLDRr t_addrmode_rr:$src)>; def : T1Pat<(atomic_store_8 t_addrmode_is1:$ptr, tGPR:$val), (tSTRBi tGPR:$val, t_addrmode_is1:$ptr)>; -def : T1Pat<(atomic_store_8 t_addrmode_rrs1:$ptr, tGPR:$val), - (tSTRBr tGPR:$val, t_addrmode_rrs1:$ptr)>; +def : T1Pat<(atomic_store_8 t_addrmode_rr:$ptr, tGPR:$val), + (tSTRBr tGPR:$val, t_addrmode_rr:$ptr)>; def : T1Pat<(atomic_store_16 t_addrmode_is2:$ptr, tGPR:$val), (tSTRHi tGPR:$val, t_addrmode_is2:$ptr)>; -def : T1Pat<(atomic_store_16 t_addrmode_rrs2:$ptr, tGPR:$val), - (tSTRHr tGPR:$val, t_addrmode_rrs2:$ptr)>; +def : T1Pat<(atomic_store_16 t_addrmode_rr:$ptr, tGPR:$val), + (tSTRHr tGPR:$val, t_addrmode_rr:$ptr)>; def : T1Pat<(atomic_store_32 t_addrmode_is4:$ptr, tGPR:$val), (tSTRi tGPR:$val, t_addrmode_is4:$ptr)>; -def : T1Pat<(atomic_store_32 t_addrmode_rrs4:$ptr, tGPR:$val), - (tSTRr tGPR:$val, t_addrmode_rrs4:$ptr)>; +def : T1Pat<(atomic_store_32 t_addrmode_rr:$ptr, tGPR:$val), + (tSTRr tGPR:$val, t_addrmode_rr:$ptr)>; // Large immediate handling. diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td index aba8a7b..d460d33 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrThumb2.td @@ -43,7 +43,7 @@ def t2_shift_imm : Operand<i32> { // Shifted operands. No register controlled shifts for Thumb2. // Note: We do not support rrx shifted operands yet. 
def t2_so_reg : Operand<i32>, // reg imm - ComplexPattern<i32, 2, "SelectT2ShifterOperandReg", + ComplexPattern<i32, 2, "SelectShiftImmShifterOperand", [shl,srl,sra,rotr]> { let EncoderMethod = "getT2SORegOpValue"; let PrintMethod = "printT2SOOperand"; @@ -1554,19 +1554,21 @@ def t2STRBT : T2IstT<0b00, "strbt", IIC_iStore_bh_i>; def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>; // ldrd / strd pre / post variants -// For disassembly only. +let mayLoad = 1 in def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb), (ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> { let DecoderMethod = "DecodeT2LDRDPreInstruction"; } +let mayLoad = 1 in def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb), (ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm), IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm", "$addr.base = $wb", []>; +let mayStore = 1 in def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb), (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr), IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!", @@ -1574,6 +1576,7 @@ def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb), let DecoderMethod = "DecodeT2STRDPreInstruction"; } +let mayStore = 1 in def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb), (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr, t2am_imm8s4_offset:$imm), @@ -2100,7 +2103,7 @@ def : T2Pat<(ARMadde rGPR:$src, imm0_65535_neg:$imm, CPSR), def t2SEL : T2ThreeReg<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), NoItinerary, "sel", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-24} = 0b010; let Inst{23} = 0b1; @@ -2117,7 +2120,7 @@ class T2I_pam<bits<3> op22_20, bits<4> op7_4, string opc, dag iops = (ins rGPR:$Rn, rGPR:$Rm), string asm = "\t$Rd, $Rn, $Rm"> : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, pat>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0101; let Inst{22-20} = op22_20; @@ -2215,13 +2218,13 @@ class T2FourReg_mac<bit long, bits<3> op22_20, bits<4> op7_4, dag oops, def t2USAD8 : T2ThreeReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), NoItinerary, "usad8", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{15-12} = 0b1111; } def t2USADA8 : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), NoItinerary, "usada8", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; // Signed/Unsigned saturate. 
class T2SatI<dag oops, dag iops, InstrItinClass itin, @@ -2254,7 +2257,7 @@ def t2SSAT: T2SatI< def t2SSAT16: T2SatI< (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary, "ssat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11110; let Inst{25-22} = 0b1100; let Inst{20} = 0; @@ -2278,7 +2281,7 @@ def t2USAT: T2SatI< def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn), NoItinerary, "usat16", "\t$Rd, $sat_imm, $Rn", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-22} = 0b1111001110; let Inst{20} = 0; let Inst{15} = 0; @@ -2288,8 +2291,8 @@ def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn), let Inst{5-4} = 0b00; } -def : T2Pat<(int_arm_ssat GPR:$a, imm:$pos), (t2SSAT imm:$pos, GPR:$a, 0)>; -def : T2Pat<(int_arm_usat GPR:$a, imm:$pos), (t2USAT imm:$pos, GPR:$a, 0)>; +def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>; +def : T2Pat<(int_arm_usat GPR:$a, imm0_31:$pos), (t2USAT imm0_31:$pos, GPR:$a, 0)>; //===----------------------------------------------------------------------===// // Shift and rotate Instructions. @@ -2605,7 +2608,7 @@ def t2UMAAL : T2MulLong<0b110, 0b0110, (outs rGPR:$RdLo, rGPR:$RdHi), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; } // hasSideEffects // Rounding variants of the below included for disassembly only @@ -2614,7 +2617,7 @@ def t2UMAAL : T2MulLong<0b110, 0b0110, def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (mulhs rGPR:$Rn, rGPR:$Rm))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -2624,7 +2627,7 @@ def t2SMMUL : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, def t2SMMULR : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -2636,7 +2639,7 @@ def t2SMMLA : T2FourReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -2646,7 +2649,7 @@ def t2SMMLA : T2FourReg< def t2SMMLAR: T2FourReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b101; @@ -2657,7 +2660,7 @@ def t2SMMLS: T2FourReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (sub rGPR:$Ra, (mulhs rGPR:$Rn, rGPR:$Rm)))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b110; @@ -2667,7 +2670,7 @@ def t2SMMLS: T2FourReg< def t2SMMLSR:T2FourReg< (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>, - 
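The tightened t2SSAT/t2USAT patterns a little above now require the saturation position to be imm1_32 and imm0_31 respectively, matching what the instructions can encode. Purely as an illustration of those semantics (this code is not from the commit), SSAT clamps to a signed N-bit range with N in 1..32 and USAT to an unsigned N-bit range with N in 0..31:

#include <cassert>
#include <cstdint>

static int32_t ssat(int64_t X, unsigned N) {        // N: 1..32
  assert(N >= 1 && N <= 32);
  int64_t Max = (int64_t(1) << (N - 1)) - 1;
  int64_t Min = -Max - 1;
  return int32_t(X < Min ? Min : X > Max ? Max : X);
}

static uint32_t usat(int64_t X, unsigned N) {       // N: 0..31
  assert(N <= 31);
  int64_t Max = (int64_t(1) << N) - 1;
  return uint32_t(X < 0 ? 0 : X > Max ? Max : X);
}

int main() {
  assert(ssat(1000, 8) == 127 && ssat(-1000, 8) == -128);
  assert(usat(1000, 8) == 255 && usat(-5, 8) == 0);
  assert(usat(7, 0) == 0);   // a 0-bit unsigned range saturates to 0
}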
Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b110; @@ -2679,7 +2682,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16), (sext_inreg rGPR:$Rm, i16)))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2692,7 +2695,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (opnode (sext_inreg rGPR:$Rn, i16), (sra rGPR:$Rm, (i32 16))))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2705,7 +2708,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)), (sext_inreg rGPR:$Rm, i16)))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2718,7 +2721,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm", [(set rGPR:$Rd, (opnode (sra rGPR:$Rn, (i32 16)), (sra rGPR:$Rm, (i32 16))))]>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2730,7 +2733,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { def WB : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -2742,7 +2745,7 @@ multiclass T2I_smul<string opc, PatFrag opnode> { def WT : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -2760,7 +2763,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16), (sext_inreg rGPR:$Rm, i16))))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2773,7 +2776,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16), (sra rGPR:$Rm, (i32 16)))))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2786,7 +2789,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)), (sext_inreg rGPR:$Rm, i16))))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2799,7 +2802,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra", [(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)), (sra rGPR:$Rm, (i32 
16)))))]>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b001; @@ -2811,7 +2814,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -2823,7 +2826,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> { (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> { + Requires<[IsThumb2, HasDSP, UseMulOps]> { let Inst{31-27} = 0b11111; let Inst{26-23} = 0b0110; let Inst{22-20} = 0b011; @@ -2839,79 +2842,79 @@ defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; def t2SMLALBB : T2FourReg_mac<1, 0b100, 0b1000, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbb", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALBT : T2FourReg_mac<1, 0b100, 0b1001, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlalbt", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALTB : T2FourReg_mac<1, 0b100, 0b1010, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltb", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALTT : T2FourReg_mac<1, 0b100, 0b1011, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaltt", "\t$Ra, $Rd, $Rn, $Rm", [/* For disassembly only; pattern left blank */]>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; // Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD def t2SMUAD: T2ThreeReg_mac< 0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC32, "smuad", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{15-12} = 0b1111; } def t2SMUADX:T2ThreeReg_mac< 0, 0b010, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC32, "smuadx", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{15-12} = 0b1111; } def t2SMUSD: T2ThreeReg_mac< 0, 0b100, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC32, "smusd", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{15-12} = 0b1111; } def t2SMUSDX:T2ThreeReg_mac< 0, 0b100, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC32, "smusdx", "\t$Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]> { + Requires<[IsThumb2, HasDSP]> { let Inst{15-12} = 0b1111; } def t2SMLAD : T2FourReg_mac< 0, 0b010, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlad", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLADX : T2FourReg_mac< 0, 0b010, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smladx", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLSD : T2FourReg_mac<0, 
0b100, 0b0000, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsd", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLSDX : T2FourReg_mac<0, 0b100, 0b0001, (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32, "smlsdx", "\t$Rd, $Rn, $Rm, $Ra", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALD : T2FourReg_mac<1, 0b100, 0b1100, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64, "smlald", "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLALDX : T2FourReg_mac<1, 0b100, 0b1101, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlaldx", "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLSLD : T2FourReg_mac<1, 0b101, 0b1100, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rn,rGPR:$Rm), IIC_iMAC64, "smlsld", "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd), (ins rGPR:$Rm,rGPR:$Rn), IIC_iMAC64, "smlsldx", "\t$Ra, $Rd, $Rn, $Rm", []>, - Requires<[IsThumb2, HasThumb2DSP]>; + Requires<[IsThumb2, HasDSP]>; //===----------------------------------------------------------------------===// // Division Instructions. @@ -2961,7 +2964,7 @@ def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, "rbit", "\t$Rd, $Rm", - [(set rGPR:$Rd, (ARMrbit rGPR:$Rm))]>, + [(set rGPR:$Rd, (bitreverse rGPR:$Rm))]>, Sched<[WriteALU]>; def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr, diff --git a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td index e83f8c8..050cd1a 100644 --- a/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/contrib/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -20,7 +20,6 @@ def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>; def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>; def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>; - //===----------------------------------------------------------------------===// // Operand Definitions. // @@ -93,7 +92,7 @@ def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$Dd), (ins addrmode5:$addr), def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr), IIC_fpLoad32, "vldr", "\t$Sd, $addr", - [(set SPR:$Sd, (load addrmode5:$addr))]> { + [(set SPR:$Sd, (alignedload32 addrmode5:$addr))]> { // Some single precision VFP instructions may be executed on both NEON and VFP // pipelines. let D = VFPNeonDomain; @@ -107,7 +106,7 @@ def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr), def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr), IIC_fpStore32, "vstr", "\t$Sd, $addr", - [(store SPR:$Sd, addrmode5:$addr)]> { + [(alignedstore32 SPR:$Sd, addrmode5:$addr)]> { // Some single precision VFP instructions may be executed on both NEON and VFP // pipelines. let D = VFPNeonDomain; @@ -393,8 +392,8 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> { } } -defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, ARMvmaxnm>; -defm VMINNM : vmaxmin_inst<"vminnm", 1, ARMvminnm>; +defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, fmaxnum>; +defm VMINNM : vmaxmin_inst<"vminnm", 1, fminnum>; // Match reassociated forms only if not sign dependent rounding. 
def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)), @@ -541,19 +540,23 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm, // FIXME: Verify encoding after integrated assembler is working. def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm), /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm", - [/* For disassembly only; pattern left blank */]>; + [/* For disassembly only; pattern left blank */]>, + Requires<[HasFP16]>; def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs DPR:$Dd), (ins SPR:$Sm), @@ -922,6 +925,22 @@ def VMOVDRR : AVConv5I<0b11000100, 0b1011, let isRegSequence = 1; } +// Hoist an fabs or a fneg of a value coming from integer registers +// and do the fabs/fneg on the integer value. This is never a lose +// and could enable the conversion to float to be removed completely. +def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (BFC GPR:$Rh, (i32 0x7FFFFFFF)))>, + Requires<[IsARM]>; +def : Pat<(fabs (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (t2BFC GPR:$Rh, (i32 0x7FFFFFFF)))>, + Requires<[IsThumb2]>; +def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (EORri GPR:$Rh, (i32 0x80000000)))>, + Requires<[IsARM]>; +def : Pat<(fneg (arm_fmdrr GPR:$Rl, GPR:$Rh)), + (VMOVDRR GPR:$Rl, (t2EORri GPR:$Rh, (i32 0x80000000)))>, + Requires<[IsThumb2]>; + let hasSideEffects = 0 in def VMOVSRR : AVConv5I<0b11000100, 0b1010, (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2), @@ -1003,7 +1022,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { def : VFPPat<(f64 (sint_to_fp GPR:$a)), (VSITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (sint_to_fp (i32 (load addrmode5:$a)))), + def : VFPPat<(f64 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOD (VLDRS addrmode5:$a))>; } @@ -1021,7 +1040,7 @@ def VSITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, def : VFPNoNEONPat<(f32 (sint_to_fp GPR:$a)), (VSITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (load addrmode5:$a)))), +def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VSITOS (VLDRS addrmode5:$a))>; def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011, @@ -1035,7 +1054,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { def : VFPPat<(f64 (uint_to_fp GPR:$a)), (VUITOD (COPY_TO_REGCLASS GPR:$a, SPR))>; - def : VFPPat<(f64 (uint_to_fp (i32 (load addrmode5:$a)))), + def : VFPPat<(f64 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOD (VLDRS addrmode5:$a))>; } @@ -1053,7 +1072,7 @@ def VUITOS : AVConv1InSs_Encode<0b11101, 0b11, 0b1000, 0b1010, def : VFPNoNEONPat<(f32 (uint_to_fp GPR:$a)), (VUITOS (COPY_TO_REGCLASS GPR:$a, SPR))>; -def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (load addrmode5:$a)))), +def : 
VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))), (VUITOS (VLDRS addrmode5:$a))>; // FP -> Int: @@ -1106,7 +1125,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { def : VFPPat<(i32 (fp_to_sint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOSIZD DPR:$a), GPR)>; - def : VFPPat<(store (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (fp_to_sint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOSIZD DPR:$a), addrmode5:$ptr)>; } @@ -1124,7 +1143,8 @@ def VTOSIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1101, 0b1010, def : VFPNoNEONPat<(i32 (fp_to_sint SPR:$a)), (COPY_TO_REGCLASS (VTOSIZS SPR:$a), GPR)>; -def : VFPNoNEONPat<(store (i32 (fp_to_sint (f32 SPR:$a))), addrmode5:$ptr), +def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))), + addrmode5:$ptr), (VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>; def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011, @@ -1138,7 +1158,7 @@ let Predicates=[HasVFP2, HasDPVFP] in { def : VFPPat<(i32 (fp_to_uint (f64 DPR:$a))), (COPY_TO_REGCLASS (VTOUIZD DPR:$a), GPR)>; - def : VFPPat<(store (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), + def : VFPPat<(alignedstore32 (i32 (fp_to_uint (f64 DPR:$a))), addrmode5:$ptr), (VSTRS (VTOUIZD DPR:$a), addrmode5:$ptr)>; } @@ -1156,7 +1176,8 @@ def VTOUIZS : AVConv1InsS_Encode<0b11101, 0b11, 0b1100, 0b1010, def : VFPNoNEONPat<(i32 (fp_to_uint SPR:$a)), (COPY_TO_REGCLASS (VTOUIZS SPR:$a), GPR)>; -def : VFPNoNEONPat<(store (i32 (fp_to_uint (f32 SPR:$a))), addrmode5:$ptr), +def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))), + addrmode5:$ptr), (VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>; // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR. diff --git a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 265b86f..725b838 100644 --- a/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -60,17 +60,24 @@ STATISTIC(NumSTRD2STM, "Number of strd instructions turned back into stm"); STATISTIC(NumLDRD2LDR, "Number of ldrd instructions turned back into ldr's"); STATISTIC(NumSTRD2STR, "Number of strd instructions turned back into str's"); +namespace llvm { +void initializeARMLoadStoreOptPass(PassRegistry &); +} + +#define ARM_LOAD_STORE_OPT_NAME "ARM load / store optimization pass" + namespace { /// Post- register allocation pass the combine load / store instructions to /// form ldm / stm instructions. 
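A few hunks back, ARMInstrVFP.td gained patterns that hoist fabs/fneg of a VMOVDRR-built double onto the integer half before the transfer: fabs clears bit 31 of the high word (the BFC/t2BFC forms) and fneg toggles it (the EOR/t2EORri forms). The sketch below only demonstrates the bit-level identity on a host machine, assuming IEEE-754 doubles and a little-endian word layout; it is not part of the commit.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Rebuild a double from its low/high 32-bit halves (the VMOVDRR operands).
static double fromHalves(uint32_t Lo, uint32_t Hi) {
  uint64_t Bits = (uint64_t(Hi) << 32) | Lo;
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}

int main() {
  double X = -3.5;
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  uint32_t Lo = uint32_t(Bits), Hi = uint32_t(Bits >> 32);

  // fabs: clear the sign bit of the high word (what the BFC pattern does).
  double Abs = fromHalves(Lo, Hi & 0x7FFFFFFFu);
  // fneg: toggle the sign bit of the high word (what the EOR pattern does).
  double Neg = fromHalves(Lo, Hi ^ 0x80000000u);

  std::printf("%f %f\n", Abs, Neg);            // 3.500000 3.500000
  return !(Abs == std::fabs(X) && Neg == -X);  // 0 on success
}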
struct ARMLoadStoreOpt : public MachineFunctionPass { static char ID; - ARMLoadStoreOpt() : MachineFunctionPass(ID) {} + ARMLoadStoreOpt() : MachineFunctionPass(ID) { + initializeARMLoadStoreOptPass(*PassRegistry::getPassRegistry()); + } const MachineFunction *MF; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; - const MachineRegisterInfo *MRI; const ARMSubtarget *STI; const TargetLowering *TL; ARMFunctionInfo *AFI; @@ -84,7 +91,7 @@ namespace { bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "ARM load / store optimization pass"; + return ARM_LOAD_STORE_OPT_NAME; } private: @@ -118,6 +125,7 @@ namespace { }; SpecificBumpPtrAllocator<MergeCandidate> Allocator; SmallVector<const MergeCandidate*,4> Candidates; + SmallVector<MachineInstr*,4> MergeBaseCandidates; void moveLiveRegsBefore(const MachineBasicBlock &MBB, MachineBasicBlock::const_iterator Before); @@ -140,12 +148,16 @@ namespace { MachineBasicBlock::iterator &MBBI); bool MergeBaseUpdateLoadStore(MachineInstr *MI); bool MergeBaseUpdateLSMultiple(MachineInstr *MI); + bool MergeBaseUpdateLSDouble(MachineInstr &MI) const; bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); bool MergeReturnIntoLDM(MachineBasicBlock &MBB); + bool CombineMovBx(MachineBasicBlock &MBB); }; char ARMLoadStoreOpt::ID = 0; } +INITIALIZE_PASS(ARMLoadStoreOpt, "arm-load-store-opt", ARM_LOAD_STORE_OPT_NAME, false, false) + static bool definesCPSR(const MachineInstr *MI) { for (const auto &MO : MI->operands()) { if (!MO.isReg()) @@ -619,9 +631,10 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB, unsigned NewBase; if (isi32Load(Opcode)) { - // If it is a load, then just use one of the destination register to - // use as the new base. + // If it is a load, then just use one of the destination registers + // as the new base. Will no longer be writeback in Thumb1. NewBase = Regs[NumRegs-1].first; + Writeback = false; } else { // Find a free register that we can use as scratch register. moveLiveRegsBefore(MBB, InsertBefore); @@ -725,9 +738,12 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(MachineBasicBlock &MBB, MachineInstrBuilder MIB; if (Writeback) { - if (Opcode == ARM::tLDMIA) + assert(isThumb1 && "expected Writeback only inThumb1"); + if (Opcode == ARM::tLDMIA) { + assert(!(ContainsReg(Regs, Base)) && "Thumb1 can't LDM ! with Base in Regs"); // Update tLDMIA with writeback if necessary. Opcode = ARM::tLDMIA_UPD; + } MIB = BuildMI(MBB, InsertBefore, DL, TII->get(Opcode)); @@ -784,6 +800,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { SmallVector<std::pair<unsigned, bool>, 8> Regs; SmallVector<unsigned, 4> ImpDefs; DenseSet<unsigned> KilledRegs; + DenseSet<unsigned> UsedRegs; // Determine list of registers and list of implicit super-register defs. 
for (const MachineInstr *MI : Cand.Instrs) { const MachineOperand &MO = getLoadStoreRegOp(*MI); @@ -792,6 +809,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { if (IsKill) KilledRegs.insert(Reg); Regs.push_back(std::make_pair(Reg, IsKill)); + UsedRegs.insert(Reg); if (IsLoad) { // Collect any implicit defs of super-registers, after merging we can't @@ -881,7 +899,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) { for (MachineOperand &MO : MI.uses()) { if (!MO.isReg() || !MO.isKill()) continue; - if (KilledRegs.count(MO.getReg())) + if (UsedRegs.count(MO.getReg())) MO.setIsKill(false); } } @@ -995,76 +1013,6 @@ void ARMLoadStoreOpt::FormCandidates(const MemOpQueue &MemOps) { } while (SIndex < EIndex); } -static bool isMatchingDecrement(MachineInstr *MI, unsigned Base, - unsigned Bytes, unsigned Limit, - ARMCC::CondCodes Pred, unsigned PredReg) { - unsigned MyPredReg = 0; - if (!MI) - return false; - - bool CheckCPSRDef = false; - switch (MI->getOpcode()) { - default: return false; - case ARM::tSUBi8: - case ARM::t2SUBri: - case ARM::SUBri: - CheckCPSRDef = true; - break; - case ARM::tSUBspi: - break; - } - - // Make sure the offset fits in 8 bits. - if (Bytes == 0 || (Limit && Bytes >= Limit)) - return false; - - unsigned Scale = (MI->getOpcode() == ARM::tSUBspi || - MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME - if (!(MI->getOperand(0).getReg() == Base && - MI->getOperand(1).getReg() == Base && - (MI->getOperand(2).getImm() * Scale) == Bytes && - getInstrPredicate(MI, MyPredReg) == Pred && - MyPredReg == PredReg)) - return false; - - return CheckCPSRDef ? !definesCPSR(MI) : true; -} - -static bool isMatchingIncrement(MachineInstr *MI, unsigned Base, - unsigned Bytes, unsigned Limit, - ARMCC::CondCodes Pred, unsigned PredReg) { - unsigned MyPredReg = 0; - if (!MI) - return false; - - bool CheckCPSRDef = false; - switch (MI->getOpcode()) { - default: return false; - case ARM::tADDi8: - case ARM::t2ADDri: - case ARM::ADDri: - CheckCPSRDef = true; - break; - case ARM::tADDspi: - break; - } - - if (Bytes == 0 || (Limit && Bytes >= Limit)) - // Make sure the offset fits in 8 bits. - return false; - - unsigned Scale = (MI->getOpcode() == ARM::tADDspi || - MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME - if (!(MI->getOperand(0).getReg() == Base && - MI->getOperand(1).getReg() == Base && - (MI->getOperand(2).getImm() * Scale) == Bytes && - getInstrPredicate(MI, MyPredReg) == Pred && - MyPredReg == PredReg)) - return false; - - return CheckCPSRDef ? !definesCPSR(MI) : true; -} - static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, ARM_AM::AMSubMode Mode) { switch (Opc) { @@ -1132,6 +1080,75 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, } } +/// Check if the given instruction increments or decrements a register and +/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags +/// generated by the instruction are possibly read as well. 
+static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg, + ARMCC::CondCodes Pred, unsigned PredReg) { + bool CheckCPSRDef; + int Scale; + switch (MI.getOpcode()) { + case ARM::tADDi8: Scale = 4; CheckCPSRDef = true; break; + case ARM::tSUBi8: Scale = -4; CheckCPSRDef = true; break; + case ARM::t2SUBri: + case ARM::SUBri: Scale = -1; CheckCPSRDef = true; break; + case ARM::t2ADDri: + case ARM::ADDri: Scale = 1; CheckCPSRDef = true; break; + case ARM::tADDspi: Scale = 4; CheckCPSRDef = false; break; + case ARM::tSUBspi: Scale = -4; CheckCPSRDef = false; break; + default: return 0; + } + + unsigned MIPredReg; + if (MI.getOperand(0).getReg() != Reg || + MI.getOperand(1).getReg() != Reg || + getInstrPredicate(&MI, MIPredReg) != Pred || + MIPredReg != PredReg) + return 0; + + if (CheckCPSRDef && definesCPSR(&MI)) + return 0; + return MI.getOperand(2).getImm() * Scale; +} + +/// Searches for an increment or decrement of \p Reg before \p MBBI. +static MachineBasicBlock::iterator +findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg, + ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { + Offset = 0; + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineBasicBlock::iterator BeginMBBI = MBB.begin(); + MachineBasicBlock::iterator EndMBBI = MBB.end(); + if (MBBI == BeginMBBI) + return EndMBBI; + + // Skip debug values. + MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); + while (PrevMBBI->isDebugValue() && PrevMBBI != BeginMBBI) + --PrevMBBI; + + Offset = isIncrementOrDecrement(*PrevMBBI, Reg, Pred, PredReg); + return Offset == 0 ? EndMBBI : PrevMBBI; +} + +/// Searches for a increment or decrement of \p Reg after \p MBBI. +static MachineBasicBlock::iterator +findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg, + ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) { + Offset = 0; + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineBasicBlock::iterator EndMBBI = MBB.end(); + MachineBasicBlock::iterator NextMBBI = std::next(MBBI); + // Skip debug values. + while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) + ++NextMBBI; + if (NextMBBI == EndMBBI) + return EndMBBI; + + Offset = isIncrementOrDecrement(*NextMBBI, Reg, Pred, PredReg); + return Offset == 0 ? EndMBBI : NextMBBI; +} + /// Fold proceeding/trailing inc/dec of base register into the /// LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible: /// @@ -1151,7 +1168,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { const MachineOperand &BaseOP = MI->getOperand(0); unsigned Base = BaseOP.getReg(); bool BaseKill = BaseOP.isKill(); - unsigned Bytes = getLSMultipleTransferSize(MI); unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); unsigned Opcode = MI->getOpcode(); @@ -1163,49 +1179,24 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) { if (MI->getOperand(i).getReg() == Base) return false; - bool DoMerge = false; - ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode); - - // Try merging with the previous instruction. 
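The new isIncrementOrDecrement/findIncDecBefore/findIncDecAfter helpers above collapse the old isMatchingIncrement/isMatchingDecrement pair into one routine that returns a signed byte adjustment. A toy version over a made-up instruction record captures the idea; the names and the three-entry block are purely illustrative.

#include <cstdio>
#include <cstring>
#include <vector>

struct ToyInst { const char *Op; int Dst, Src, Imm; };

// One signed result replaces the old increment/decrement pair:
// > 0 means "adds Imm to Reg", < 0 "subtracts", 0 "not a simple base update".
static int incOrDec(const ToyInst &I, int Reg) {
  int Sign;
  if (std::strcmp(I.Op, "ADD") == 0)      Sign = 1;
  else if (std::strcmp(I.Op, "SUB") == 0) Sign = -1;
  else                                    return 0;
  if (I.Dst != Reg || I.Src != Reg)       return 0;
  return Sign * I.Imm;
}

int main() {
  // SUB sp, sp, #16 ; <load/store multiple on sp> ; ADD sp, sp, #16
  std::vector<ToyInst> MBB = {{"SUB", 13, 13, 16}, {"LDM", 13, 13, 0},
                              {"ADD", 13, 13, 16}};
  std::printf("before=%d after=%d\n",
              incOrDec(MBB[0], 13), incOrDec(MBB[2], 13));  // before=-16 after=16
}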
+ int Bytes = getLSMultipleTransferSize(MI); MachineBasicBlock &MBB = *MI->getParent(); - MachineBasicBlock::iterator BeginMBBI = MBB.begin(); MachineBasicBlock::iterator MBBI(MI); - if (MBBI != BeginMBBI) { - MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); - while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) - --PrevMBBI; - if (Mode == ARM_AM::ia && - isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { - Mode = ARM_AM::db; - DoMerge = true; - } else if (Mode == ARM_AM::ib && - isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) { - Mode = ARM_AM::da; - DoMerge = true; - } - if (DoMerge) - MBB.erase(PrevMBBI); - } - - // Try merging with the next instruction. - MachineBasicBlock::iterator EndMBBI = MBB.end(); - if (!DoMerge && MBBI != EndMBBI) { - MachineBasicBlock::iterator NextMBBI = std::next(MBBI); - while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) - ++NextMBBI; - if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) && - isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { - DoMerge = true; - } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) && - isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) { - DoMerge = true; - } - if (DoMerge) - MBB.erase(NextMBBI); + int Offset; + MachineBasicBlock::iterator MergeInstr + = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset); + ARM_AM::AMSubMode Mode = getLoadStoreMultipleSubMode(Opcode); + if (Mode == ARM_AM::ia && Offset == -Bytes) { + Mode = ARM_AM::db; + } else if (Mode == ARM_AM::ib && Offset == -Bytes) { + Mode = ARM_AM::da; + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (((Mode != ARM_AM::ia && Mode != ARM_AM::ib) || Offset != Bytes) && + ((Mode != ARM_AM::da && Mode != ARM_AM::db) || Offset != -Bytes)) + return false; } - - if (!DoMerge) - return false; + MBB.erase(MergeInstr); unsigned NewOpc = getUpdatingLSMultipleOpcode(Opcode, Mode); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) @@ -1283,7 +1274,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { unsigned Base = getLoadStoreBaseOp(*MI).getReg(); bool BaseKill = getLoadStoreBaseOp(*MI).isKill(); - unsigned Bytes = getLSMultipleTransferSize(MI); unsigned Opcode = MI->getOpcode(); DebugLoc DL = MI->getDebugLoc(); bool isAM5 = (Opcode == ARM::VLDRD || Opcode == ARM::VLDRS || @@ -1295,7 +1285,6 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0) return false; - bool isLd = isLoadSingle(Opcode); // Can't do the merge if the destination register is the same as the would-be // writeback register. if (MI->getOperand(0).getReg() == Base) @@ -1303,55 +1292,31 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { unsigned PredReg = 0; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); - bool DoMerge = false; - ARM_AM::AddrOpc AddSub = ARM_AM::add; - unsigned NewOpc = 0; - // AM2 - 12 bits, thumb2 - 8 bits. - unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100); - - // Try merging with the previous instruction. 
+ int Bytes = getLSMultipleTransferSize(MI); MachineBasicBlock &MBB = *MI->getParent(); - MachineBasicBlock::iterator BeginMBBI = MBB.begin(); MachineBasicBlock::iterator MBBI(MI); - if (MBBI != BeginMBBI) { - MachineBasicBlock::iterator PrevMBBI = std::prev(MBBI); - while (PrevMBBI != BeginMBBI && PrevMBBI->isDebugValue()) - --PrevMBBI; - if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) { - DoMerge = true; - AddSub = ARM_AM::sub; - } else if (!isAM5 && - isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) { - DoMerge = true; - } - if (DoMerge) { - NewOpc = getPreIndexedLoadStoreOpcode(Opcode, AddSub); - MBB.erase(PrevMBBI); - } - } - - // Try merging with the next instruction. - MachineBasicBlock::iterator EndMBBI = MBB.end(); - if (!DoMerge && MBBI != EndMBBI) { - MachineBasicBlock::iterator NextMBBI = std::next(MBBI); - while (NextMBBI != EndMBBI && NextMBBI->isDebugValue()) - ++NextMBBI; - if (!isAM5 && - isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) { - DoMerge = true; - AddSub = ARM_AM::sub; - } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) { - DoMerge = true; - } - if (DoMerge) { - NewOpc = getPostIndexedLoadStoreOpcode(Opcode, AddSub); - MBB.erase(NextMBBI); - } + int Offset; + MachineBasicBlock::iterator MergeInstr + = findIncDecBefore(MBBI, Base, Pred, PredReg, Offset); + unsigned NewOpc; + if (!isAM5 && Offset == Bytes) { + NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::add); + } else if (Offset == -Bytes) { + NewOpc = getPreIndexedLoadStoreOpcode(Opcode, ARM_AM::sub); + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (Offset == Bytes) { + NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::add); + } else if (!isAM5 && Offset == -Bytes) { + NewOpc = getPostIndexedLoadStoreOpcode(Opcode, ARM_AM::sub); + } else + return false; } + MBB.erase(MergeInstr); - if (!DoMerge) - return false; + ARM_AM::AddrOpc AddSub = Offset < 0 ? ARM_AM::sub : ARM_AM::add; + bool isLd = isLoadSingle(Opcode); if (isAM5) { // VLDM[SD]_UPD, VSTM[SD]_UPD // (There are no base-updating versions of VLDR/VSTR instructions, but the @@ -1368,18 +1333,16 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { if (isAM2) { // LDR_PRE, LDR_POST if (NewOpc == ARM::LDR_PRE_IMM || NewOpc == ARM::LDRB_PRE_IMM) { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg); } else { - int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) - .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); } } else { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2LDR_PRE, t2LDR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), MI->getOperand(0).getReg()) .addReg(Base, RegState::Define) @@ -1391,13 +1354,12 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { // the vestigal zero-reg offset register. When that's fixed, this clause // can be removed entirely. 
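The rewritten MergeBaseUpdateLoadStore above uses those helpers to choose between the pre-indexed form (base update found before the memory op) and the post-indexed form (update found after), folding only when the adjustment matches the transfer size. A simplified stand-alone sketch of that selection, with invented names and without the AM2/AM5 special cases, follows.

// Illustration of the fold (not the LLVM code):
//   add r1, r1, #4 ; ldr r0, [r1]     =>  ldr r0, [r1, #4]!   (pre-indexed)
//   ldr r0, [r1]   ; add r1, r1, #4   =>  ldr r0, [r1], #4    (post-indexed)
#include <cstdio>

enum class Fold { None, PreIndexed, PostIndexed };

// OffsetBefore/OffsetAfter stand in for findIncDecBefore/findIncDecAfter
// results: the signed base adjustment found around the memory op, or 0.
static Fold pickIndexedForm(int OffsetBefore, int OffsetAfter, int AccessBytes) {
  if (OffsetBefore == AccessBytes || OffsetBefore == -AccessBytes)
    return Fold::PreIndexed;
  if (OffsetAfter == AccessBytes || OffsetAfter == -AccessBytes)
    return Fold::PostIndexed;
  return Fold::None;
}

int main() {
  std::printf("%d %d %d\n",
              (int)pickIndexedForm(4, 0, 4),    // 1: pre-indexed
              (int)pickIndexedForm(0, 4, 4),    // 2: post-indexed
              (int)pickIndexedForm(0, 12, 4));  // 0: no fold
}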
if (isAM2 && NewOpc == ARM::STR_POST_IMM) { - int Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); + int Imm = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift); // STR_PRE, STR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) - .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg); + .addReg(Base).addReg(0).addImm(Imm).addImm(Pred).addReg(PredReg); } else { - int Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes; // t2STR_PRE, t2STR_POST BuildMI(MBB, MBBI, DL, TII->get(NewOpc), Base) .addReg(MO.getReg(), getKillRegState(MO.isKill())) @@ -1409,46 +1371,75 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) { return true; } -/// Returns true if instruction is a memory operation that this pass is capable -/// of operating on. -static bool isMemoryOp(const MachineInstr *MI) { - // When no memory operands are present, conservatively assume unaligned, - // volatile, unfoldable. - if (!MI->hasOneMemOperand()) +bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert((Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) && + "Must have t2STRDi8 or t2LDRDi8"); + if (MI.getOperand(3).getImm() != 0) return false; - const MachineMemOperand *MMO = *MI->memoperands_begin(); - - // Don't touch volatile memory accesses - we may be changing their order. - if (MMO->isVolatile()) + // Behaviour for writeback is undefined if base register is the same as one + // of the others. + const MachineOperand &BaseOp = MI.getOperand(2); + unsigned Base = BaseOp.getReg(); + const MachineOperand &Reg0Op = MI.getOperand(0); + const MachineOperand &Reg1Op = MI.getOperand(1); + if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base) return false; - // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is - // not. - if (MMO->getAlignment() < 4) - return false; + unsigned PredReg; + ARMCC::CondCodes Pred = getInstrPredicate(&MI, PredReg); + MachineBasicBlock::iterator MBBI(MI); + MachineBasicBlock &MBB = *MI.getParent(); + int Offset; + MachineBasicBlock::iterator MergeInstr = findIncDecBefore(MBBI, Base, Pred, + PredReg, Offset); + unsigned NewOpc; + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_PRE : ARM::t2STRD_PRE; + } else { + MergeInstr = findIncDecAfter(MBBI, Base, Pred, PredReg, Offset); + if (Offset == 8 || Offset == -8) { + NewOpc = Opcode == ARM::t2LDRDi8 ? ARM::t2LDRD_POST : ARM::t2STRD_POST; + } else + return false; + } + MBB.erase(MergeInstr); - // str <undef> could probably be eliminated entirely, but for now we just want - // to avoid making a mess of it. - // FIXME: Use str <undef> as a wildcard to enable better stm folding. 
- if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() && - MI->getOperand(0).isUndef()) - return false; + DebugLoc DL = MI.getDebugLoc(); + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); + if (NewOpc == ARM::t2LDRD_PRE || NewOpc == ARM::t2LDRD_POST) { + MIB.addOperand(Reg0Op).addOperand(Reg1Op) + .addReg(BaseOp.getReg(), RegState::Define); + } else { + assert(NewOpc == ARM::t2STRD_PRE || NewOpc == ARM::t2STRD_POST); + MIB.addReg(BaseOp.getReg(), RegState::Define) + .addOperand(Reg0Op).addOperand(Reg1Op); + } + MIB.addReg(BaseOp.getReg(), RegState::Kill) + .addImm(Offset).addImm(Pred).addReg(PredReg); + assert(TII->get(Opcode).getNumOperands() == 6 && + TII->get(NewOpc).getNumOperands() == 7 && + "Unexpected number of operands in Opcode specification."); - // Likewise don't mess with references to undefined addresses. - if (MI->getNumOperands() > 1 && MI->getOperand(1).isReg() && - MI->getOperand(1).isUndef()) - return false; + // Transfer implicit operands. + for (const MachineOperand &MO : MI.implicit_operands()) + MIB.addOperand(MO); + MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - unsigned Opcode = MI->getOpcode(); + MBB.erase(MBBI); + return true; +} + +/// Returns true if instruction is a memory operation that this pass is capable +/// of operating on. +static bool isMemoryOp(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); switch (Opcode) { - default: break; case ARM::VLDRS: case ARM::VSTRS: - return MI->getOperand(1).isReg(); case ARM::VLDRD: case ARM::VSTRD: - return MI->getOperand(1).isReg(); case ARM::LDRi12: case ARM::STRi12: case ARM::tLDRi: @@ -1459,9 +1450,40 @@ static bool isMemoryOp(const MachineInstr *MI) { case ARM::t2LDRi12: case ARM::t2STRi8: case ARM::t2STRi12: - return MI->getOperand(1).isReg(); + break; + default: + return false; } - return false; + if (!MI.getOperand(1).isReg()) + return false; + + // When no memory operands are present, conservatively assume unaligned, + // volatile, unfoldable. + if (!MI.hasOneMemOperand()) + return false; + + const MachineMemOperand &MMO = **MI.memoperands_begin(); + + // Don't touch volatile memory accesses - we may be changing their order. + if (MMO.isVolatile()) + return false; + + // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is + // not. + if (MMO.getAlignment() < 4) + return false; + + // str <undef> could probably be eliminated entirely, but for now we just want + // to avoid making a mess of it. + // FIXME: Use str <undef> as a wildcard to enable better stm folding. + if (MI.getOperand(0).isReg() && MI.getOperand(0).isUndef()) + return false; + + // Likewise don't mess with references to undefined addresses. 
+ if (MI.getOperand(1).isUndef()) + return false; + + return true; } static void InsertLDR_STR(MachineBasicBlock &MBB, @@ -1616,6 +1638,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { ARMCC::CondCodes CurrPred = ARMCC::AL; unsigned Position = 0; assert(Candidates.size() == 0); + assert(MergeBaseCandidates.size() == 0); LiveRegsValid = false; for (MachineBasicBlock::iterator I = MBB.end(), MBBI; I != MBB.begin(); @@ -1626,7 +1649,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { continue; ++Position; - if (isMemoryOp(MBBI)) { + if (isMemoryOp(*MBBI)) { unsigned Opcode = MBBI->getOpcode(); const MachineOperand &MO = MBBI->getOperand(0); unsigned Reg = MO.getReg(); @@ -1694,8 +1717,15 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { MBBI = I; --Position; // Fallthrough to look into existing chain. - } else if (MBBI->isDebugValue()) + } else if (MBBI->isDebugValue()) { continue; + } else if (MBBI->getOpcode() == ARM::t2LDRDi8 || + MBBI->getOpcode() == ARM::t2STRDi8) { + // ARMPreAllocLoadStoreOpt has already formed some LDRD/STRD instructions + // remember them because we may still be able to merge add/sub into them. + MergeBaseCandidates.push_back(MBBI); + } + // If we are here then the chain is broken; Extract candidates for a merge. if (MemOps.size() > 0) { @@ -1726,7 +1756,9 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { if (Merged) { Changed = true; unsigned Opcode = Merged->getOpcode(); - if (Opcode != ARM::t2STRDi8 && Opcode != ARM::t2LDRDi8) + if (Opcode == ARM::t2STRDi8 || Opcode == ARM::t2LDRDi8) + MergeBaseUpdateLSDouble(*Merged); + else MergeBaseUpdateLSMultiple(Merged); } else { for (MachineInstr *MI : Candidate->Instrs) { @@ -1741,6 +1773,10 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { } } Candidates.clear(); + // Try to fold add/sub into the LDRD/STRD formed by ARMPreAllocLoadStoreOpt. + for (MachineInstr *MI : MergeBaseCandidates) + MergeBaseUpdateLSDouble(*MI); + MergeBaseCandidates.clear(); return Changed; } @@ -1765,7 +1801,11 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET || MBBI->getOpcode() == ARM::MOVPCLR)) { - MachineInstr *PrevMI = std::prev(MBBI); + MachineBasicBlock::iterator PrevI = std::prev(MBBI); + // Ignore any DBG_VALUE instructions. 
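The reworked isMemoryOp above first switches on the opcode and then applies the same safety checks as before: a single known memory operand, not volatile, at least word aligned, and no undef value or address operands. A toy filter over a hand-rolled descriptor (illustrative only, not the MachineInstr API) summarizes the gate:

#include <cstdio>

struct ToyMemOp {
  bool HasOneMemOperand;
  bool Volatile;
  unsigned Alignment;   // in bytes
  bool RegBase;         // operand 1 is a plain register base
  bool UndefOperand;    // value or address register is <undef>
};

static bool isMergeableMemOp(const ToyMemOp &M) {
  if (!M.RegBase)            return false;
  if (!M.HasOneMemOperand)   return false;  // unknown: assume the worst
  if (M.Volatile)            return false;  // never reorder volatile accesses
  if (M.Alignment < 4)       return false;  // ldm/stm need word alignment
  if (M.UndefOperand)        return false;  // don't touch <undef> uses
  return true;
}

int main() {
  ToyMemOp Ok{true, false, 4, true, false};
  ToyMemOp Unaligned{true, false, 1, true, false};
  std::printf("%d %d\n", isMergeableMemOp(Ok), isMergeableMemOp(Unaligned)); // 1 0
}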
+ while (PrevI->isDebugValue() && PrevI != MBB.begin()) + --PrevI; + MachineInstr *PrevMI = PrevI; unsigned Opcode = PrevMI->getOpcode(); if (Opcode == ARM::LDMIA_UPD || Opcode == ARM::LDMDA_UPD || Opcode == ARM::LDMDB_UPD || Opcode == ARM::LDMIB_UPD || @@ -1786,6 +1826,30 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { return false; } +bool ARMLoadStoreOpt::CombineMovBx(MachineBasicBlock &MBB) { + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + if (MBBI == MBB.begin() || MBBI == MBB.end() || + MBBI->getOpcode() != ARM::tBX_RET) + return false; + + MachineBasicBlock::iterator Prev = MBBI; + --Prev; + if (Prev->getOpcode() != ARM::tMOVr || !Prev->definesRegister(ARM::LR)) + return false; + + for (auto Use : Prev->uses()) + if (Use.isKill()) { + AddDefaultPred(BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::tBX)) + .addReg(Use.getReg(), RegState::Kill)) + .copyImplicitOps(&*MBBI); + MBB.erase(MBBI); + MBB.erase(Prev); + return true; + } + + llvm_unreachable("tMOVr doesn't kill a reg before tBX_RET?"); +} + bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { MF = &Fn; STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget()); @@ -1793,7 +1857,7 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { AFI = Fn.getInfo<ARMFunctionInfo>(); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); - MRI = &Fn.getRegInfo(); + RegClassInfoValid = false; isThumb2 = AFI->isThumb2Function(); isThumb1 = AFI->isThumbFunction() && !isThumb2; @@ -1805,18 +1869,29 @@ bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { Modified |= LoadStoreMultipleOpti(MBB); if (STI->hasV5TOps()) Modified |= MergeReturnIntoLDM(MBB); + if (isThumb1) + Modified |= CombineMovBx(MBB); } Allocator.DestroyAll(); return Modified; } +namespace llvm { +void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); +} + +#define ARM_PREALLOC_LOAD_STORE_OPT_NAME \ + "ARM pre- register allocation load / store optimization pass" + namespace { /// Pre- register allocation pass that move load / stores from consecutive /// locations close to make it more likely they will be combined later. 
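CombineMovBx, added just above for Thumb1, rewrites a trailing "mov lr, rN" followed by "bx lr" into a single "bx rN" when rN is killed by the mov. The textual toy below shows the shape of the peephole; it operates on strings for illustration only, while the real pass works on MachineInstrs and checks the kill flag.

#include <cstdio>
#include <string>
#include <vector>

static void combineMovBx(std::vector<std::string> &MBB) {
  if (MBB.size() < 2 || MBB.back() != "bx lr")
    return;
  std::string &Prev = MBB[MBB.size() - 2];
  if (Prev.rfind("mov lr, ", 0) != 0)        // must start with "mov lr, "
    return;
  std::string Src = Prev.substr(8);          // e.g. "r4"
  MBB.pop_back();                            // drop "bx lr"
  MBB.back() = "bx " + Src;                  // replace the mov with "bx rN"
}

int main() {
  std::vector<std::string> MBB = {"pop {r4}", "mov lr, r4", "bx lr"};
  combineMovBx(MBB);
  for (const auto &I : MBB)
    std::printf("%s\n", I.c_str());          // pop {r4} / bx r4
}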
struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{ static char ID; - ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {} + ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) { + initializeARMPreAllocLoadStoreOptPass(*PassRegistry::getPassRegistry()); + } const DataLayout *TD; const TargetInstrInfo *TII; @@ -1828,7 +1903,7 @@ namespace { bool runOnMachineFunction(MachineFunction &Fn) override; const char *getPassName() const override { - return "ARM pre- register allocation load / store optimization pass"; + return ARM_PREALLOC_LOAD_STORE_OPT_NAME; } private: @@ -1847,8 +1922,11 @@ namespace { char ARMPreAllocLoadStoreOpt::ID = 0; } +INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-load-store-opt", + ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false) + bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { - TD = Fn.getTarget().getDataLayout(); + TD = &Fn.getDataLayout(); STI = &static_cast<const ARMSubtarget &>(Fn.getSubtarget()); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); @@ -1856,9 +1934,8 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { MF = &Fn; bool Modified = false; - for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; - ++MFI) - Modified |= RescheduleLoadStoreInstrs(MFI); + for (MachineBasicBlock &MFI : Fn) + Modified |= RescheduleLoadStoreInstrs(&MFI); return Modified; } @@ -2187,7 +2264,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) { if (!MI->isDebugValue()) MI2LocMap[MI] = ++Loc; - if (!isMemoryOp(MI)) + if (!isMemoryOp(*MI)) continue; unsigned PredReg = 0; if (getInstrPredicate(MI, PredReg) != ARMCC::AL) @@ -2275,3 +2352,4 @@ FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) { return new ARMPreAllocLoadStoreOpt(); return new ARMLoadStoreOpt(); } + diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp index f5250ff..ac0330f 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- ARMMachineFuctionInfo.cpp - ARM machine function info -------------===// +//===-- ARMMachineFunctionInfo.cpp - ARM machine function info ------------===// // // The LLVM Compiler Infrastructure // @@ -20,5 +20,4 @@ ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF) RestoreSPFromFP(false), LRSpilledForFarJump(false), FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), - PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false), - GlobalBaseReg(0) {} + PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false) {} diff --git a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 14dd9ef..d644797 100644 --- a/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===-- ARMMachineFuctionInfo.h - ARM machine function info -----*- C++ -*-===// +//===-- ARMMachineFunctionInfo.h - ARM machine function info ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -52,7 +52,7 @@ class ARMFunctionInfo : public MachineFunctionInfo { unsigned ReturnRegsCount; /// HasStackFrame - True if this function has a stack frame. Set by - /// processFunctionBeforeCalleeSavedScan(). + /// determineCalleeSaves(). bool HasStackFrame; /// RestoreSPFromFP - True if epilogue should restore SP from FP. 
Set by @@ -110,11 +110,6 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// pass. DenseMap<unsigned, unsigned> CPEClones; - /// GlobalBaseReg - keeps track of the virtual register initialized for - /// use as the global base register. This is used for PIC in some PIC - /// relocation models. - unsigned GlobalBaseReg; - /// ArgumentStackSize - amount of bytes on stack consumed by the arguments /// being passed on the stack unsigned ArgumentStackSize; @@ -133,7 +128,7 @@ public: FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0), NumAlignedDPRCS2Regs(0), PICLabelUId(0), - VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {} + VarArgsFrameIndex(0), HasITBlocks(false) {} explicit ARMFunctionInfo(MachineFunction &MF); @@ -204,9 +199,6 @@ public: bool hasITBlocks() const { return HasITBlocks; } void setHasITBlocks(bool h) { HasITBlocks = h; } - unsigned getGlobalBaseReg() const { return GlobalBaseReg; } - void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; } - void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) { if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second) llvm_unreachable("Duplicate entries!"); diff --git a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td index 45cc9ea..02cbfb1 100644 --- a/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/contrib/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -266,12 +266,19 @@ def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> { } // Scalar single precision floating point register class.. -// FIXME: Allocation order changed to s0, s2, s4, ... as a quick hack to -// avoid partial-write dependencies on D registers (S registers are -// renamed as portions of D registers). -def SPR : RegisterClass<"ARM", [f32], 32, (add (decimate - (sequence "S%u", 0, 31), 2), - (sequence "S%u", 0, 31))>; +// FIXME: Allocation order changed to s0, s2, ... or s0, s4, ... as a quick hack +// to avoid partial-write dependencies on D or Q (depending on platform) +// registers (S registers are renamed as portions of D/Q registers). +def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> { + let AltOrders = [(add (decimate SPR, 2), SPR), + (add (decimate SPR, 4), + (decimate SPR, 2), + (decimate (rotl SPR, 1), 4), + (decimate (rotl SPR, 1), 2))]; + let AltOrderSelect = [{ + return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF); + }]; +} // Subset of SPR which can be used as a source of NEON scalars for 16-bit // operations @@ -281,25 +288,29 @@ def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>; // class. // ARM requires only word alignment for double. It's more performant if it // is double-word alignment though. -def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, +def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64, (sequence "D%u", 0, 31)> { - // Allocate non-VFP2 registers D16-D31 first. - let AltOrders = [(rotl DPR, 16)]; - let AltOrderSelect = [{ return 1; }]; + // Allocate non-VFP2 registers D16-D31 first, and prefer even registers on + // Darwin platforms. + let AltOrders = [(rotl DPR, 16), + (add (decimate (rotl DPR, 16), 2), (rotl DPR, 16))]; + let AltOrderSelect = [{ + return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF); + }]; } // Subset of DPR that are accessible with VFP2 (and so that also have // 32-bit SPR subregs). 
-def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, +def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64, (trunc DPR, 16)>; // Subset of DPR which can be used as a source of NEON scalars for 16-bit // operations -def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64, +def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64, (trunc DPR, 8)>; // Generic 128-bit vector register class. -def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, +def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128, (sequence "Q%u", 0, 15)> { // Allocate non-VFP2 aliases Q8-Q15 first. let AltOrders = [(rotl QPR, 8)]; diff --git a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td index b03d5ff..3ad7730 100644 --- a/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td +++ b/contrib/llvm/lib/Target/ARM/ARMScheduleSwift.td @@ -37,1050 +37,13 @@ def SW_FDIV : FuncUnit; // FIXME: Add preload instruction when it is documented. // FIXME: Model non-pipelined nature of FP div / sqrt unit. -def SwiftItineraries : ProcessorItineraries< - [SW_DIS0, SW_DIS1, SW_DIS2, SW_ALU0, SW_ALU1, SW_LS, SW_IDIV, SW_FDIV], [], [ - // - // Move instructions, unconditional - InstrItinData<IIC_iMOVi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMOVr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMOVix2 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2]>, - InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [3]>, - InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_LS]>], - [5]>, - // - // MVN instructions - InstrItinData<IIC_iMVNi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMVNr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMVNsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iMVNsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - // - // No operand cycles - InstrItinData<IIC_iALUx , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>]>, - // - // Binary Instructions that produce a result - InstrItinData<IIC_iALUi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iALUr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - InstrItinData<IIC_iALUsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1]>, - 
InstrItinData<IIC_iALUsir,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_iALUsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1]>, - // - // Bitwise Instructions that produce a result - InstrItinData<IIC_iBITi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iBITr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - InstrItinData<IIC_iBITsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_iBITsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1]>, - // - // Unary Instructions that produce a result - - // CLZ, RBIT, etc. - InstrItinData<IIC_iUNAr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - - // BFC, BFI, UBFX, SBFX - InstrItinData<IIC_iUNAsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1]>, - - // - // Zero and sign extension instructions - InstrItinData<IIC_iEXTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iEXTAr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - InstrItinData<IIC_iEXTAsr,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1, 1, 1]>, - // - // Compare instructions - InstrItinData<IIC_iCMPi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iCMPr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iCMPsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<2, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iCMPsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<2, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - // - // Test instructions - InstrItinData<IIC_iTSTi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iTSTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iTSTsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<2, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iTSTsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<2, [SW_ALU0, SW_ALU1]>], - [1, 1, 1]>, - // - // Move instructions, conditional - // FIXME: Correctly model the extra input dep on the destination. 
- InstrItinData<IIC_iCMOVi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - InstrItinData<IIC_iCMOVr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iCMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1, 1]>, - InstrItinData<IIC_iCMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_iCMOVix2, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2]>, - - // Integer multiply pipeline - // - InstrItinData<IIC_iMUL16 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [3, 1, 1]>, - InstrItinData<IIC_iMAC16 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [3, 1, 1, 1]>, - InstrItinData<IIC_iMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - InstrItinData<IIC_iMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1, 1]>, - InstrItinData<IIC_iMUL64 , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_ALU0], 3>, - InstrStage<1, [SW_ALU0]>], - [5, 5, 1, 1]>, - InstrItinData<IIC_iMAC64 , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [5, 6, 1, 1]>, - // - // Integer divide - InstrItinData<IIC_iDIV , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0], 0>, - InstrStage<14, [SW_IDIV]>], - [14, 1, 1]>, - - // Integer load pipeline - // FIXME: The timings are some rough approximations - // - // Immediate offset - InstrItinData<IIC_iLoad_i , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1]>, - InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1]>, - InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_LS]>], - [3, 4, 1]>, - // - // Register offset - InstrItinData<IIC_iLoad_r , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1, 1]>, - InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1, 1]>, - InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [3, 4, 1, 1]>, - // - // Scaled register offset - InstrItinData<IIC_iLoad_si , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS]>], - [5, 1, 1]>, - InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS]>], - [5, 1, 1]>, - // - // Immediate offset with update - InstrItinData<IIC_iLoad_iu , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [3, 1, 1]>, - 
InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [3, 1, 1]>, - // - // Register offset with update - InstrItinData<IIC_iLoad_ru , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_LS]>], - [3, 1, 1, 1]>, - InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0], 1>, - InstrStage<1, [SW_LS]>], - [3, 1, 1, 1]>, - InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [3, 4, 1, 1]>, - // - // Scaled register offset with update - InstrItinData<IIC_iLoad_siu , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [5, 3, 1, 1]>, - InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [5, 3, 1, 1]>, - // - // Load multiple, def is the 5th operand. - // FIXME: This assumes 3 to 4 registers. - InstrItinData<IIC_iLoad_m , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1, 3], [], -1>, // dynamic uops - - // - // Load multiple + update, defs are the 1st and 5th operands. - InstrItinData<IIC_iLoad_mu , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1, 3], [], -1>, // dynamic uops - // - // Load multiple plus branch - InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1, 3], [], -1>, // dynamic uops - // - // Pop, def is the 3rd operand. - InstrItinData<IIC_iPop , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 3], [], -1>, // dynamic uops - // - // Pop + branch, def is the 3rd operand. - InstrItinData<IIC_iPop_Br, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 3], [], -1>, // dynamic uops - - // - // iLoadi + iALUr for t2LDRpci_pic. 
- InstrItinData<IIC_iLoadiALU, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [4, 1]>, - - // Integer store pipeline - /// - // Immediate offset - InstrItinData<IIC_iStore_i , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - InstrItinData<IIC_iStore_d_i, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - // - // Register offset - InstrItinData<IIC_iStore_r , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - InstrItinData<IIC_iStore_d_r, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - // - // Scaled register offset - InstrItinData<IIC_iStore_si , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - // - // Immediate offset with update - InstrItinData<IIC_iStore_iu , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1]>, - // - // Register offset with update - InstrItinData<IIC_iStore_ru , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1]>, - InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1]>, - InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 1]>, - // - // Scaled register offset with update - InstrItinData<IIC_iStore_siu, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>], - [3, 1, 1, 1]>, - InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 2>, - InstrStage<1, [SW_LS], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>], - [3, 1, 1, 1]>, - // - // Store multiple - InstrItinData<IIC_iStore_m , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [], [], -1>, // dynamic uops - // - // Store multiple + update - InstrItinData<IIC_iStore_mu, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - 
InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS], 1>, - InstrStage<1, [SW_ALU0, SW_ALU1], 1>, - InstrStage<1, [SW_LS]>], - [2], [], -1>, // dynamic uops - - // - // Preload - InstrItinData<IIC_Preload, [InstrStage<1, [SW_DIS0], 0>], [1, 1]>, - - // Branch - // - // no delay slots, so the latency of a branch is unimportant - InstrItinData<IIC_Br , [InstrStage<1, [SW_DIS0], 0>]>, - - // FP Special Register to Integer Register File Move - InstrItinData<IIC_fpSTAT , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [1]>, - // - // Single-precision FP Unary - // - // Most floating-point moves get issued on ALU0. - InstrItinData<IIC_fpUNA32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - // - // Double-precision FP Unary - InstrItinData<IIC_fpUNA64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - - // - // Single-precision FP Compare - InstrItinData<IIC_fpCMP32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [1, 1]>, - // - // Double-precision FP Compare - InstrItinData<IIC_fpCMP64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [1, 1]>, - // - // Single to Double FP Convert - InstrItinData<IIC_fpCVTSD , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Double to Single FP Convert - InstrItinData<IIC_fpCVTDS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - - // - // Single to Half FP Convert - InstrItinData<IIC_fpCVTSH , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU1], 4>, - InstrStage<1, [SW_ALU1]>], - [6, 1]>, - // - // Half to Single FP Convert - InstrItinData<IIC_fpCVTHS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - - // - // Single-Precision FP to Integer Convert - InstrItinData<IIC_fpCVTSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Double-Precision FP to Integer Convert - InstrItinData<IIC_fpCVTDI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Integer to Single-Precision FP Convert - InstrItinData<IIC_fpCVTIS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Integer to Double-Precision FP Convert - InstrItinData<IIC_fpCVTID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1]>, - // - // Single-precision FP ALU - InstrItinData<IIC_fpALU32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-precision FP ALU - InstrItinData<IIC_fpALU64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Single-precision FP Multiply - InstrItinData<IIC_fpMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Double-precision FP Multiply - InstrItinData<IIC_fpMUL64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [6, 1, 1]>, - // - // Single-precision FP MAC - InstrItinData<IIC_fpMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-precision FP MAC - InstrItinData<IIC_fpMAC64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, 
- InstrStage<1, [SW_ALU1]>], - [12, 1, 1]>, - // - // Single-precision Fused FP MAC - InstrItinData<IIC_fpFMAC32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-precision Fused FP MAC - InstrItinData<IIC_fpFMAC64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [12, 1, 1]>, - // - // Single-precision FP DIV - InstrItinData<IIC_fpDIV32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 0>, - InstrStage<15, [SW_FDIV]>], - [17, 1, 1]>, - // - // Double-precision FP DIV - InstrItinData<IIC_fpDIV64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 0>, - InstrStage<30, [SW_FDIV]>], - [32, 1, 1]>, - // - // Single-precision FP SQRT - InstrItinData<IIC_fpSQRT32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 0>, - InstrStage<15, [SW_FDIV]>], - [17, 1]>, - // - // Double-precision FP SQRT - InstrItinData<IIC_fpSQRT64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 0>, - InstrStage<30, [SW_FDIV]>], - [32, 1, 1]>, - - // - // Integer to Single-precision Move - InstrItinData<IIC_fpMOVIS, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0]>], - [6, 1]>, - // - // Integer to Double-precision Move - InstrItinData<IIC_fpMOVID, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [4, 1]>, - // - // Single-precision to Integer Move - InstrItinData<IIC_fpMOVSI, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1]>, - // - // Double-precision to Integer Move - InstrItinData<IIC_fpMOVDI, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_LS]>], - [3, 4, 1]>, - // - // Single-precision FP Load - InstrItinData<IIC_fpLoad32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [4, 1]>, - // - // Double-precision FP Load - InstrItinData<IIC_fpLoad64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [4, 1]>, - // - // FP Load Multiple - // FIXME: Assumes a single Q register. - InstrItinData<IIC_fpLoad_m, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1, 1, 4], [], -1>, // dynamic uops - // - // FP Load Multiple + update - // FIXME: Assumes a single Q register. - InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1, 4], [], -1>, // dynamic uops - // - // Single-precision FP Store - InstrItinData<IIC_fpStore32,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - // - // Double-precision FP Store - InstrItinData<IIC_fpStore64,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1]>, - // - // FP Store Multiple - // FIXME: Assumes a single Q register. - InstrItinData<IIC_fpStore_m,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [1, 1, 1], [], -1>, // dynamic uops - // - // FP Store Multiple + update - // FIXME: Assumes a single Q register. 
- InstrItinData<IIC_fpStore_mu,[InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0, SW_ALU1]>], - [2, 1, 1, 1], [], -1>, // dynamic uops - // NEON - // - // Double-register Integer Unary - InstrItinData<IIC_VUNAiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1]>, - // - // Quad-register Integer Unary - InstrItinData<IIC_VUNAiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1]>, - // - // Double-register Integer Q-Unary - InstrItinData<IIC_VQUNAiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1]>, - // - // Quad-register Integer CountQ-Unary - InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1]>, - // - // Double-register Integer Binary - InstrItinData<IIC_VBINiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Quad-register Integer Binary - InstrItinData<IIC_VBINiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-register Integer Subtract - InstrItinData<IIC_VSUBiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Quad-register Integer Subtract - InstrItinData<IIC_VSUBiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-register Integer Shift - InstrItinData<IIC_VSHLiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Quad-register Integer Shift - InstrItinData<IIC_VSHLiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-register Integer Shift (4 cycle) - InstrItinData<IIC_VSHLi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register Integer Shift (4 cycle) - InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Double-register Integer Binary (4 cycle) - InstrItinData<IIC_VBINi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register Integer Binary (4 cycle) - InstrItinData<IIC_VBINi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Double-register Integer Subtract (4 cycle) - InstrItinData<IIC_VSUBi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register Integer Subtract (4 cycle) - InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - - // - // Double-register Integer Count - InstrItinData<IIC_VCNTiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Quad-register Integer Count - InstrItinData<IIC_VCNTiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1, 1]>, - // - // Double-register Absolute Difference and Accumulate - InstrItinData<IIC_VABAD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1, 1]>, - // - // Quad-register Absolute Difference and Accumulate - InstrItinData<IIC_VABAQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1, 1]>, - // - // Double-register Integer Pair Add Long - 
InstrItinData<IIC_VPALiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register Integer Pair Add Long - InstrItinData<IIC_VPALiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - - // - // Double-register Integer Multiply (.8, .16) - InstrItinData<IIC_VMULi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Quad-register Integer Multiply (.8, .16) - InstrItinData<IIC_VMULi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - - // - // Double-register Integer Multiply (.32) - InstrItinData<IIC_VMULi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Quad-register Integer Multiply (.32) - InstrItinData<IIC_VMULi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Double-register Integer Multiply-Accumulate (.8, .16) - InstrItinData<IIC_VMACi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1, 1]>, - // - // Double-register Integer Multiply-Accumulate (.32) - InstrItinData<IIC_VMACi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1, 1]>, - // - // Quad-register Integer Multiply-Accumulate (.8, .16) - InstrItinData<IIC_VMACi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1, 1]>, - // - // Quad-register Integer Multiply-Accumulate (.32) - InstrItinData<IIC_VMACi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1, 1]>, - - // - // Move - InstrItinData<IIC_VMOV, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - // - // Move Immediate - InstrItinData<IIC_VMOVImm, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2]>, - // - // Double-register Permute Move - InstrItinData<IIC_VMOVD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1]>, - // - // Quad-register Permute Move - InstrItinData<IIC_VMOVQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1]>, - // - // Integer to Single-precision Move - InstrItinData<IIC_VMOVIS , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0]>], - [6, 1]>, - // - // Integer to Double-precision Move - InstrItinData<IIC_VMOVID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [4, 1, 1]>, - // - // Single-precision to Integer Move - InstrItinData<IIC_VMOVSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_LS]>], - [3, 1]>, - // - // Double-precision to Integer Move - InstrItinData<IIC_VMOVDI , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 3>, - InstrStage<1, [SW_LS]>], - [3, 4, 1]>, - // - // Integer to Lane Move - // FIXME: I think this is correct, but it is not clear from the tuning guide. 
- InstrItinData<IIC_VMOVISL , [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_LS], 4>, - InstrStage<1, [SW_ALU0]>], - [6, 1]>, - - // - // Vector narrow move - InstrItinData<IIC_VMOVN, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1]>, - // - // Double-register FP Unary - // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here, - // and they issue on a different pipeline. - InstrItinData<IIC_VUNAD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - // - // Quad-register FP Unary - // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here, - // and they issue on a different pipeline. - InstrItinData<IIC_VUNAQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [2, 1]>, - // - // Double-register FP Binary - // FIXME: We're using this itin for many instructions. - InstrItinData<IIC_VBIND, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - - // - // VPADD, etc. - InstrItinData<IIC_VPBIND, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Double-register FP VMUL - InstrItinData<IIC_VFMULD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Quad-register FP Binary - InstrItinData<IIC_VBINQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU0]>], - [4, 1, 1]>, - // - // Quad-register FP VMUL - InstrItinData<IIC_VFMULQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 1]>, - // - // Double-register FP Multiple-Accumulate - InstrItinData<IIC_VMACD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Quad-register FP Multiple-Accumulate - InstrItinData<IIC_VMACQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-register Fused FP Multiple-Accumulate - InstrItinData<IIC_VFMACD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Quad-register FusedF P Multiple-Accumulate - InstrItinData<IIC_VFMACQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-register Reciprical Step - InstrItinData<IIC_VRECSD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Quad-register Reciprical Step - InstrItinData<IIC_VRECSQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 1]>, - // - // Double-register Permute - // FIXME: The latencies are unclear from the documentation. - InstrItinData<IIC_VPERMD, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [3, 4, 3, 4]>, - // - // Quad-register Permute - // FIXME: The latencies are unclear from the documentation. 
- InstrItinData<IIC_VPERMQ, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [3, 4, 3, 4]>, - // - // Quad-register Permute (3 cycle issue on A9) - InstrItinData<IIC_VPERMQ3, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [3, 4, 3, 4]>, - - // - // Double-register VEXT - InstrItinData<IIC_VEXTD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1, 1]>, - // - // Quad-register VEXT - InstrItinData<IIC_VEXTQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1, 1]>, - // - // VTB - InstrItinData<IIC_VTB1, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_VTB2, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 3, 3]>, - InstrItinData<IIC_VTB3, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [6, 1, 3, 5, 5]>, - InstrItinData<IIC_VTB4, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 3, 5, 7, 7]>, - // - // VTBX - InstrItinData<IIC_VTBX1, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>, - InstrStage<1, [SW_ALU1]>], - [2, 1, 1]>, - InstrItinData<IIC_VTBX2, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [4, 1, 3, 3]>, - InstrItinData<IIC_VTBX3, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [6, 1, 3, 5, 5]>, - InstrItinData<IIC_VTBX4, [InstrStage<1, [SW_DIS0], 0>, - InstrStage<1, [SW_DIS1], 0>, - InstrStage<1, [SW_DIS2], 0>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1], 2>, - InstrStage<1, [SW_ALU1]>], - [8, 1, 3, 5, 7, 7]> -]>; - -// ===---------------------------------------------------------------------===// -// This following definitions describe the simple machine model which -// will replace itineraries. - // Swift machine model for scheduling and other instruction cost heuristics. def SwiftModel : SchedMachineModel { let IssueWidth = 3; // 3 micro-ops are dispatched per cycle. let MicroOpBufferSize = 45; // Based on NEON renamed registers. let LoadLatency = 3; let MispredictPenalty = 14; // A branch direction mispredict. - - let Itineraries = SwiftItineraries; + let CompleteModel = 0; // FIXME: Remove if all instructions are covered. } // Swift predicates. 
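Aside on the ARMRegisterInfo.td hunk above: the new SPR allocation orders are written with TableGen's rotl/decimate set operators, which are hard to read at a glance. The stand-alone sketch below is illustrative only (not LLVM code; the helper names are made up): it mimics those operators on S0..S31 to show the two alternative orders the AltOrderSelect body appears to choose between, matching the "s0, s2, ... or s0, s4, ..." wording of the FIXME comment (the DPR order follows the same pattern on D registers).

#include <algorithm>
#include <cstdio>
#include <vector>

// rotl rotates the register list left by K; decimate keeps every Nth element,
// starting with the first (mirroring TableGen's SetTheory semantics).
static std::vector<int> rotl(std::vector<int> V, int K) {
  std::rotate(V.begin(), V.begin() + K, V.end());
  return V;
}

static std::vector<int> decimate(const std::vector<int> &V, int N) {
  std::vector<int> R;
  for (int I = 0; I < (int)V.size(); I += N)
    R.push_back(V[I]);
  return R;
}

// add concatenates sets; a duplicate keeps only its first position.
static void addTo(std::vector<int> &Dst, const std::vector<int> &Src) {
  for (int S : Src)
    if (std::find(Dst.begin(), Dst.end(), S) == Dst.end())
      Dst.push_back(S);
}

int main() {
  std::vector<int> SPR(32);
  for (int I = 0; I < 32; ++I)
    SPR[I] = I; // index I stands for register S<I>

  // AltOrders[0]: (add (decimate SPR, 2), SPR)
  //   -> S0, S2, ..., S30, S1, S3, ..., S31 (prefer even S registers).
  std::vector<int> EvenFirst;
  addTo(EvenFirst, decimate(SPR, 2));
  addTo(EvenFirst, SPR);

  // AltOrders[1]: stride-4 registers first, then the remaining evens, then odds
  //   -> S0, S4, ..., S28, S2, S6, ..., S30, S1, S5, ..., S29, S3, ..., S31.
  std::vector<int> Stride4First;
  addTo(Stride4First, decimate(SPR, 4));
  addTo(Stride4First, decimate(SPR, 2));
  addTo(Stride4First, decimate(rotl(SPR, 1), 4));
  addTo(Stride4First, decimate(rotl(SPR, 1), 2));

  for (int R : Stride4First)
    std::printf("S%d ", R);
  std::printf("\n");
  return 0;
}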
@@ -1558,6 +521,13 @@ let SchedModel = SwiftModel in { (instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD", "PUSH", "tPUSH")>; + // LDRLIT pseudo instructions, they expand to LDR + PICADD + def : InstRW<[SwiftWriteP2ThreeCycle, WriteALU], + (instregex "t?LDRLIT_ga_abs", "t?LDRLIT_ga_pcrel")>; + // LDRLIT_ga_pcrel_ldr expands to LDR + PICLDR + def : InstRW<[SwiftWriteP2ThreeCycle, SwiftWriteP2ThreeCycle], + (instregex "LDRLIT_ga_pcrel_ldr")>; + // 4.2.26 Branch def : WriteRes<WriteBr, [SwiftUnitP1]> { let Latency = 0; } def : WriteRes<WriteBrL, [SwiftUnitP1]> { let Latency = 2; } diff --git a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp index 6cafbbb..6fded9c 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -160,41 +160,39 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, unsigned VTSize = 4; unsigned i = 0; // Emit a maximum of 4 loads in Thumb1 since we have fewer registers - const unsigned MAX_LOADS_IN_LDM = Subtarget.isThumb1Only() ? 4 : 6; + const unsigned MaxLoadsInLDM = Subtarget.isThumb1Only() ? 4 : 6; SDValue TFOps[6]; SDValue Loads[6]; uint64_t SrcOff = 0, DstOff = 0; - // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the - // same number of stores. The loads and stores will get combined into - // ldm/stm later on. - while (EmittedNumMemOps < NumMemOps) { - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - Loads[i] = DAG.getLoad(VT, dl, Chain, - DAG.getNode(ISD::ADD, dl, MVT::i32, Src, - DAG.getConstant(SrcOff, dl, MVT::i32)), - SrcPtrInfo.getWithOffset(SrcOff), isVolatile, - false, false, 0); - TFOps[i] = Loads[i].getValue(1); - SrcOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - makeArrayRef(TFOps, i)); + // FIXME: We should invent a VMEMCPY pseudo-instruction that lowers to + // VLDM/VSTM and make this code emit it when appropriate. This would reduce + // pressure on the general purpose registers. However this seems harder to map + // onto the register allocator's view of the world. - for (i = 0; - i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) { - TFOps[i] = DAG.getStore(Chain, dl, Loads[i], - DAG.getNode(ISD::ADD, dl, MVT::i32, Dst, - DAG.getConstant(DstOff, dl, MVT::i32)), - DstPtrInfo.getWithOffset(DstOff), - isVolatile, false, 0); - DstOff += VTSize; - } - Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - makeArrayRef(TFOps, i)); + // The number of MEMCPY pseudo-instructions to emit. We use up to + // MaxLoadsInLDM registers per mcopy, which will get lowered into ldm/stm + // later on. This is a lower bound on the number of MEMCPY operations we must + // emit. + unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM; + + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other, MVT::Glue); + + for (unsigned I = 0; I != NumMEMCPYs; ++I) { + // Evenly distribute registers among MEMCPY operations to reduce register + // pressure. 
+ unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs; + unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps; + + Dst = DAG.getNode(ARMISD::MEMCPY, dl, VTs, Chain, Dst, Src, + DAG.getConstant(NumRegs, dl, MVT::i32)); + Src = Dst.getValue(1); + Chain = Dst.getValue(2); + + DstPtrInfo = DstPtrInfo.getWithOffset(NumRegs * VTSize); + SrcPtrInfo = SrcPtrInfo.getWithOffset(NumRegs * VTSize); - EmittedNumMemOps += i; + EmittedNumMemOps = NextEmittedNumMemOps; } if (BytesLeft == 0) diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp index 002c3e9..bb6ae28 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -26,6 +26,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetOptions.h" @@ -40,37 +41,9 @@ using namespace llvm; #include "ARMGenSubtargetInfo.inc" static cl::opt<bool> -ReserveR9("arm-reserve-r9", cl::Hidden, - cl::desc("Reserve R9, making it unavailable as GPR")); - -static cl::opt<bool> -ArmUseMOVT("arm-use-movt", cl::init(true), cl::Hidden); - -static cl::opt<bool> UseFusedMulOps("arm-use-mulops", cl::init(true), cl::Hidden); -namespace { -enum AlignMode { - DefaultAlign, - StrictAlign, - NoStrictAlign -}; -} - -static cl::opt<AlignMode> -Align(cl::desc("Load/store alignment support"), - cl::Hidden, cl::init(DefaultAlign), - cl::values( - clEnumValN(DefaultAlign, "arm-default-align", - "Generate unaligned accesses only on hardware/OS " - "combinations that are known to support them"), - clEnumValN(StrictAlign, "arm-strict-align", - "Disallow all unaligned memory accesses"), - clEnumValN(NoStrictAlign, "arm-no-strict-align", - "Allow unaligned memory accesses"), - clEnumValEnd)); - enum ITMode { DefaultIT, RestrictedIT, @@ -88,6 +61,12 @@ IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), "Allow IT blocks based on ARMv7"), clEnumValEnd)); +/// ForceFastISel - Use the fast-isel, even for subtargets where it is not +/// currently supported (for testing only). +static cl::opt<bool> +ForceFastISel("arm-force-fast-isel", + cl::init(false), cl::Hidden); + /// initializeSubtargetDependencies - Initializes using a CPU and feature string /// so that we can use initializer lists for subtarget initialization. ARMSubtarget &ARMSubtarget::initializeSubtargetDependencies(StringRef CPU, @@ -110,8 +89,8 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const ARMBaseTargetMachine &TM, bool IsLittle) : ARMGenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - ARMProcClass(None), stackAlignment(4), CPUString(CPU), IsLittle(IsLittle), - TargetTriple(TT), Options(TM.Options), TM(TM), + ARMProcClass(None), ARMArch(ARMv4t), stackAlignment(4), CPUString(CPU), + IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)), // At this point initializeSubtargetDependencies has been called so // we can query directly. 
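The rewritten memcpy lowering in the ARMSelectionDAGInfo.cpp hunk above replaces the old emit-up-to-N-loads loop with a fixed number of MEMCPY pseudos whose registers are spread as evenly as possible. A small stand-alone sketch of just that arithmetic (not LLVM code; the input numbers are an assumed example):

#include <cstdio>

int main() {
  // Assumed example: a 28-byte, word-aligned copy -> 7 word-sized mem ops,
  // with the non-Thumb1 limit of 6 registers per LDM/STM.
  const unsigned NumMemOps = 7;
  const unsigned MaxLoadsInLDM = 6;

  // Ceiling division, as in the hunk above: 2 MEMCPY pseudos are needed.
  unsigned NumMEMCPYs = (NumMemOps + MaxLoadsInLDM - 1) / MaxLoadsInLDM;

  unsigned EmittedNumMemOps = 0;
  for (unsigned I = 0; I != NumMEMCPYs; ++I) {
    unsigned NextEmittedNumMemOps = NumMemOps * (I + 1) / NumMEMCPYs;
    unsigned NumRegs = NextEmittedNumMemOps - EmittedNumMemOps;
    std::printf("MEMCPY %u copies %u words\n", I, NumRegs); // prints 3, then 4
    EmittedNumMemOps = NextEmittedNumMemOps;
  }
  // The old loop would have produced a 6-register burst followed by a single
  // load/store pair; the even 3 + 4 split keeps register pressure lower.
  return 0;
}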
@@ -133,6 +112,7 @@ void ARMSubtarget::initializeEnvironment() { HasV7Ops = false; HasV8Ops = false; HasV8_1aOps = false; + HasV8_2aOps = false; HasVFPv2 = false; HasVFPv3 = false; HasVFPv4 = false; @@ -147,10 +127,11 @@ void ARMSubtarget::initializeEnvironment() { UseSoftFloat = false; HasThumb2 = false; NoARM = false; - IsR9Reserved = ReserveR9; - UseMovt = false; + ReserveR9 = false; + NoMovt = false; SupportsTailCall = false; HasFP16 = false; + HasFullFP16 = false; HasD16 = false; HasHardwareDivide = false; HasHardwareDivideInARM = false; @@ -168,20 +149,36 @@ void ARMSubtarget::initializeEnvironment() { HasCrypto = false; HasCRC = false; HasZeroCycleZeroing = false; - AllowsUnalignedMem = false; - Thumb2DSP = false; + StrictAlign = false; + HasDSP = false; UseNaClTrap = false; GenLongCalls = false; UnsafeFPMath = false; + + // MCAsmInfo isn't always present (e.g. in opt) so we can't initialize this + // directly from it, but we can try to make sure they're consistent when both + // available. + UseSjLjEH = isTargetDarwin() && !isTargetWatchOS(); + assert((!TM.getMCAsmInfo() || + (TM.getMCAsmInfo()->getExceptionHandlingType() == + ExceptionHandling::SjLj) == UseSjLjEH) && + "inconsistent sjlj choice between CodeGen and MC"); } void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (CPUString.empty()) { - if (isTargetDarwin() && TargetTriple.getArchName().endswith("v7s")) - // Default to the Swift CPU when targeting armv7s/thumbv7s. - CPUString = "swift"; - else - CPUString = "generic"; + CPUString = "generic"; + + if (isTargetDarwin()) { + StringRef ArchName = TargetTriple.getArchName(); + if (ArchName.endswith("v7s")) + // Default to the Swift CPU when targeting armv7s/thumbv7s. + CPUString = "swift"; + else if (ArchName.endswith("v7k")) + // Default to the Cortex-a7 CPU when targeting armv7k/thumbv7k. + // ARMv7k does not use SjLj exception handling. + CPUString = "cortex-a7"; + } } // Insert the architecture feature derived from the target triple into the @@ -212,44 +209,31 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { if (isAAPCS_ABI()) stackAlignment = 8; - if (isTargetNaCl()) + if (isTargetNaCl() || isAAPCS16_ABI()) stackAlignment = 16; - UseMovt = hasV6T2Ops() && ArmUseMOVT; - - if (isTargetMachO()) { - IsR9Reserved = ReserveR9 || !HasV6Ops; - SupportsTailCall = !isTargetIOS() || !getTargetTriple().isOSVersionLT(5, 0); - } else { - IsR9Reserved = ReserveR9; - SupportsTailCall = !isThumb1Only(); - } - - if (Align == DefaultAlign) { - // Assume pre-ARMv6 doesn't support unaligned accesses. - // - // ARMv6 may or may not support unaligned accesses depending on the - // SCTLR.U bit, which is architecture-specific. We assume ARMv6 - // Darwin and NetBSD targets support unaligned accesses, and others don't. - // - // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit - // which raises an alignment fault on unaligned accesses. Linux - // defaults this bit to 0 and handles it as a system-wide (not - // per-process) setting. It is therefore safe to assume that ARMv7+ - // Linux targets support unaligned accesses. The same goes for NaCl. - // - // The above behavior is consistent with GCC. 
- AllowsUnalignedMem = - (hasV7Ops() && (isTargetLinux() || isTargetNaCl() || - isTargetNetBSD())) || - (hasV6Ops() && (isTargetMachO() || isTargetNetBSD())); - } else { - AllowsUnalignedMem = !(Align == StrictAlign); - } - - // No v6M core supports unaligned memory access (v6M ARM ARM A3.2) - if (isV6M()) - AllowsUnalignedMem = false; + // FIXME: Completely disable sibcall for Thumb1 since ThumbRegisterInfo:: + // emitEpilogue is not ready for them. Thumb tail calls also use t2B, as + // the Thumb1 16-bit unconditional branch doesn't have sufficient relocation + // support in the assembler and linker to be used. This would need to be + // fixed to fully support tail calls in Thumb1. + // + // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take + // LR. This means if we need to reload LR, it takes an extra instructions, + // which outweighs the value of the tail call; but here we don't know yet + // whether LR is going to be used. Probably the right approach is to + // generate the tail call here and turn it back into CALL/RET in + // emitEpilogue if LR is used. + + // Thumb1 PIC calls to external symbols use BX, so they can be tail calls, + // but we need to make sure there are enough registers; the only valid + // registers are the 4 used for parameters. We don't currently do this + // case. + + SupportsTailCall = !isThumb1Only(); + + if (isTargetMachO() && isTargetIOS() && getTargetTriple().isOSVersionLT(5, 0)) + SupportsTailCall = false; switch (IT) { case DefaultIT: @@ -276,9 +260,15 @@ bool ARMSubtarget::isAPCS_ABI() const { } bool ARMSubtarget::isAAPCS_ABI() const { assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN); - return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS; + return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS || + TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16; +} +bool ARMSubtarget::isAAPCS16_ABI() const { + assert(TM.TargetABI != ARMBaseTargetMachine::ARM_ABI_UNKNOWN); + return TM.TargetABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16; } + /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol. bool ARMSubtarget::GVIsIndirectSymbol(const GlobalValue *GV, @@ -321,11 +311,23 @@ unsigned ARMSubtarget::getMispredictionPenalty() const { } bool ARMSubtarget::hasSinCos() const { - return getTargetTriple().isiOS() && !getTargetTriple().isOSVersionLT(7, 0); + return isTargetWatchOS() || + (isTargetIOS() && !getTargetTriple().isOSVersionLT(7, 0)); +} + +bool ARMSubtarget::enableMachineScheduler() const { + // Enable the MachineScheduler before register allocation for out-of-order + // architectures where we do not use the PostRA scheduler anymore (for now + // restricted to swift). + return getSchedModel().isOutOfOrder() && isSwift(); } // This overrides the PostRAScheduler bit in the SchedModel for any CPU. bool ARMSubtarget::enablePostRAScheduler() const { + // No need for PostRA scheduling on out of order CPUs (for now restricted to + // swift). + if (getSchedModel().isOutOfOrder() && isSwift()) + return false; return (!isThumb() || hasThumb2()); } @@ -333,15 +335,30 @@ bool ARMSubtarget::enableAtomicExpand() const { return hasAnyDataBarrier() && !isThumb1Only(); } +bool ARMSubtarget::useStride4VFPs(const MachineFunction &MF) const { + // For general targets, the prologue can grow when VFPs are allocated with + // stride 4 (more vpush instructions). But WatchOS uses a compact unwind + // format which it's more important to get right. 
+ return isTargetWatchOS() || (isSwift() && !MF.getFunction()->optForMinSize()); +} + bool ARMSubtarget::useMovt(const MachineFunction &MF) const { // NOTE Windows on ARM needs to use mov.w/mov.t pairs to materialise 32-bit // immediates as it is inherently position independent, and may be out of // range otherwise. - return UseMovt && (isTargetWindows() || - !MF.getFunction()->hasFnAttribute(Attribute::MinSize)); + return !NoMovt && hasV6T2Ops() && + (isTargetWindows() || !MF.getFunction()->optForMinSize()); } bool ARMSubtarget::useFastISel() const { + // Enable fast-isel for any target, for testing only. + if (ForceFastISel) + return true; + + // Limit fast-isel to the targets that are or have been tested. + if (!hasV6Ops()) + return false; + // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl. return TM.Options.EnableFastISel && ((isTargetMachO() && !isThumb1Only()) || diff --git a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h index dd101df..a8b2801 100644 --- a/contrib/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/contrib/llvm/lib/Target/ARM/ARMSubtarget.h @@ -43,11 +43,17 @@ class ARMSubtarget : public ARMGenSubtargetInfo { protected: enum ARMProcFamilyEnum { Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15, - CortexA17, CortexR4, CortexR4F, CortexR5, Swift, CortexA53, CortexA57, Krait, + CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexA35, CortexA53, + CortexA57, CortexA72, Krait, Swift }; enum ARMProcClassEnum { None, AClass, RClass, MClass }; + enum ARMArchEnum { + ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te, + ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r, + ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a + }; /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others. ARMProcFamilyEnum ARMProcFamily; @@ -55,6 +61,9 @@ protected: /// ARMProcClass - ARM processor class: None, AClass, RClass or MClass. ARMProcClassEnum ARMProcClass; + /// ARMArch - ARM architecture + ARMArchEnum ARMArch; + /// HasV4TOps, HasV5TOps, HasV5TEOps, /// HasV6Ops, HasV6MOps, HasV6KOps, HasV6T2Ops, HasV7Ops, HasV8Ops - /// Specify whether target support specific ARM ISA variants. @@ -68,6 +77,7 @@ protected: bool HasV7Ops; bool HasV8Ops; bool HasV8_1aOps; + bool HasV8_2aOps; /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what /// floating point ISAs are supported. @@ -109,22 +119,24 @@ protected: /// NoARM - True if subtarget does not support ARM mode execution. bool NoARM; - /// IsR9Reserved - True if R9 is a not available as general purpose register. - bool IsR9Reserved; + /// ReserveR9 - True if R9 is not available as a general purpose register. + bool ReserveR9; - /// UseMovt - True if MOVT / MOVW pairs are used for materialization of 32-bit - /// imms (including global addresses). - bool UseMovt; + /// NoMovt - True if MOVT / MOVW pairs are not used for materialization of + /// 32-bit imms (including global addresses). + bool NoMovt; /// SupportsTailCall - True if the OS supports tail call. The dynamic linker /// must be able to synthesize call stubs for interworking between ARM and /// Thumb. 
bool SupportsTailCall; - /// HasFP16 - True if subtarget supports half-precision FP (We support VFP+HF - /// only so far) + /// HasFP16 - True if subtarget supports half-precision FP conversions bool HasFP16; + /// HasFullFP16 - True if subtarget supports half-precision FP operations + bool HasFullFP16; + /// HasD16 - True if subtarget is limited to 16 double precision /// FP registers for VFPv3. bool HasD16; @@ -190,18 +202,18 @@ protected: /// particularly effective at zeroing a VFP register. bool HasZeroCycleZeroing; - /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory + /// StrictAlign - If true, the subtarget disallows unaligned memory /// accesses for some types. For details, see /// ARMTargetLowering::allowsMisalignedMemoryAccesses(). - bool AllowsUnalignedMem; + bool StrictAlign; /// RestrictIT - If true, the subtarget disallows generation of deprecated IT /// blocks to conform to ARMv8 rule. bool RestrictIT; - /// Thumb2DSP - If true, the subtarget supports the v7 DSP (saturating arith - /// and such) instructions in Thumb2 code. - bool Thumb2DSP; + /// HasDSP - If true, the subtarget supports the DSP (saturating arith + /// and such) instructions. + bool HasDSP; /// NaCl TRAP instruction is generated instead of the regular TRAP. bool UseNaClTrap; @@ -212,6 +224,9 @@ protected: /// Target machine allowed unsafe FP math (such as use of NEON fp) bool UnsafeFPMath; + /// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS). + bool UseSjLjEH; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -297,6 +312,7 @@ public: bool hasV7Ops() const { return HasV7Ops; } bool hasV8Ops() const { return HasV8Ops; } bool hasV8_1aOps() const { return HasV8_1aOps; } + bool hasV8_2aOps() const { return HasV8_2aOps; } bool isCortexA5() const { return ARMProcFamily == CortexA5; } bool isCortexA7() const { return ARMProcFamily == CortexA7; } @@ -343,17 +359,20 @@ public: bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; } bool hasRAS() const { return HasRAS; } bool hasMPExtension() const { return HasMPExtension; } - bool hasThumb2DSP() const { return Thumb2DSP; } + bool hasDSP() const { return HasDSP; } bool useNaClTrap() const { return UseNaClTrap; } + bool useSjLjEH() const { return UseSjLjEH; } bool genLongCalls() const { return GenLongCalls; } bool hasFP16() const { return HasFP16; } bool hasD16() const { return HasD16; } + bool hasFullFP16() const { return HasFullFP16; } const Triple &getTargetTriple() const { return TargetTriple; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } bool isTargetIOS() const { return TargetTriple.isiOS(); } + bool isTargetWatchOS() const { return TargetTriple.isWatchOS(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetNetBSD() const { return TargetTriple.isOSNetBSD(); } @@ -375,6 +394,11 @@ public: TargetTriple.getEnvironment() == Triple::EABIHF) && !isTargetDarwin() && !isTargetWindows(); } + bool isTargetGNUAEABI() const { + return (TargetTriple.getEnvironment() == Triple::GNUEABI || + TargetTriple.getEnvironment() == Triple::GNUEABIHF) && + !isTargetDarwin() && !isTargetWindows(); + } // ARM Targets that support EHABI exception handling standard // Darwin uses SjLj. Other targets might need more checks. 
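
The StrictAlign field documented above is the inverted replacement for the old AllowsUnalignedMem flag: the subtarget feature now records the restriction, and the existing positive query is kept by negating it (allowsUnalignedMem() returns !StrictAlign further down). A tiny sketch of that inversion, with an invented struct name:

    // Sketch of the flag flip, assuming a strict-align style feature sets
    // StrictAlign; the public query keeps its original positive sense.
    struct AlignmentPolicySketch {
      bool StrictAlign = false;   // set by the subtarget feature
      bool allowsUnalignedMem() const { return !StrictAlign; }
    };
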
@@ -383,7 +407,7 @@ public: TargetTriple.getEnvironment() == Triple::GNUEABI || TargetTriple.getEnvironment() == Triple::EABIHF || TargetTriple.getEnvironment() == Triple::GNUEABIHF || - TargetTriple.getEnvironment() == Triple::Android) && + isTargetAndroid()) && !isTargetDarwin() && !isTargetWindows(); } @@ -391,14 +415,13 @@ public: // FIXME: this is invalid for WindowsCE return TargetTriple.getEnvironment() == Triple::GNUEABIHF || TargetTriple.getEnvironment() == Triple::EABIHF || - isTargetWindows(); - } - bool isTargetAndroid() const { - return TargetTriple.getEnvironment() == Triple::Android; + isTargetWindows() || isAAPCS16_ABI(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } bool isAPCS_ABI() const; bool isAAPCS_ABI() const; + bool isAAPCS16_ABI() const; bool useSoftFloat() const { return UseSoftFloat; } bool isThumb() const { return InThumbMode; } @@ -409,17 +432,17 @@ public: bool isRClass() const { return ARMProcClass == RClass; } bool isAClass() const { return ARMProcClass == AClass; } - bool isV6M() const { - return isThumb1Only() && isMClass(); + bool isR9Reserved() const { + return isTargetMachO() ? (ReserveR9 || !HasV6Ops) : ReserveR9; } - bool isR9Reserved() const { return IsR9Reserved; } + bool useStride4VFPs(const MachineFunction &MF) const; bool useMovt(const MachineFunction &MF) const; bool supportsTailCall() const { return SupportsTailCall; } - bool allowsUnalignedMem() const { return AllowsUnalignedMem; } + bool allowsUnalignedMem() const { return !StrictAlign; } bool restrictIT() const { return RestrictIT; } @@ -433,6 +456,9 @@ public: /// compiler runtime or math libraries. bool hasSinCos() const; + /// Returns true if machine scheduler should be enabled. + bool enableMachineScheduler() const override; + /// True for some subtargets at > -O0. bool enablePostRAScheduler() const override; diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 93495d6..fca1901 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -66,7 +66,9 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { static ARMBaseTargetMachine::ARMABI computeTargetABI(const Triple &TT, StringRef CPU, const TargetOptions &Options) { - if (Options.MCOptions.getABIName().startswith("aapcs")) + if (Options.MCOptions.getABIName() == "aapcs16") + return ARMBaseTargetMachine::ARM_ABI_AAPCS16; + else if (Options.MCOptions.getABIName().startswith("aapcs")) return ARMBaseTargetMachine::ARM_ABI_AAPCS; else if (Options.MCOptions.getABIName().startswith("apcs")) return ARMBaseTargetMachine::ARM_ABI_APCS; @@ -83,6 +85,8 @@ computeTargetABI(const Triple &TT, StringRef CPU, (TT.getOS() == llvm::Triple::UnknownOS && TT.isOSBinFormatMachO()) || CPU.startswith("cortex-m")) { TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; + } else if (TT.isWatchOS()) { + TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS16; } else { TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; } @@ -106,7 +110,7 @@ computeTargetABI(const Triple &TT, StringRef CPU, if (TT.isOSNetBSD()) TargetABI = ARMBaseTargetMachine::ARM_ABI_APCS; else - TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; + TargetABI = ARMBaseTargetMachine::ARM_ABI_AAPCS; break; } } @@ -145,7 +149,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, // to 64. We always ty to give them natural alignment. 
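
isR9Reserved() above stops being a cached boolean and becomes a computed policy: on Mach-O targets R9 is reserved whenever the ReserveR9 feature asks for it or the core predates ARMv6, while every other target honours ReserveR9 alone. Condensed into a standalone helper with hypothetical parameters:

    // Illustrative restatement of the R9 policy from the hunk above.
    bool isR9ReservedSketch(bool IsTargetMachO, bool ReserveR9, bool HasV6Ops) {
      return IsTargetMachO ? (ReserveR9 || !HasV6Ops) : ReserveR9;
    }
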
if (ABI == ARMBaseTargetMachine::ARM_ABI_APCS) Ret += "-v64:32:64-v128:32:128"; - else + else if (ABI != ARMBaseTargetMachine::ARM_ABI_AAPCS16) Ret += "-v128:64:128"; // Try to align aggregates to 32 bits (the default is 64 bits, which has no @@ -157,7 +161,7 @@ static std::string computeDataLayout(const Triple &TT, StringRef CPU, // The stack is 128 bit aligned on NaCl, 64 bit aligned on AAPCS and 32 bit // aligned everywhere else. - if (TT.isOSNaCl()) + if (TT.isOSNaCl() || ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS16) Ret += "-S128"; else if (ABI == ARMBaseTargetMachine::ARM_ABI_AAPCS) Ret += "-S64"; @@ -184,6 +188,15 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, if (Options.FloatABIType == FloatABI::Default) this->Options.FloatABIType = Subtarget.isTargetHardFloat() ? FloatABI::Hard : FloatABI::Soft; + + // Default to triple-appropriate EABI + if (Options.EABIVersion == EABI::Default || + Options.EABIVersion == EABI::Unknown) { + if (Subtarget.isTargetGNUAEABI()) + this->Options.EABIVersion = EABI::GNU; + else + this->Options.EABIVersion = EABI::EABI5; + } } ARMBaseTargetMachine::~ARMBaseTargetMachine() {} @@ -225,12 +238,12 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { } TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &F) { return TargetTransformInfo(ARMTTIImpl(this, F)); }); + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(ARMTTIImpl(this, F)); + }); } - -void ARMTargetMachine::anchor() { } +void ARMTargetMachine::anchor() {} ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -244,7 +257,7 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT, "support ARM mode execution!"); } -void ARMLETargetMachine::anchor() { } +void ARMLETargetMachine::anchor() {} ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -253,7 +266,7 @@ ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL) : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} -void ARMBETargetMachine::anchor() { } +void ARMBETargetMachine::anchor() {} ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -262,7 +275,7 @@ ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL) : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} -void ThumbTargetMachine::anchor() { } +void ThumbTargetMachine::anchor() {} ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -273,7 +286,7 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT, initAsmInfo(); } -void ThumbLETargetMachine::anchor() { } +void ThumbLETargetMachine::anchor() {} ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -282,7 +295,7 @@ ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT, CodeGenOpt::Level OL) : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} -void ThumbBETargetMachine::anchor() { } +void ThumbBETargetMachine::anchor() {} ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -348,7 +361,13 @@ bool ARMPassConfig::addPreISel() { // tricky when doing code gen per function. 
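
The target-machine constructor hunk above also picks a default EABI version when none was requested explicitly: GNU EABI triples get the GNU variant, everything else falls back to EABI5. A self-contained sketch of that selection; the enum mirrors the values named in the diff, while the helper itself is invented for illustration:

    enum class EABIVersionSketch { Unknown, Default, EABI5, GNU };

    // Pick a concrete EABI version when none was requested explicitly.
    EABIVersionSketch defaultEABI(EABIVersionSketch Requested, bool IsGNUAEABI) {
      if (Requested != EABIVersionSketch::Default &&
          Requested != EABIVersionSketch::Unknown)
        return Requested;
      return IsGNUAEABI ? EABIVersionSketch::GNU : EABIVersionSketch::EABI5;
    }
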
bool OnlyOptimizeForSize = (TM->getOptLevel() < CodeGenOpt::Aggressive) && (EnableGlobalMerge == cl::BOU_UNSET); - addPass(createGlobalMergePass(TM, 127, OnlyOptimizeForSize)); + // Merging of extern globals is enabled by default on non-Mach-O as we + // expect it to be generally either beneficial or harmless. On Mach-O it + // is disabled as we emit the .subsections_via_symbols directive which + // means that merging extern globals is not safe. + bool MergeExternalByDefault = !TM->getTargetTriple().isOSBinFormatMachO(); + addPass(createGlobalMergePass(TM, 127, OnlyOptimizeForSize, + MergeExternalByDefault)); } return false; @@ -356,9 +375,6 @@ bool ARMPassConfig::addPreISel() { bool ARMPassConfig::addInstSelector() { addPass(createARMISelDag(getARMTargetMachine(), getOptLevel())); - - if (TM->getTargetTriple().isOSBinFormatELF() && TM->Options.EnableFastISel) - addPass(createARMGlobalBaseRegPass()); return false; } diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h index 8c98e08..8ad1f3d 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h +++ b/contrib/llvm/lib/Target/ARM/ARMTargetMachine.h @@ -26,7 +26,8 @@ public: enum ARMABI { ARM_ABI_UNKNOWN, ARM_ABI_APCS, - ARM_ABI_AAPCS // ARM EABI + ARM_ABI_AAPCS, // ARM EABI + ARM_ABI_AAPCS16 } TargetABI; protected: diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 2f194cf..c152011 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -15,7 +15,7 @@ using namespace llvm; #define DEBUG_TYPE "armtti" -unsigned ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned Bits = Ty->getPrimitiveSizeInBits(); @@ -47,12 +47,12 @@ unsigned ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { return 3; } -unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); // Single to/from double precision conversions. - static const CostTblEntry<MVT::SimpleValueType> NEONFltDblTbl[] = { + static const CostTblEntry NEONFltDblTbl[] = { // Vector fptrunc/fpext conversions. { ISD::FP_ROUND, MVT::v2f64, 2 }, { ISD::FP_EXTEND, MVT::v2f32, 2 }, @@ -61,10 +61,9 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND)) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); - int Idx = CostTableLookup(NEONFltDblTbl, ISD, LT.second); - if (Idx != -1) - return LT.first * NEONFltDblTbl[Idx].Cost; + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second)) + return LT.first * Entry->Cost; } EVT SrcTy = TLI->getValueType(DL, Src); @@ -76,8 +75,7 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { // Some arithmetic, load and store operations have specific instructions // to cast up/down their types automatically at no extra cost. // TODO: Get these tables to know at least what the related operations are. 
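
Most of the cost-model edits in this file follow one pattern: the table helpers now hand back a pointer to the matching entry (or null) instead of an index, so each call site collapses to a single if with an initializer. A rough, generic stand-in for that lookup shape, not the LLVM helper itself:

    #include <cstddef>

    struct CostEntrySketch { int ISD; int SimpleVT; int Cost; };

    // Return the matching entry or nullptr, the shape the call sites now expect.
    template <std::size_t N>
    const CostEntrySketch *lookupCostSketch(const CostEntrySketch (&Tbl)[N],
                                            int ISD, int SimpleVT) {
      for (const CostEntrySketch &E : Tbl)
        if (E.ISD == ISD && E.SimpleVT == SimpleVT)
          return &E;
      return nullptr;
    }

A call site then reads: if (const auto *Entry = lookupCostSketch(Tbl, ISD, VT)) return LT.first * Entry->Cost; which is the form the hunks below use with LLVM's own CostTableLookup and ConvertCostTableLookup.
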
- static const TypeConversionCostTblEntry<MVT::SimpleValueType> - NEONVectorConversionTbl[] = { + static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 }, { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 }, @@ -153,15 +151,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { }; if (SrcTy.isVector() && ST->hasNEON()) { - int Idx = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return NEONVectorConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } // Scalar float to integer conversions. - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - NEONFloatConversionTbl[] = { + static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = { { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 }, { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 }, { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 }, @@ -184,15 +181,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 } }; if (SrcTy.isFloatingPoint() && ST->hasNEON()) { - int Idx = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return NEONFloatConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } // Scalar integer to float conversions. - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - NEONIntegerConversionTbl[] = { + static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = { { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 }, { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 }, { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 }, @@ -216,15 +212,14 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { }; if (SrcTy.isInteger() && ST->hasNEON()) { - int Idx = ConvertCostTableLookup(NEONIntegerConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return NEONIntegerConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl, + ISD, DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } // Scalar integer conversion costs. - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - ARMIntegerConversionTbl[] = { + static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = { // i16 -> i64 requires two dependent operations. { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 }, @@ -236,17 +231,17 @@ unsigned ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { }; if (SrcTy.isInteger()) { - int Idx = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return ARMIntegerConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, - unsigned Index) { +int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, + unsigned Index) { // Penalize inserting into an D-subregister. We end up with a three times // lower estimated throughput on swift. 
if (ST->isSwift() && @@ -255,28 +250,30 @@ unsigned ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, ValTy->getScalarSizeInBits() <= 32) return 3; - // Cross-class copies are expensive on many microarchitectures, - // so assume they are expensive by default. if ((Opcode == Instruction::InsertElement || - Opcode == Instruction::ExtractElement) && - ValTy->getVectorElementType()->isIntegerTy()) - return 3; + Opcode == Instruction::ExtractElement)) { + // Cross-class copies are expensive on many microarchitectures, + // so assume they are expensive by default. + if (ValTy->getVectorElementType()->isIntegerTy()) + return 3; + + // Even if it's not a cross class copy, this likely leads to mixing + // of NEON and VFP code and should be therefore penalized. + if (ValTy->isVectorTy() && + ValTy->getScalarSizeInBits() <= 32) + return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index), 2U); + } return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } -unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { int ISD = TLI->InstructionOpcodeToISD(Opcode); // On NEON a a vector select gets lowered to vbsl. if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) { // Lowering of some vector selects is currently far from perfect. - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - NEONVectorSelectTbl[] = { - { ISD::SELECT, MVT::v16i1, MVT::v16i16, 2*16 + 1 + 3*1 + 4*1 }, - { ISD::SELECT, MVT::v8i1, MVT::v8i32, 4*8 + 1*3 + 1*4 + 1*2 }, - { ISD::SELECT, MVT::v16i1, MVT::v16i32, 4*16 + 1*6 + 1*8 + 1*4 }, + static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = { { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 }, { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 }, { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 } @@ -285,21 +282,20 @@ unsigned ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, EVT SelCondTy = TLI->getValueType(DL, CondTy); EVT SelValTy = TLI->getValueType(DL, ValTy); if (SelCondTy.isSimple() && SelValTy.isSimple()) { - int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, ISD, - SelCondTy.getSimpleVT(), - SelValTy.getSimpleVT()); - if (Idx != -1) - return NEONVectorSelectTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD, + SelCondTy.getSimpleVT(), + SelValTy.getSimpleVT())) + return Entry->Cost; } - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); return LT.first; } return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -314,7 +310,7 @@ unsigned ARMTTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { return 1; } -unsigned ARMTTIImpl::getFPOpCost(Type *Ty) { +int ARMTTIImpl::getFPOpCost(Type *Ty) { // Use similar logic that's in ARMISelLowering: // Any ARM CPU with VFP2 has floating point, but Thumb1 didn't have access // to VFP. 
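
getVectorInstrCost above keeps the old penalty of 3 for integer lane moves (cross-class copies) and adds a floor of 2 for any other lane of 32 bits or less, on the grounds that such inserts and extracts mix NEON and VFP code. The decision, reduced to a standalone function with invented parameter names:

    #include <algorithm>

    // Sketch of the insert/extract penalty added in the hunk above.
    unsigned laneMoveCostSketch(bool LaneIsInteger, unsigned ScalarBits,
                                unsigned BaseCost) {
      if (LaneIsInteger)
        return 3;                          // cross-class copy, assume expensive
      if (ScalarBits <= 32)
        return std::max(BaseCost, 2u);     // NEON/VFP mixing penalty
      return BaseCost;
    }
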
@@ -333,14 +329,14 @@ unsigned ARMTTIImpl::getFPOpCost(Type *Ty) { return TargetTransformInfo::TCC_Expensive; } -unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { +int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { // We only handle costs of reverse and alternate shuffles for now. if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); if (Kind == TTI::SK_Reverse) { - static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = { + static const CostTblEntry NEONShuffleTbl[] = { // Reverse shuffle cost one instruction if we are shuffling within a // double word (vrev) or two if we shuffle a quad word (vrev, vext). {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1}, @@ -353,16 +349,16 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2}, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}}; - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx == -1) - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + if (const auto *Entry = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, + LT.second)) + return LT.first * Entry->Cost; - return LT.first * NEONShuffleTbl[Idx].Cost; + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } if (Kind == TTI::SK_Alternate) { - static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = { + static const CostTblEntry NEONAltShuffleTbl[] = { // Alt shuffle cost table for ARM. Cost is the number of instructions // required to create the shuffled vector. @@ -379,27 +375,26 @@ unsigned ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}}; - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - int Idx = - CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx == -1) - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); - return LT.first * NEONAltShuffleTbl[Idx].Cost; + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + if (const auto *Entry = CostTableLookup(NEONAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } -unsigned ARMTTIImpl::getArithmeticInstrCost( +int ARMTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode); - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); const unsigned FunctionCallDivCost = 20; const unsigned ReciprocalDivCost = 10; - static const CostTblEntry<MVT::SimpleValueType> CostTbl[] = { + static const CostTblEntry CostTbl[] = { // Division. // These costs are somewhat random. Choose a cost of 20 to indicate that // vectorizing devision (added function call) is going to be very expensive. @@ -440,16 +435,12 @@ unsigned ARMTTIImpl::getArithmeticInstrCost( // Multiplication. 
}; - int Idx = -1; - if (ST->hasNEON()) - Idx = CostTableLookup(CostTbl, ISDOpcode, LT.second); - - if (Idx != -1) - return LT.first * CostTbl[Idx].Cost; + if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second)) + return LT.first * Entry->Cost; - unsigned Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, - Opd1PropInfo, Opd2PropInfo); + int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info, + Opd1PropInfo, Opd2PropInfo); // This is somewhat of a hack. The problem that we are facing is that SROA // creates a sequence of shift, and, or instructions to construct values. @@ -465,10 +456,9 @@ unsigned ARMTTIImpl::getArithmeticInstrCost( return Cost; } -unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); +int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); if (Src->isVectorTy() && Alignment != 16 && Src->getVectorElementType()->isDoubleTy()) { @@ -479,21 +469,21 @@ unsigned ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return LT.first; } -unsigned ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace) { +int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { assert(Factor >= 2 && "Invalid interleave factor"); assert(isa<VectorType>(VecTy) && "Expect a vector type"); // vldN/vstN doesn't support vector types of i64/f64 element. - bool EltIs64Bits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()) == 64; + bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64; if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) { unsigned NumElts = VecTy->getVectorNumElements(); Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor); - unsigned SubVecSize = DL.getTypeAllocSize(SubVecTy); + unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy); // vldN/vstN only support legal vector types of size 64 or 128 in bits. 
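
The hunk around this point rewrites getInterleavedMemoryOpCost to measure element and sub-vector widths with getTypeSizeInBits, and prices the cheap vldN/vstN path only when the factor is supported, the element is not 64 bits wide, and each de-interleaved sub-vector is exactly 64 or 128 bits. The legality test, restated as one self-contained helper:

    // Illustrative check mirroring the vldN/vstN conditions in this hunk.
    bool isVldNVstNLegalSketch(unsigned Factor, unsigned MaxFactor,
                               unsigned NumElts, unsigned EltBits) {
      if (Factor < 2 || Factor > MaxFactor || EltBits == 64)
        return false;
      if (NumElts % Factor != 0)
        return false;
      unsigned SubVecBits = (NumElts / Factor) * EltBits;
      return SubVecBits == 64 || SubVecBits == 128;
    }
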
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128)) diff --git a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 84f256f..7d8d238 100644 --- a/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -41,7 +41,7 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> { const ARMTargetLowering *getTLI() const { return TLI; } public: - explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, Function &F) + explicit ARMTTIImpl(const ARMBaseTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -52,11 +52,13 @@ public: : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), TLI(std::move(Arg.TLI)) {} + bool enableInterleavedAccessVectorization() { return true; } + /// \name Scalar TTI Implementations /// @{ using BaseT::getIntImmCost; - unsigned getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(const APInt &Imm, Type *Ty); /// @} @@ -92,34 +94,31 @@ public: return 1; } - unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp); + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getAddressComputationCost(Type *Val, bool IsComplex); + int getAddressComputationCost(Type *Val, bool IsComplex); - unsigned getFPOpCost(Type *Ty); + int getFPOpCost(Type *Ty); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info = TTI::OK_AnyValue, TTI::OperandValueKind Op2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef<unsigned> Indices, - unsigned Alignment, - unsigned AddressSpace); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef<unsigned> Indices, unsigned Alignment, + unsigned AddressSpace); /// @} }; diff --git a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index cf6b892..c69a741 100644 --- a/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/contrib/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -129,7 +129,6 @@ public: }; class ARMAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; const MCRegisterInfo *MRI; UnwindContext UC; @@ -247,48 +246,49 @@ class ARMAsmParser : public MCTargetAsmParser { OperandVector &Operands); bool isThumb() const { // FIXME: Can tablegen auto-generate this? 
- return STI.getFeatureBits()[ARM::ModeThumb]; + return getSTI().getFeatureBits()[ARM::ModeThumb]; } bool isThumbOne() const { - return isThumb() && !STI.getFeatureBits()[ARM::FeatureThumb2]; + return isThumb() && !getSTI().getFeatureBits()[ARM::FeatureThumb2]; } bool isThumbTwo() const { - return isThumb() && STI.getFeatureBits()[ARM::FeatureThumb2]; + return isThumb() && getSTI().getFeatureBits()[ARM::FeatureThumb2]; } bool hasThumb() const { - return STI.getFeatureBits()[ARM::HasV4TOps]; + return getSTI().getFeatureBits()[ARM::HasV4TOps]; } bool hasV6Ops() const { - return STI.getFeatureBits()[ARM::HasV6Ops]; + return getSTI().getFeatureBits()[ARM::HasV6Ops]; } bool hasV6MOps() const { - return STI.getFeatureBits()[ARM::HasV6MOps]; + return getSTI().getFeatureBits()[ARM::HasV6MOps]; } bool hasV7Ops() const { - return STI.getFeatureBits()[ARM::HasV7Ops]; + return getSTI().getFeatureBits()[ARM::HasV7Ops]; } bool hasV8Ops() const { - return STI.getFeatureBits()[ARM::HasV8Ops]; + return getSTI().getFeatureBits()[ARM::HasV8Ops]; } bool hasARM() const { - return !STI.getFeatureBits()[ARM::FeatureNoARM]; + return !getSTI().getFeatureBits()[ARM::FeatureNoARM]; } - bool hasThumb2DSP() const { - return STI.getFeatureBits()[ARM::FeatureDSPThumb2]; + bool hasDSP() const { + return getSTI().getFeatureBits()[ARM::FeatureDSP]; } bool hasD16() const { - return STI.getFeatureBits()[ARM::FeatureD16]; + return getSTI().getFeatureBits()[ARM::FeatureD16]; } bool hasV8_1aOps() const { - return STI.getFeatureBits()[ARM::HasV8_1aOps]; + return getSTI().getFeatureBits()[ARM::HasV8_1aOps]; } void SwitchMode() { + MCSubtargetInfo &STI = copySTI(); uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb)); setAvailableFeatures(FB); } bool isMClass() const { - return STI.getFeatureBits()[ARM::FeatureMClass]; + return getSTI().getFeatureBits()[ARM::FeatureMClass]; } /// @name Auto-generated Match Functions @@ -343,14 +343,15 @@ public: Match_RequiresNotITBlock, Match_RequiresV6, Match_RequiresThumb2, + Match_RequiresV8, #define GET_OPERAND_DIAGNOSTIC_TYPES #include "ARMGenAsmMatcher.inc" }; - ARMAsmParser(MCSubtargetInfo &STI, MCAsmParser &Parser, + ARMAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : STI(STI), MII(MII), UC(Parser) { + : MCTargetAsmParser(Options, STI), MII(MII), UC(Parser) { MCAsmParserExtension::Initialize(Parser); // Cache the MCRegisterInfo. 
@@ -564,87 +565,6 @@ class ARMOperand : public MCParsedAsmOperand { public: ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} - ARMOperand(const ARMOperand &o) : MCParsedAsmOperand() { - Kind = o.Kind; - StartLoc = o.StartLoc; - EndLoc = o.EndLoc; - switch (Kind) { - case k_CondCode: - CC = o.CC; - break; - case k_ITCondMask: - ITMask = o.ITMask; - break; - case k_Token: - Tok = o.Tok; - break; - case k_CCOut: - case k_Register: - Reg = o.Reg; - break; - case k_RegisterList: - case k_DPRRegisterList: - case k_SPRRegisterList: - Registers = o.Registers; - break; - case k_VectorList: - case k_VectorListAllLanes: - case k_VectorListIndexed: - VectorList = o.VectorList; - break; - case k_CoprocNum: - case k_CoprocReg: - Cop = o.Cop; - break; - case k_CoprocOption: - CoprocOption = o.CoprocOption; - break; - case k_Immediate: - Imm = o.Imm; - break; - case k_MemBarrierOpt: - MBOpt = o.MBOpt; - break; - case k_InstSyncBarrierOpt: - ISBOpt = o.ISBOpt; - case k_Memory: - Memory = o.Memory; - break; - case k_PostIndexRegister: - PostIdxReg = o.PostIdxReg; - break; - case k_MSRMask: - MMask = o.MMask; - break; - case k_BankedReg: - BankedReg = o.BankedReg; - break; - case k_ProcIFlags: - IFlags = o.IFlags; - break; - case k_ShifterImmediate: - ShifterImm = o.ShifterImm; - break; - case k_ShiftedRegister: - RegShiftedReg = o.RegShiftedReg; - break; - case k_ShiftedImmediate: - RegShiftedImm = o.RegShiftedImm; - break; - case k_RotateImmediate: - RotImm = o.RotImm; - break; - case k_ModifiedImmediate: - ModImm = o.ModImm; - break; - case k_BitfieldDescriptor: - Bitfield = o.Bitfield; - break; - case k_VectorIndex: - VectorIndex = o.VectorIndex; - break; - } - } /// getStartLoc - Get the location of the first token of this operand. SMLoc getStartLoc() const override { return StartLoc; } @@ -4054,7 +3974,7 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) { if (FlagsVal == ~0U) return MatchOperand_NoMatch; - if (!hasThumb2DSP() && (FlagsVal & 0x400)) + if (!hasDSP() && (FlagsVal & 0x400)) // The _g and _nzcvqg versions are only valid if the DSP extension is // available. return MatchOperand_NoMatch; @@ -5202,6 +5122,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { // FALLTHROUGH } case AsmToken::Colon: { + S = Parser.getTok().getLoc(); // ":lower16:" and ":upper16:" expression prefixes // FIXME: Check it's an expression prefix, // e.g. (FOO - :lower16:BAR) isn't legal. @@ -5220,8 +5141,9 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { return false; } case AsmToken::Equal: { + S = Parser.getTok().getLoc(); if (Mnemonic != "ldr") // only parse for ldr pseudo (e.g. 
ldr r0, =val) - return Error(Parser.getTok().getLoc(), "unexpected token in operand"); + return Error(S, "unexpected token in operand"); Parser.Lex(); // Eat '=' const MCExpr *SubExprVal; @@ -5229,7 +5151,8 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { return true; E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); - const MCExpr *CPLoc = getTargetStreamer().addConstantPoolEntry(SubExprVal); + const MCExpr *CPLoc = + getTargetStreamer().addConstantPoolEntry(SubExprVal, S); Operands.push_back(ARMOperand::CreateImm(CPLoc, S, E)); return false; } @@ -5682,9 +5605,11 @@ bool ARMAsmParser::shouldOmitPredicateOperand(StringRef Mnemonic, // VRINT{Z, R, X} have a predicate operand in VFP, but not in NEON unsigned RegIdx = 3; if ((Mnemonic == "vrintz" || Mnemonic == "vrintx" || Mnemonic == "vrintr") && - static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32") { + (static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f32" || + static_cast<ARMOperand &>(*Operands[2]).getToken() == ".f16")) { if (static_cast<ARMOperand &>(*Operands[3]).isToken() && - static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32") + (static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f32" || + static_cast<ARMOperand &>(*Operands[3]).getToken() == ".f16")) RegIdx = 4; if (static_cast<ARMOperand &>(*Operands[RegIdx]).isReg() && @@ -8610,18 +8535,29 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) { if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR && inITBlock()) return Match_RequiresNotITBlock; + } else if (isThumbOne()) { + // Some high-register supporting Thumb1 encodings only allow both registers + // to be from r0-r7 when in Thumb2. + if (Opc == ARM::tADDhirr && !hasV6MOps() && + isARMLowRegister(Inst.getOperand(1).getReg()) && + isARMLowRegister(Inst.getOperand(2).getReg())) + return Match_RequiresThumb2; + // Others only require ARMv6 or later. + else if (Opc == ARM::tMOVr && !hasV6Ops() && + isARMLowRegister(Inst.getOperand(0).getReg()) && + isARMLowRegister(Inst.getOperand(1).getReg())) + return Match_RequiresV6; } - // Some high-register supporting Thumb1 encodings only allow both registers - // to be from r0-r7 when in Thumb2. - else if (Opc == ARM::tADDhirr && isThumbOne() && !hasV6MOps() && - isARMLowRegister(Inst.getOperand(1).getReg()) && - isARMLowRegister(Inst.getOperand(2).getReg())) - return Match_RequiresThumb2; - // Others only require ARMv6 or later. 
- else if (Opc == ARM::tMOVr && isThumbOne() && !hasV6Ops() && - isARMLowRegister(Inst.getOperand(0).getReg()) && - isARMLowRegister(Inst.getOperand(1).getReg())) - return Match_RequiresV6; + + for (unsigned I = 0; I < MCID.NumOperands; ++I) + if (MCID.OpInfo[I].RegClass == ARM::rGPRRegClassID) { + // rGPRRegClass excludes PC, and also excluded SP before ARMv8 + if ((Inst.getOperand(I).getReg() == ARM::SP) && !hasV8Ops()) + return Match_RequiresV8; + else if (Inst.getOperand(I).getReg() == ARM::PC) + return Match_InvalidOperand; + } + return Match_Success; } @@ -8680,7 +8616,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return false; Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: { assert(ErrorInfo && "Unknown missing feature!"); @@ -8720,6 +8656,8 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, "instruction variant requires ARMv6 or later"); case Match_RequiresThumb2: return Error(IDLoc, "instruction variant requires Thumb2"); + case Match_RequiresV8: + return Error(IDLoc, "instruction variant requires ARMv8 or later"); case Match_ImmRange0_15: { SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; @@ -8868,7 +8806,7 @@ bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) { return false; } - getParser().getStreamer().EmitValue(Value, Size); + getParser().getStreamer().EmitValue(Value, Size, L); if (getLexer().is(AsmToken::EndOfStatement)) break; @@ -9098,7 +9036,7 @@ bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) { bool ARMAsmParser::parseDirectiveArch(SMLoc L) { StringRef Arch = getParser().parseStringToEndOfStatement().trim(); - unsigned ID = ARMTargetParser::parseArch(Arch); + unsigned ID = ARM::parseArch(Arch); if (ID == ARM::AK_INVALID) { Error(L, "Unknown arch name"); @@ -9106,7 +9044,8 @@ bool ARMAsmParser::parseDirectiveArch(SMLoc L) { } Triple T; - STI.setDefaultFeatures(T.getARMCPUForArch(Arch)); + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures("", ("+" + ARM::getArchName(ID)).str()); setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); getTargetStreamer().emitArch(ID); @@ -9233,12 +9172,13 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) { // FIXME: This is using table-gen data, but should be moved to // ARMTargetParser once that is table-gen'd. 
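
checkTargetMatchPredicate above now also walks the operands of any rGPR-constrained instruction: SP is only acceptable there from ARMv8 onwards, and PC never is (the complementary MCK_rGPR case in validateTargetOperandClass below lets SP through on v8). Reduced to a sketch with an invented result enum:

    enum class RGPRCheckSketch { Success, RequiresV8, InvalidOperand };

    // Condensed form of the per-operand rGPR rule added above.
    RGPRCheckSketch checkRGPROperand(bool IsSP, bool IsPC, bool HasV8Ops) {
      if (IsSP && !HasV8Ops)
        return RGPRCheckSketch::RequiresV8;
      if (IsPC)
        return RGPRCheckSketch::InvalidOperand;
      return RGPRCheckSketch::Success;
    }
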
- if (!STI.isCPUStringValid(CPU)) { + if (!getSTI().isCPUStringValid(CPU)) { Error(L, "Unknown CPU name"); return false; } - STI.setDefaultFeatures(CPU); + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures(CPU, ""); setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); return false; @@ -9249,13 +9189,14 @@ bool ARMAsmParser::parseDirectiveFPU(SMLoc L) { SMLoc FPUNameLoc = getTok().getLoc(); StringRef FPU = getParser().parseStringToEndOfStatement().trim(); - unsigned ID = ARMTargetParser::parseFPU(FPU); + unsigned ID = ARM::parseFPU(FPU); std::vector<const char *> Features; - if (!ARMTargetParser::getFPUFeatures(ID, Features)) { + if (!ARM::getFPUFeatures(ID, Features)) { Error(FPUNameLoc, "Unknown FPU name"); return false; } + MCSubtargetInfo &STI = copySTI(); for (auto Feature : Features) STI.ApplyFeatureFlag(Feature); setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); @@ -9895,7 +9836,7 @@ bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) { SMLoc ArchLoc = Parser.getTok().getLoc(); getLexer().Lex(); - unsigned ID = ARMTargetParser::parseArch(Arch); + unsigned ID = ARM::parseArch(Arch); if (ID == ARM::AK_INVALID) { Error(ArchLoc, "unknown architecture '" + Arch + "'"); @@ -9976,22 +9917,22 @@ extern "C" void LLVMInitializeARMAsmParser() { // when we start to table-generate them, and we can use the ARM // flags below, that were generated by table-gen. static const struct { - const ARM::ArchExtKind Kind; - const unsigned ArchCheck; + const unsigned Kind; + const uint64_t ArchCheck; const FeatureBitset Features; } Extensions[] = { { ARM::AEK_CRC, Feature_HasV8, {ARM::FeatureCRC} }, { ARM::AEK_CRYPTO, Feature_HasV8, {ARM::FeatureCrypto, ARM::FeatureNEON, ARM::FeatureFPARMv8} }, { ARM::AEK_FP, Feature_HasV8, {ARM::FeatureFPARMv8} }, - { ARM::AEK_HWDIV, Feature_HasV7 | Feature_IsNotMClass, + { (ARM::AEK_HWDIV | ARM::AEK_HWDIVARM), Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureHWDiv, ARM::FeatureHWDivARM} }, { ARM::AEK_MP, Feature_HasV7 | Feature_IsNotMClass, {ARM::FeatureMP} }, { ARM::AEK_SIMD, Feature_HasV8, {ARM::FeatureNEON, ARM::FeatureFPARMv8} }, - // FIXME: Also available in ARMv6-K - { ARM::AEK_SEC, Feature_HasV7, {ARM::FeatureTrustZone} }, + { ARM::AEK_SEC, Feature_HasV6K, {ARM::FeatureTrustZone} }, // FIXME: Only available in A-class, isel not predicated { ARM::AEK_VIRT, Feature_HasV7, {ARM::FeatureVirtualization} }, + { ARM::AEK_FP16, Feature_HasV8_2a, {ARM::FeatureFPARMv8, ARM::FeatureFullFP16} }, // FIXME: Unsupported extensions. { ARM::AEK_OS, Feature_None, {} }, { ARM::AEK_IWMMXT, Feature_None, {} }, @@ -10020,7 +9961,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { EnableFeature = false; Name = Name.substr(2); } - unsigned FeatureKind = ARMTargetParser::parseArchExt(Name); + unsigned FeatureKind = ARM::parseArchExt(Name); if (FeatureKind == ARM::AEK_INVALID) Error(ExtLoc, "unknown architectural extension: " + Name); @@ -10037,6 +9978,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) { return false; } + MCSubtargetInfo &STI = copySTI(); FeatureBitset ToggleFeatures = EnableFeature ? 
(~STI.getFeatureBits() & Extension.Features) : ( STI.getFeatureBits() & Extension.Features); @@ -10078,6 +10020,10 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, "expression value must be representable in 32 bits"); } break; + case MCK_rGPR: + if (hasV8Ops() && Op.isReg() && Op.getReg() == ARM::SP) + return Match_Success; + break; case MCK_GPRPair: if (Op.isReg() && MRI->getRegClass(ARM::GPRRegClassID).contains(Op.getReg())) diff --git a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 097ec04..e63defe 100644 --- a/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/contrib/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -59,7 +59,7 @@ namespace { } // Called when decoding an IT instruction. Sets the IT state for the following - // instructions that for the IT block. Firstcond and Mask correspond to the + // instructions that for the IT block. Firstcond and Mask correspond to the // fields in the IT instruction encoding. void setITState(char Firstcond, char Mask) { // (3 - the number of trailing zeros) is the number of then / else. @@ -459,21 +459,18 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, // VFP and NEON instructions, similarly, are shared between ARM // and Thumb modes. - MI.clear(); Result = decodeInstruction(DecoderTableVFP32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableVFPV832, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableNEONData32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -485,7 +482,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -497,7 +493,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -509,7 +504,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTablev8NEON32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -517,7 +511,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTablev8Crypto32, MI, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -525,7 +518,6 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Size = 0; return MCDisassembler::Fail; } @@ -718,7 +710,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableThumbSBit16, MI, Insn16, Address, this, STI); if (Result) { @@ -729,7 +720,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableThumb216, MI, Insn16, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -763,7 +753,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, uint32_t Insn32 = (Bytes[3] << 8) 
| (Bytes[2] << 0) | (Bytes[1] << 24) | (Bytes[0] << 16); - MI.clear(); Result = decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -774,7 +763,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); Result = decodeInstruction(DecoderTableThumb232, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -784,7 +772,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { - MI.clear(); Result = decodeInstruction(DecoderTableVFP32, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -794,7 +781,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - MI.clear(); Result = decodeInstruction(DecoderTableVFPV832, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -803,7 +789,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } if (fieldFromInstruction(Insn32, 28, 4) == 0xE) { - MI.clear(); Result = decodeInstruction(DecoderTableNEONDup32, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -814,7 +799,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } if (fieldFromInstruction(Insn32, 24, 8) == 0xF9) { - MI.clear(); uint32_t NEONLdStInsn = Insn32; NEONLdStInsn &= 0xF0FFFFFF; NEONLdStInsn |= 0x04000000; @@ -828,7 +812,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } if (fieldFromInstruction(Insn32, 24, 4) == 0xF) { - MI.clear(); uint32_t NEONDataInsn = Insn32; NEONDataInsn &= 0xF0FFFFFF; // Clear bits 27-24 NEONDataInsn |= (NEONDataInsn & 0x10000000) >> 4; // Move bit 28 to bit 24 @@ -841,7 +824,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); uint32_t NEONCryptoInsn = Insn32; NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24 NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24 @@ -853,7 +835,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, return Result; } - MI.clear(); uint32_t NEONv8Insn = Insn32; NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26 Result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address, @@ -864,7 +845,6 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size, } } - MI.clear(); Size = 0; return MCDisassembler::Fail; } @@ -902,7 +882,7 @@ static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; - + if (RegNo == 15) S = MCDisassembler::SoftFail; @@ -986,8 +966,13 @@ static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo, static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; - if (RegNo == 13 || RegNo == 15) + + const FeatureBitset &featureBits = + ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + + if ((RegNo == 13 && !featureBits[ARM::HasV8Ops]) || RegNo == 15) S = MCDisassembler::SoftFail; + Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder)); return S; } @@ -1147,7 +1132,7 @@ static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val, unsigned imm = fieldFromInstruction(Val, 7, 5); // Register-immediate - if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + 
if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder))) return MCDisassembler::Fail; ARM_AM::ShiftOpc Shift = ARM_AM::lsl; @@ -1658,7 +1643,7 @@ DecodeAddrMode3Instruction(MCInst &Inst, unsigned Insn, case ARM::STRD_POST: if (P == 0 && W == 1) S = MCDisassembler::SoftFail; - + if (writeback && (Rn == 15 || Rn == Rt || Rn == Rt2)) S = MCDisassembler::SoftFail; if (type && Rm == 15) @@ -4131,7 +4116,7 @@ static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val, // indicates the move for the GE{3:0} bits, the mask{0} bit can be set // only if the processor includes the DSP extension. if (Mask == 0 || (Mask != 2 && ValLow > 3) || - (!(FeatureBits[ARM::FeatureDSPThumb2]) && (Mask & 1))) + (!(FeatureBits[ARM::FeatureDSP]) && (Mask & 1))) S = MCDisassembler::SoftFail; } } @@ -5065,6 +5050,10 @@ static DecodeStatus DecodeSwap(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { + const FeatureBitset &featureBits = + ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); + bool hasFullFP16 = featureBits[ARM::FeatureFullFP16]; + unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0); Vd |= (fieldFromInstruction(Insn, 22, 1) << 4); unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0); @@ -5075,10 +5064,35 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, DecodeStatus S = MCDisassembler::Success; - // VMOVv2f32 is ambiguous with these decodings. - if (!(imm & 0x38) && cmode == 0xF) { - if (op == 1) return MCDisassembler::Fail; - Inst.setOpcode(ARM::VMOVv2f32); + // If the top 3 bits of imm are clear, this is a VMOV (immediate) + if (!(imm & 0x38)) { + if (cmode == 0xF) { + if (op == 1) return MCDisassembler::Fail; + Inst.setOpcode(ARM::VMOVv2f32); + } + if (hasFullFP16) { + if (cmode == 0xE) { + if (op == 1) { + Inst.setOpcode(ARM::VMOVv1i64); + } else { + Inst.setOpcode(ARM::VMOVv8i8); + } + } + if (cmode == 0xD) { + if (op == 1) { + Inst.setOpcode(ARM::VMVNv2i32); + } else { + Inst.setOpcode(ARM::VMOVv2i32); + } + } + if (cmode == 0xC) { + if (op == 1) { + Inst.setOpcode(ARM::VMVNv2i32); + } else { + Inst.setOpcode(ARM::VMOVv2i32); + } + } + } return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); } @@ -5095,6 +5109,10 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn, static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { + const FeatureBitset &featureBits = + ((const MCDisassembler *)Decoder)->getSubtargetInfo().getFeatureBits(); + bool hasFullFP16 = featureBits[ARM::FeatureFullFP16]; + unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 0); Vd |= (fieldFromInstruction(Insn, 22, 1) << 4); unsigned Vm = (fieldFromInstruction(Insn, 0, 4) << 0); @@ -5105,10 +5123,35 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn, DecodeStatus S = MCDisassembler::Success; - // VMOVv4f32 is ambiguous with these decodings. 
- if (!(imm & 0x38) && cmode == 0xF) { - if (op == 1) return MCDisassembler::Fail; - Inst.setOpcode(ARM::VMOVv4f32); + // If the top 3 bits of imm are clear, this is a VMOV (immediate) + if (!(imm & 0x38)) { + if (cmode == 0xF) { + if (op == 1) return MCDisassembler::Fail; + Inst.setOpcode(ARM::VMOVv4f32); + } + if (hasFullFP16) { + if (cmode == 0xE) { + if (op == 1) { + Inst.setOpcode(ARM::VMOVv2i64); + } else { + Inst.setOpcode(ARM::VMOVv16i8); + } + } + if (cmode == 0xD) { + if (op == 1) { + Inst.setOpcode(ARM::VMVNv4i32); + } else { + Inst.setOpcode(ARM::VMOVv4i32); + } + } + if (cmode == 0xC) { + if (op == 1) { + Inst.setOpcode(ARM::VMVNv4i32); + } else { + Inst.setOpcode(ARM::VMOVv4i32); + } + } + } return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); } @@ -5132,7 +5175,7 @@ static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val, unsigned Rm = fieldFromInstruction(Val, 0, 4); Rm |= (fieldFromInstruction(Val, 23, 1) << 4); unsigned Cond = fieldFromInstruction(Val, 28, 4); - + if (fieldFromInstruction(Val, 8, 4) != 0 || Rn == Rt) S = MCDisassembler::SoftFail; diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index 0bff521..33fc85a 100644 --- a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -19,6 +19,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -804,7 +805,7 @@ void ARMInstPrinter::printMSRMaskOperand(const MCInst *MI, unsigned OpNum, unsigned Opcode = MI->getOpcode(); // For writes, handle extended mask bits if the DSP extension is present. 
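
DecodeVCVTD and DecodeVCVTQ above share the same disambiguation: when the top three bits of the immediate field are clear, the encoding is really a VMOV/VMVN (immediate), possibly one of the new FP16 forms gated on FeatureFullFP16, and only otherwise is it decoded as a fixed-point VCVT. The gating test on its own, with the field name taken from the hunk:

    // imm[5:3] clear means the encoding belongs to the VMOV-immediate family.
    bool isVMovImmediateEncodingSketch(unsigned imm) {
      return (imm & 0x38) == 0;
    }
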
- if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::FeatureDSPThumb2]) { + if (Opcode == ARM::t2MSR_M && FeatureBits[ARM::FeatureDSP]) { switch (SYSm) { case 0x400: O << "apsr_g"; diff --git a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h index 3927c9f..52f7115 100644 --- a/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h +++ b/contrib/llvm/lib/Target/ARM/InstPrinter/ARMInstPrinter.h @@ -15,12 +15,9 @@ #define LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class ARMInstPrinter : public MCInstPrinter { public: ARMInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 1114635..fa52c93 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -25,13 +25,17 @@ #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" #include "llvm/Support/MachO.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -180,9 +184,8 @@ bool ARMAsmBackend::mayNeedRelaxation(const MCInst &Inst) const { return false; } -bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, - const MCRelaxableFragment *DF, - const MCAsmLayout &Layout) const { +const char *ARMAsmBackend::reasonForFixupRelaxation(const MCFixup &Fixup, + uint64_t Value) const { switch ((unsigned)Fixup.getKind()) { case ARM::fixup_arm_thumb_br: { // Relaxing tB to t2B. tB has a signed 12-bit displacement with the @@ -192,7 +195,9 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, // // Relax if the value is too big for a (signed) i8. int64_t Offset = int64_t(Value) - 4; - return Offset > 2046 || Offset < -2048; + if (Offset > 2046 || Offset < -2048) + return "out of range pc-relative fixup value"; + break; } case ARM::fixup_arm_thumb_bcc: { // Relaxing tBcc to t2Bcc. tBcc has a signed 9-bit displacement with the @@ -202,23 +207,40 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, // // Relax if the value is too big for a (signed) i8. int64_t Offset = int64_t(Value) - 4; - return Offset > 254 || Offset < -256; + if (Offset > 254 || Offset < -256) + return "out of range pc-relative fixup value"; + break; } case ARM::fixup_thumb_adr_pcrel_10: case ARM::fixup_arm_thumb_cp: { // If the immediate is negative, greater than 1020, or not a multiple // of four, the wide version of the instruction must be used. int64_t Offset = int64_t(Value) - 4; - return Offset > 1020 || Offset < 0 || Offset & 3; + if (Offset & 3) + return "misaligned pc-relative fixup value"; + else if (Offset > 1020 || Offset < 0) + return "out of range pc-relative fixup value"; + break; } - case ARM::fixup_arm_thumb_cb: + case ARM::fixup_arm_thumb_cb: { // If we have a Thumb CBZ or CBNZ instruction and its target is the next // instruction it is is actually out of range for the instruction. 
// It will be changed to a NOP. int64_t Offset = (Value & ~1); - return Offset == 2; + if (Offset == 2) + return "will be converted to nop"; + break; } - llvm_unreachable("Unexpected fixup kind in fixupNeedsRelaxation()!"); + default: + llvm_unreachable("Unexpected fixup kind in reasonForFixupRelaxation()!"); + } + return nullptr; +} + +bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const { + return reasonForFixupRelaxation(Fixup, Value); } void ARMAsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { @@ -317,9 +339,10 @@ static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf, return Value; } -static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, - bool IsPCRel, MCContext *Ctx, - bool IsLittleEndian) { +unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value, + bool IsPCRel, MCContext *Ctx, + bool IsLittleEndian, + bool IsResolved) const { unsigned Kind = Fixup.getKind(); switch (Kind) { default: @@ -372,8 +395,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Value = -Value; isAdd = false; } - if (Ctx && Value >= 4096) - Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); + if (Ctx && Value >= 4096) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } Value |= isAdd << 23; // Same addressing mode as fixup_arm_pcrel_10, @@ -383,8 +408,6 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, return Value; } - case ARM::fixup_thumb_adr_pcrel_10: - return ((Value - 4) >> 2) & 0xff; case ARM::fixup_arm_adr_pcrel_12: { // ARM PC-relative values are offset by 8. Value -= 8; @@ -393,8 +416,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, Value = -Value; opc = 2; // 0b0010 } - if (Ctx && ARM_AM::getSOImmVal(Value) == -1) - Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); + if (Ctx && ARM_AM::getSOImmVal(Value) == -1) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } // Encode the immediate and shift the opcode into place. return ARM_AM::getSOImmVal(Value) | (opc << 21); } @@ -517,21 +542,44 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, ((uint16_t)imm10LBits) << 1); return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian); } + case ARM::fixup_thumb_adr_pcrel_10: case ARM::fixup_arm_thumb_cp: - // Offset by 4, and don't encode the low two bits. Two bytes of that - // 'off by 4' is implicitly handled by the half-word ordering of the - // Thumb encoding, so we only need to adjust by 2 here. - return ((Value - 2) >> 2) & 0xff; + // On CPUs supporting Thumb2, this will be relaxed to an ldr.w, otherwise we + // could have an error on our hands. + if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) { + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx->reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + } + // Offset by 4, and don't encode the low two bits. + return ((Value - 4) >> 2) & 0xff; case ARM::fixup_arm_thumb_cb: { // Offset by 4 and don't encode the lower bit, which is always 0. + // FIXME: diagnose if no Thumb2 uint32_t Binary = (Value - 4) >> 1; return ((Binary & 0x20) << 4) | ((Binary & 0x1f) << 3); } case ARM::fixup_arm_thumb_br: // Offset by 4 and don't encode the lower bit, which is always 0. 
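The narrow Thumb branches being checked here use a halfword-scaled signed immediate, so tB reaches roughly ±2 KiB and tBcc roughly ±256 bytes once the PC-relative bias of 4 is removed. The following standalone sketch mirrors those range tests and the final tB encoding; the helper names are invented for illustration and this is not the LLVM code itself.

#include <cassert>
#include <cstdint>

// Mirrors the tB / tBcc range tests: 'Value' is the raw fixup value before
// the PC-relative bias of 4 is stripped.
static bool tBNeedsRelaxation(uint64_t Value) {
  int64_t Offset = int64_t(Value) - 4;     // strip the PC bias
  return Offset > 2046 || Offset < -2048;  // signed 11-bit, halfword scaled
}

static bool tBccNeedsRelaxation(uint64_t Value) {
  int64_t Offset = int64_t(Value) - 4;
  return Offset > 254 || Offset < -256;    // signed 8-bit, halfword scaled
}

// Mirrors the fixup_arm_thumb_br encoding: drop the always-zero low bit.
static unsigned encodeTB(uint64_t Value) {
  return ((Value - 4) >> 1) & 0x7ff;
}

int main() {
  assert(!tBNeedsRelaxation(2050)); // offset 2046 still fits a narrow tB
  assert(tBNeedsRelaxation(2052));  // offset 2048 must relax to t2B
  assert(tBccNeedsRelaxation(260)); // offset 256 must relax to t2Bcc
  assert(encodeTB(12) == 4);        // offset 8 encodes as 8 >> 1
  return 0;
}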
+ if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) { + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx->reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + } return ((Value - 4) >> 1) & 0x7ff; case ARM::fixup_arm_thumb_bcc: // Offset by 4 and don't encode the lower bit, which is always 0. + if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) { + const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value); + if (FixupDiagnostic) { + Ctx->reportError(Fixup.getLoc(), FixupDiagnostic); + return 0; + } + } return ((Value - 4) >> 1) & 0xff; case ARM::fixup_arm_pcrel_10_unscaled: { Value = Value - 8; // ARM fixups offset by an additional word and don't @@ -542,8 +590,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, isAdd = false; } // The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8]. - if (Ctx && Value >= 256) - Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); + if (Ctx && Value >= 256) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } Value = (Value & 0xf) | ((Value & 0xf0) << 4); return Value | (isAdd << 23); } @@ -561,8 +611,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, } // These values don't encode the low two bits since they're always zero. Value >>= 2; - if (Ctx && Value >= 256) - Ctx->reportFatalError(Fixup.getLoc(), "out of range pc-relative fixup value"); + if (Ctx && Value >= 256) { + Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value"); + return 0; + } Value |= isAdd << 23; // Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords @@ -582,6 +634,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, const MCValue &Target, uint64_t &Value, bool &IsResolved) { const MCSymbolRefExpr *A = Target.getSymA(); + const MCSymbol *Sym = A ? &A->getSymbol() : nullptr; // Some fixups to thumb function symbols need the low bit (thumb bit) // twiddled. if ((unsigned)Fixup.getKind() != ARM::fixup_arm_ldst_pcrel_12 && @@ -590,18 +643,21 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, (unsigned)Fixup.getKind() != ARM::fixup_thumb_adr_pcrel_10 && (unsigned)Fixup.getKind() != ARM::fixup_t2_adr_pcrel_12 && (unsigned)Fixup.getKind() != ARM::fixup_arm_thumb_cp) { - if (A) { - const MCSymbol &Sym = A->getSymbol(); - if (Asm.isThumbFunc(&Sym)) + if (Sym) { + if (Asm.isThumbFunc(Sym)) Value |= 1; } } - // For Thumb1 BL instruction, it is possible to be a long jump between - // the basic blocks of the same function. Thus, we would like to resolve - // the offset when the destination has the same MCFragment. - if (A && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { - const MCSymbol &Sym = A->getSymbol(); - IsResolved = (Sym.getFragment() == DF); + if (IsResolved && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) { + assert(Sym && "How did we resolve this?"); + + // If the symbol is external the linker will handle it. + // FIXME: Should we handle it as an optimization? + + // If the symbol is out of range, produce a relocation and hope the + // linker can handle it. GNU AS produces an error in this case. 
+ if (Sym->isExternal() || Value >= 0x400004) + IsResolved = false; } // We must always generate a relocation for BL/BLX instructions if we have // a symbol to reference, as the linker relies on knowing the destination @@ -616,7 +672,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm, // the instruction. This allows adjustFixupValue() to issue a diagnostic // if the value aren't invalid. (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext(), - IsLittleEndian); + IsLittleEndian, IsResolved); } /// getFixupKindNumBytes - The number of bytes the fixup may change. @@ -719,7 +775,8 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); - Value = adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian); + Value = + adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian, true); if (!Value) return; // Doesn't change encoding. @@ -743,6 +800,249 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, } } +namespace CU { + +/// \brief Compact unwind encoding values. +enum CompactUnwindEncodings { + UNWIND_ARM_MODE_MASK = 0x0F000000, + UNWIND_ARM_MODE_FRAME = 0x01000000, + UNWIND_ARM_MODE_FRAME_D = 0x02000000, + UNWIND_ARM_MODE_DWARF = 0x04000000, + + UNWIND_ARM_FRAME_STACK_ADJUST_MASK = 0x00C00000, + + UNWIND_ARM_FRAME_FIRST_PUSH_R4 = 0x00000001, + UNWIND_ARM_FRAME_FIRST_PUSH_R5 = 0x00000002, + UNWIND_ARM_FRAME_FIRST_PUSH_R6 = 0x00000004, + + UNWIND_ARM_FRAME_SECOND_PUSH_R8 = 0x00000008, + UNWIND_ARM_FRAME_SECOND_PUSH_R9 = 0x00000010, + UNWIND_ARM_FRAME_SECOND_PUSH_R10 = 0x00000020, + UNWIND_ARM_FRAME_SECOND_PUSH_R11 = 0x00000040, + UNWIND_ARM_FRAME_SECOND_PUSH_R12 = 0x00000080, + + UNWIND_ARM_FRAME_D_REG_COUNT_MASK = 0x00000F00, + + UNWIND_ARM_DWARF_SECTION_OFFSET = 0x00FFFFFF +}; + +} // end CU namespace + +/// Generate compact unwind encoding for the function based on the CFI +/// instructions. If the CFI instructions describe a frame that cannot be +/// encoded in compact unwind, the method returns UNWIND_ARM_MODE_DWARF which +/// tells the runtime to fallback and unwind using dwarf. +uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const { + DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "generateCU()\n"); + // Only armv7k uses CFI based unwinding. + if (Subtype != MachO::CPU_SUBTYPE_ARM_V7K) + return 0; + // No .cfi directives means no frame. + if (Instrs.empty()) + return 0; + // Start off assuming CFA is at SP+0. + int CFARegister = ARM::SP; + int CFARegisterOffset = 0; + // Mark savable registers as initially unsaved + DenseMap<unsigned, int> RegOffsets; + int FloatRegCount = 0; + // Process each .cfi directive and build up compact unwind info. 
+ for (size_t i = 0, e = Instrs.size(); i != e; ++i) { + int Reg; + const MCCFIInstruction &Inst = Instrs[i]; + switch (Inst.getOperation()) { + case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa + CFARegisterOffset = -Inst.getOffset(); + CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + break; + case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset + CFARegisterOffset = -Inst.getOffset(); + break; + case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register + CFARegister = MRI.getLLVMRegNum(Inst.getRegister(), true); + break; + case MCCFIInstruction::OpOffset: // DW_CFA_offset + Reg = MRI.getLLVMRegNum(Inst.getRegister(), true); + if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) + RegOffsets[Reg] = Inst.getOffset(); + else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) { + RegOffsets[Reg] = Inst.getOffset(); + ++FloatRegCount; + } else { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << ".cfi_offset on unknown register=" + << Inst.getRegister() << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + break; + case MCCFIInstruction::OpRelOffset: // DW_CFA_advance_loc + // Ignore + break; + default: + // Directive not convertable to compact unwind, bail out. + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() + << "CFI directive not compatiable with comact " + "unwind encoding, opcode=" << Inst.getOperation() + << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + break; + } + } + + // If no frame set up, return no unwind info. + if ((CFARegister == ARM::SP) && (CFARegisterOffset == 0)) + return 0; + + // Verify standard frame (lr/r7) was used. + if (CFARegister != ARM::R7) { + DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() << "frame register is " + << CFARegister + << " instead of r7\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + int StackAdjust = CFARegisterOffset - 8; + if (RegOffsets.lookup(ARM::LR) != (-4 - StackAdjust)) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() + << "LR not saved as standard frame, StackAdjust=" + << StackAdjust + << ", CFARegisterOffset=" << CFARegisterOffset + << ", lr save at offset=" << RegOffsets[14] << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + if (RegOffsets.lookup(ARM::R7) != (-8 - StackAdjust)) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << "r7 not saved as standard frame\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + uint32_t CompactUnwindEncoding = CU::UNWIND_ARM_MODE_FRAME; + + // If var-args are used, there may be a stack adjust required. + switch (StackAdjust) { + case 0: + break; + case 4: + CompactUnwindEncoding |= 0x00400000; + break; + case 8: + CompactUnwindEncoding |= 0x00800000; + break; + case 12: + CompactUnwindEncoding |= 0x00C00000; + break; + default: + DEBUG_WITH_TYPE("compact-unwind", llvm::dbgs() + << ".cfi_def_cfa stack adjust (" + << StackAdjust << ") out of range\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + + // If r6 is saved, it must be right below r7. 
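To make the bit layout concrete, the sketch below composes the encoding by hand for a prologue like sub sp, #4; push {r4, r5, r7, lr}; add r7, sp, #8 (a 4-byte vararg adjust with r4/r5 saved below the r7/lr pair). The constants are copied from the CU enum above; the function and the worked value are illustrative only, not the backend routine.

#include <cstdint>
#include <cstdio>

// Constants copied from the CU::CompactUnwindEncodings enum above.
enum : uint32_t {
  UNWIND_ARM_MODE_FRAME          = 0x01000000,
  UNWIND_ARM_FRAME_FIRST_PUSH_R4 = 0x00000001,
  UNWIND_ARM_FRAME_FIRST_PUSH_R5 = 0x00000002,
  UNWIND_ARM_FRAME_FIRST_PUSH_R6 = 0x00000004,
};

// Illustrative only: encode a standard r7/lr frame with an optional r4/r5/r6
// save area and a vararg stack adjust of 0, 4, 8 or 12 bytes.
static uint32_t encodeSimpleFrame(bool SavedR4, bool SavedR5, bool SavedR6,
                                  unsigned StackAdjust) {
  uint32_t Enc = UNWIND_ARM_MODE_FRAME;
  Enc |= (StackAdjust / 4) << 22;  // 0x00400000 per 4 bytes, as in the switch above
  if (SavedR4) Enc |= UNWIND_ARM_FRAME_FIRST_PUSH_R4;
  if (SavedR5) Enc |= UNWIND_ARM_FRAME_FIRST_PUSH_R5;
  if (SavedR6) Enc |= UNWIND_ARM_FRAME_FIRST_PUSH_R6;
  return Enc;
}

int main() {
  // Prints 0x01400003: frame mode, 4-byte adjust, r4 and r5 pushed.
  std::printf("encoding = 0x%08x\n", encodeSimpleFrame(true, true, false, 4));
  return 0;
}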
+ static struct { + unsigned Reg; + unsigned Encoding; + } GPRCSRegs[] = {{ARM::R6, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R6}, + {ARM::R5, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R5}, + {ARM::R4, CU::UNWIND_ARM_FRAME_FIRST_PUSH_R4}, + {ARM::R12, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R12}, + {ARM::R11, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R11}, + {ARM::R10, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R10}, + {ARM::R9, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R9}, + {ARM::R8, CU::UNWIND_ARM_FRAME_SECOND_PUSH_R8}}; + + int CurOffset = -8 - StackAdjust; + for (auto CSReg : GPRCSRegs) { + auto Offset = RegOffsets.find(CSReg.Reg); + if (Offset == RegOffsets.end()) + continue; + + int RegOffset = Offset->second; + if (RegOffset != CurOffset - 4) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << MRI.getName(CSReg.Reg) << " saved at " + << RegOffset << " but only supported at " + << CurOffset << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + CompactUnwindEncoding |= CSReg.Encoding; + CurOffset -= 4; + } + + // If no floats saved, we are done. + if (FloatRegCount == 0) + return CompactUnwindEncoding; + + // Switch mode to include D register saving. + CompactUnwindEncoding &= ~CU::UNWIND_ARM_MODE_MASK; + CompactUnwindEncoding |= CU::UNWIND_ARM_MODE_FRAME_D; + + // FIXME: supporting more than 4 saved D-registers compactly would be trivial, + // but needs coordination with the linker and libunwind. + if (FloatRegCount > 4) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << "unsupported number of D registers saved (" + << FloatRegCount << ")\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + + // Floating point registers must either be saved sequentially, or we defer to + // DWARF. No gaps allowed here so check that each saved d-register is + // precisely where it should be. + static unsigned FPRCSRegs[] = { ARM::D8, ARM::D10, ARM::D12, ARM::D14 }; + for (int Idx = FloatRegCount - 1; Idx >= 0; --Idx) { + auto Offset = RegOffsets.find(FPRCSRegs[Idx]); + if (Offset == RegOffsets.end()) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << FloatRegCount << " D-regs saved, but " + << MRI.getName(FPRCSRegs[Idx]) + << " not saved\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } else if (Offset->second != CurOffset - 8) { + DEBUG_WITH_TYPE("compact-unwind", + llvm::dbgs() << FloatRegCount << " D-regs saved, but " + << MRI.getName(FPRCSRegs[Idx]) + << " saved at " << Offset->second + << ", expected at " << CurOffset - 8 + << "\n"); + return CU::UNWIND_ARM_MODE_DWARF; + } + CurOffset -= 8; + } + + return CompactUnwindEncoding | ((FloatRegCount - 1) << 8); +} + +static MachO::CPUSubTypeARM getMachOSubTypeFromArch(StringRef Arch) { + unsigned AK = ARM::parseArch(Arch); + switch (AK) { + default: + return MachO::CPU_SUBTYPE_ARM_V7; + case ARM::AK_ARMV4T: + return MachO::CPU_SUBTYPE_ARM_V4T; + case ARM::AK_ARMV5T: + case ARM::AK_ARMV5TE: + case ARM::AK_ARMV5TEJ: + return MachO::CPU_SUBTYPE_ARM_V5; + case ARM::AK_ARMV6: + case ARM::AK_ARMV6K: + return MachO::CPU_SUBTYPE_ARM_V6; + case ARM::AK_ARMV7A: + return MachO::CPU_SUBTYPE_ARM_V7; + case ARM::AK_ARMV7S: + return MachO::CPU_SUBTYPE_ARM_V7S; + case ARM::AK_ARMV7K: + return MachO::CPU_SUBTYPE_ARM_V7K; + case ARM::AK_ARMV6M: + return MachO::CPU_SUBTYPE_ARM_V6M; + case ARM::AK_ARMV7M: + return MachO::CPU_SUBTYPE_ARM_V7M; + case ARM::AK_ARMV7EM: + return MachO::CPU_SUBTYPE_ARM_V7EM; + } +} + MCAsmBackend *llvm::createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TheTriple, StringRef CPU, @@ -751,19 +1051,8 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T, default: 
llvm_unreachable("unsupported object format"); case Triple::MachO: { - MachO::CPUSubTypeARM CS = - StringSwitch<MachO::CPUSubTypeARM>(TheTriple.getArchName()) - .Cases("armv4t", "thumbv4t", MachO::CPU_SUBTYPE_ARM_V4T) - .Cases("armv5e", "thumbv5e", MachO::CPU_SUBTYPE_ARM_V5TEJ) - .Cases("armv6", "thumbv6", MachO::CPU_SUBTYPE_ARM_V6) - .Cases("armv6m", "thumbv6m", MachO::CPU_SUBTYPE_ARM_V6M) - .Cases("armv7em", "thumbv7em", MachO::CPU_SUBTYPE_ARM_V7EM) - .Cases("armv7k", "thumbv7k", MachO::CPU_SUBTYPE_ARM_V7K) - .Cases("armv7m", "thumbv7m", MachO::CPU_SUBTYPE_ARM_V7M) - .Cases("armv7s", "thumbv7s", MachO::CPU_SUBTYPE_ARM_V7S) - .Default(MachO::CPU_SUBTYPE_ARM_V7); - - return new ARMAsmBackendDarwin(T, TheTriple, CS); + MachO::CPUSubTypeARM CS = getMachOSubTypeFromArch(TheTriple.getArchName()); + return new ARMAsmBackendDarwin(T, TheTriple, MRI, CS); } case Triple::COFF: assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported"); diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 6b4abd5..28a6213 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -45,6 +45,10 @@ public: const MCValue &Target, uint64_t &Value, bool &IsResolved) override; + unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel, + MCContext *Ctx, bool IsLittleEndian, + bool IsResolved) const; + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const override; @@ -52,6 +56,9 @@ public: bool mayNeedRelaxation(const MCInst &Inst) const override; + const char *reasonForFixupRelaxation(const MCFixup &Fixup, + uint64_t Value) const; + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h index a6206e3..995dd0f 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h @@ -16,11 +16,12 @@ using namespace llvm; namespace { class ARMAsmBackendDarwin : public ARMAsmBackend { + const MCRegisterInfo &MRI; public: const MachO::CPUSubTypeARM Subtype; ARMAsmBackendDarwin(const Target &T, const Triple &TT, - MachO::CPUSubTypeARM st) - : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), Subtype(st) { + const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st) + : ARMAsmBackend(T, TT, /* IsLittleEndian */ true), MRI(MRI), Subtype(st) { HasDataInCodeSupport = true; } @@ -28,6 +29,9 @@ public: return createARMMachObjectWriter(OS, /*Is64Bit=*/false, MachO::CPU_TYPE_ARM, Subtype); } + + uint32_t generateCompactUnwindEncoding( + ArrayRef<MCCFIInstruction> Instrs) const override; }; } diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 804d353..52eba8be 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -95,7 +95,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case MCSymbolRefExpr::VK_GOTTPOFF: Type = ELF::R_ARM_TLS_IE32; break; - case MCSymbolRefExpr::VK_GOTPCREL: + case MCSymbolRefExpr::VK_ARM_GOT_PREL: Type = ELF::R_ARM_GOT_PREL; break; } @@ -192,7 +192,7 @@ unsigned 
ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case MCSymbolRefExpr::VK_GOTOFF: Type = ELF::R_ARM_GOTOFF32; break; - case MCSymbolRefExpr::VK_GOTPCREL: + case MCSymbolRefExpr::VK_ARM_GOT_PREL: Type = ELF::R_ARM_GOT_PREL; break; case MCSymbolRefExpr::VK_ARM_TARGET1: diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index d17fdb9..6084f22 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -79,7 +79,7 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer { void emitAttribute(unsigned Attribute, unsigned Value) override; void emitTextAttribute(unsigned Attribute, StringRef String) override; void emitIntTextAttribute(unsigned Attribute, unsigned IntValue, - StringRef StrinValue) override; + StringRef StringValue) override; void emitArch(unsigned Arch) override; void emitArchExtension(unsigned ArchExt) override; void emitObjectArch(unsigned Arch) override; @@ -195,16 +195,16 @@ void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute, OS << "\n"; } void ARMTargetAsmStreamer::emitArch(unsigned Arch) { - OS << "\t.arch\t" << ARMTargetParser::getArchName(Arch) << "\n"; + OS << "\t.arch\t" << ARM::getArchName(Arch) << "\n"; } void ARMTargetAsmStreamer::emitArchExtension(unsigned ArchExt) { - OS << "\t.arch_extension\t" << ARMTargetParser::getArchExtName(ArchExt) << "\n"; + OS << "\t.arch_extension\t" << ARM::getArchExtName(ArchExt) << "\n"; } void ARMTargetAsmStreamer::emitObjectArch(unsigned Arch) { - OS << "\t.object_arch\t" << ARMTargetParser::getArchName(Arch) << '\n'; + OS << "\t.object_arch\t" << ARM::getArchName(Arch) << '\n'; } void ARMTargetAsmStreamer::emitFPU(unsigned FPU) { - OS << "\t.fpu\t" << ARMTargetParser::getFPUName(FPU) << "\n"; + OS << "\t.fpu\t" << ARM::getFPUName(FPU) << "\n"; } void ARMTargetAsmStreamer::finishAttributeSection() { } @@ -243,7 +243,7 @@ void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset, class ARMTargetELFStreamer : public ARMTargetStreamer { private: // This structure holds all attributes, accounting for - // their string/numeric value, so we can later emmit them + // their string/numeric value, so we can later emit them // in declaration order, keeping all in the same vector struct AttributeItem { enum { @@ -254,7 +254,7 @@ private: } Type; unsigned Tag; unsigned IntValue; - StringRef StringValue; + std::string StringValue; static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) { // The conformance tag must be emitted first when serialised @@ -507,14 +507,15 @@ public: /// This is one of the functions used to emit data into an ELF section, so the /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if /// necessary. 
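One easy-to-miss change above is that AttributeItem::StringValue becomes a std::string instead of a StringRef, so the attribute table owns its text rather than pointing into a caller's buffer. The miniature below sketches the lifetime hazard that ownership avoids; the struct and helper are made up for illustration (std::string_view stands in for StringRef) and are not the streamer's types.

#include <string>
#include <string_view>
#include <vector>

// Miniature stand-in for a deferred attribute record.
struct TextAttr {
  unsigned Tag;
  std::string Value;  // owning copy: safe even after the caller's buffer dies
};

static std::vector<TextAttr> Attrs;

static void setTextAttribute(unsigned Tag, std::string_view Value) {
  // Copying into std::string is what the change buys: storing the view itself
  // would leave a dangling reference once a temporary argument is destroyed.
  Attrs.push_back({Tag, std::string(Value)});
}

int main() {
  std::string CPU = "cortex";
  setTextAttribute(/*Tag=*/5, CPU + "-a57");  // the temporary dies right here
  return Attrs.front().Value == "cortex-a57" ? 0 : 1;
}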
- void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override { + void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override { if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value)) - if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) - getContext().reportFatalError(Loc, "relocated expression must be 32-bit"); + if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) { + getContext().reportError(Loc, "relocated expression must be 32-bit"); + return; + } EmitDataMappingSymbol(); - MCELFStreamer::EmitValueImpl(Value, Size); + MCELFStreamer::EmitValueImpl(Value, Size, Loc); } void EmitAssemblerFlag(MCAssemblerFlag Flag) override { @@ -684,16 +685,16 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { using namespace ARMBuildAttrs; setAttributeItem(CPU_name, - ARMTargetParser::getCPUAttr(Arch), + ARM::getCPUAttr(Arch), false); if (EmittedArch == ARM::AK_INVALID) setAttributeItem(CPU_arch, - ARMTargetParser::getArchAttr(Arch), + ARM::getArchAttr(Arch), false); else setAttributeItem(CPU_arch, - ARMTargetParser::getArchAttr(EmittedArch), + ARM::getArchAttr(EmittedArch), false); switch (Arch) { @@ -702,7 +703,6 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::AK_ARMV3: case ARM::AK_ARMV3M: case ARM::AK_ARMV4: - case ARM::AK_ARMV5: setAttributeItem(ARM_ISA_use, Allowed, false); break; @@ -710,7 +710,6 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::AK_ARMV5T: case ARM::AK_ARMV5TE: case ARM::AK_ARMV6: - case ARM::AK_ARMV6J: setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, Allowed, false); break; @@ -721,8 +720,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { break; case ARM::AK_ARMV6K: - case ARM::AK_ARMV6Z: - case ARM::AK_ARMV6ZK: + case ARM::AK_ARMV6KZ: setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, Allowed, false); setAttributeItem(Virtualization_use, AllowTZ, false); @@ -732,10 +730,6 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { setAttributeItem(THUMB_ISA_use, Allowed, false); break; - case ARM::AK_ARMV7: - setAttributeItem(THUMB_ISA_use, AllowThumb32, false); - break; - case ARM::AK_ARMV7A: setAttributeItem(CPU_arch_profile, ApplicationProfile, false); setAttributeItem(ARM_ISA_use, Allowed, false); @@ -755,6 +749,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::AK_ARMV8A: case ARM::AK_ARMV8_1A: + case ARM::AK_ARMV8_2A: setAttributeItem(CPU_arch_profile, ApplicationProfile, false); setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, AllowThumb32, false); @@ -1084,19 +1079,14 @@ inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix, } inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) { - SwitchToEHSection(".ARM.extab", - ELF::SHT_PROGBITS, - ELF::SHF_ALLOC, - SectionKind::getDataRel(), - FnStart); + SwitchToEHSection(".ARM.extab", ELF::SHT_PROGBITS, ELF::SHF_ALLOC, + SectionKind::getData(), FnStart); } inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) { - SwitchToEHSection(".ARM.exidx", - ELF::SHT_ARM_EXIDX, + SwitchToEHSection(".ARM.exidx", ELF::SHT_ARM_EXIDX, ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER, - SectionKind::getDataRel(), - FnStart); + SectionKind::getData(), FnStart); } void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) { MCDataFragment *Frag = getOrCreateDataFragment(); diff --git 
a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp index 1ac0815..bda37f6 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp @@ -33,7 +33,9 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) { SupportsDebugInformation = true; // Exceptions handling - ExceptionsType = ExceptionHandling::SjLj; + ExceptionsType = TheTriple.isOSDarwin() && !TheTriple.isWatchOS() + ? ExceptionHandling::SjLj + : ExceptionHandling::DwarfCFI; UseIntegratedAssembler = true; } diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h index 99a5fff..5e54816 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h @@ -19,34 +19,37 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; - - class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin { - virtual void anchor(); - - public: - explicit ARMMCAsmInfoDarwin(const Triple &TheTriple); - }; - - class ARMELFMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit ARMELFMCAsmInfo(const Triple &TT); - - void setUseIntegratedAssembler(bool Value) override; - }; - - class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { - void anchor() override; - public: - explicit ARMCOFFMCAsmInfoMicrosoft(); - }; - - class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF { - void anchor() override; - public: - explicit ARMCOFFMCAsmInfoGNU(); - }; +class Triple; + +class ARMMCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit ARMMCAsmInfoDarwin(const Triple &TheTriple); +}; + +class ARMELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit ARMELFMCAsmInfo(const Triple &TT); + + void setUseIntegratedAssembler(bool Value) override; +}; + +class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + void anchor() override; + +public: + explicit ARMCOFFMCAsmInfoMicrosoft(); +}; + +class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF { + void anchor() override; + +public: + explicit ARMCOFFMCAsmInfoGNU(); +}; } // namespace llvm diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h index 9146d4d..75dde80 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h @@ -63,8 +63,8 @@ public: return false; } void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override { - return getSubExpr()->findAssociatedSection(); + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); } // There are no TLS ARMMCExprs at the moment. 
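The getMachOSubTypeFromArch helper introduced above replaces the StringSwitch over raw arch names in createARMAsmBackend: the triple's arch name is first run through ARM::parseArch, so "armv7k" and "thumbv7k" normalise to the same architecture before the Mach-O CPU subtype is chosen. A rough standalone sketch of the same idea follows; the SubType enum and the canonicalisation step are simplified stand-ins, not the MachO/TargetParser definitions.

#include <cstdio>
#include <string>

enum class SubType { V4T, V5, V6, V6M, V7, V7S, V7K, V7M, V7EM };

static std::string canonicalArch(std::string Name) {
  // "thumbv7k" and "armv7k" should map to the same architecture.
  if (Name.compare(0, 5, "thumb") == 0)
    Name = "arm" + Name.substr(5);
  return Name;
}

static SubType subTypeFromArch(const std::string &Arch) {
  std::string A = canonicalArch(Arch);
  if (A == "armv4t")                                       return SubType::V4T;
  if (A == "armv5t" || A == "armv5te" || A == "armv5tej")  return SubType::V5;
  if (A == "armv6"  || A == "armv6k")                      return SubType::V6;
  if (A == "armv6m")                                       return SubType::V6M;
  if (A == "armv7s")                                       return SubType::V7S;
  if (A == "armv7k")                                       return SubType::V7K;
  if (A == "armv7m")                                       return SubType::V7M;
  if (A == "armv7em")                                      return SubType::V7EM;
  return SubType::V7;  // default, matching the switch above
}

int main() {
  // Both spellings resolve to the same subtype.
  std::printf("%d %d\n", int(subTypeFromArch("thumbv7k")),
              int(subTypeFromArch("armv7k")));
  return 0;
}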
diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 21c9fc1..8c8c249 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -24,6 +24,7 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -134,101 +135,11 @@ std::string ARM_MC::ParseARMTriple(const Triple &TT, StringRef CPU) { bool isThumb = TT.getArch() == Triple::thumb || TT.getArch() == Triple::thumbeb; - bool NoCPU = CPU == "generic" || CPU.empty(); std::string ARMArchFeature; - switch (TT.getSubArch()) { - default: - llvm_unreachable("invalid sub-architecture for ARM"); - case Triple::ARMSubArch_v8: - if (NoCPU) - // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, - // FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, - // FeatureT2XtPk, FeatureCrypto, FeatureCRC - ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm," - "+trustzone,+t2xtpk,+crypto,+crc"; - else - // Use CPU to figure out the exact features - ARMArchFeature = "+v8"; - break; - case Triple::ARMSubArch_v8_1a: - if (NoCPU) - // v8.1a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, - // FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, - // FeatureT2XtPk, FeatureCrypto, FeatureCRC, FeatureV8_1a - ARMArchFeature = "+v8.1a,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm," - "+trustzone,+t2xtpk,+crypto,+crc"; - else - // Use CPU to figure out the exact features - ARMArchFeature = "+v8.1a"; - break; - case Triple::ARMSubArch_v7m: - isThumb = true; - if (NoCPU) - // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass - ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass"; - else - // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; - break; - case Triple::ARMSubArch_v7em: - if (NoCPU) - // v7em: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureDSPThumb2, - // FeatureT2XtPk, FeatureMClass - ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+t2dsp,+t2xtpk,+mclass"; - else - // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; - break; - case Triple::ARMSubArch_v7s: - if (NoCPU) - // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS - // Swift - ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+ras"; - else - // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; - break; - case Triple::ARMSubArch_v7: - // v7 CPUs have lots of different feature sets. If no CPU is specified, - // then assume v7a (e.g. cortex-a8) feature set. Otherwise, return - // the "minimum" feature set and use CPU string to figure out the exact - // features. - if (NoCPU) - // v7a: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk - ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk"; - else - // Use CPU to figure out the exact features. 
- ARMArchFeature = "+v7"; - break; - case Triple::ARMSubArch_v6t2: - ARMArchFeature = "+v6t2"; - break; - case Triple::ARMSubArch_v6k: - ARMArchFeature = "+v6k"; - break; - case Triple::ARMSubArch_v6m: - isThumb = true; - if (NoCPU) - // v6m: FeatureNoARM, FeatureMClass - ARMArchFeature = "+v6m,+noarm,+mclass"; - else - ARMArchFeature = "+v6"; - break; - case Triple::ARMSubArch_v6: - ARMArchFeature = "+v6"; - break; - case Triple::ARMSubArch_v5te: - ARMArchFeature = "+v5te"; - break; - case Triple::ARMSubArch_v5: - ARMArchFeature = "+v5t"; - break; - case Triple::ARMSubArch_v4t: - ARMArchFeature = "+v4t"; - break; - case Triple::NoSubArch: - break; - } + + unsigned ArchID = ARM::parseArch(TT.getArchName()); + if (ArchID != ARM::AK_INVALID && (CPU.empty() || CPU == "generic")) + ARMArchFeature = (ARMArchFeature + "+" + ARM::getArchName(ArchID)).str(); if (isThumb) { if (ARMArchFeature.empty()) diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h index fd30623..c2bbc8e 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h @@ -86,7 +86,8 @@ MCAsmBackend *createThumbBEAsmBackend(const Target &T, // object file. MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, bool RelaxAll); + MCCodeEmitter *Emitter, bool RelaxAll, + bool IncrementalLinkerCompatible); /// Construct an ELF Mach-O object writer. MCObjectWriter *createARMELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 95d7ea7..cfd504e 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -150,10 +150,12 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, // See <reloc.h>. const MCSymbol *A = &Target.getSymA()->getSymbol(); - if (!A->getFragment()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + if (!A->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), "symbol '" + A->getName() + "' can not be undefined in a subtraction expression"); + return; + } uint32_t Value = Writer->getSymbolAddress(*A, Layout); uint32_t Value2 = 0; @@ -163,10 +165,12 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer, if (const MCSymbolRefExpr *B = Target.getSymB()) { const MCSymbol *SB = &B->getSymbol(); - if (!SB->getFragment()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + if (!SB->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), "symbol '" + B->getSymbol().getName() + "' can not be undefined in a subtraction expression"); + return; + } // Select the appropriate difference relocation type. Type = MachO::ARM_RELOC_HALF_SECTDIFF; @@ -251,10 +255,12 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, // See <reloc.h>. 
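The net effect of the ParseARMTriple rewrite above is that the hand-written per-subarch switch disappears: when no concrete CPU is given, the triple's arch name is parsed with ARM::parseArch and the canonical arch name itself seeds the feature string, leaving the per-arch feature expansion to the .td definitions. A minimal standalone sketch of that flow, with parseArch/getArchName replaced by a simplified stand-in:

#include <cstdio>
#include <string>

// Simplified stand-in for ARM::parseArch + ARM::getArchName.
static std::string canonicalArchName(const std::string &TripleArch) {
  std::string Name = TripleArch;
  if (Name.compare(0, 5, "thumb") == 0)  // "thumbv7s" -> "armv7s"
    Name = "arm" + Name.substr(5);
  return Name;                           // assume the rest is already canonical
}

static std::string buildArchFeature(const std::string &TripleArch,
                                    const std::string &CPU) {
  std::string Feature;
  // Only seed the feature string from the arch when no concrete CPU is given,
  // mirroring the (CPU.empty() || CPU == "generic") test above.
  if (CPU.empty() || CPU == "generic")
    Feature = "+" + canonicalArchName(TripleArch);
  return Feature;
}

int main() {
  std::printf("%s\n", buildArchFeature("thumbv7s", "generic").c_str()); // "+armv7s"
  std::printf("%s\n", buildArchFeature("armv7s", "swift").c_str());     // "" (CPU decides)
  return 0;
}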
const MCSymbol *A = &Target.getSymA()->getSymbol(); - if (!A->getFragment()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + if (!A->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), "symbol '" + A->getName() + "' can not be undefined in a subtraction expression"); + return; + } uint32_t Value = Writer->getSymbolAddress(*A, Layout); uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); @@ -265,10 +271,12 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer, assert(Type == MachO::ARM_RELOC_VANILLA && "invalid reloc for 2 symbols"); const MCSymbol *SB = &B->getSymbol(); - if (!SB->getFragment()) - Asm.getContext().reportFatalError(Fixup.getLoc(), + if (!SB->getFragment()) { + Asm.getContext().reportError(Fixup.getLoc(), "symbol '" + B->getSymbol().getName() + "' can not be undefined in a subtraction expression"); + return; + } // Select the appropriate difference relocation type. Type = MachO::ARM_RELOC_SECTDIFF; @@ -346,13 +354,15 @@ void ARMMachObjectWriter::recordRelocation(MachObjectWriter *Writer, unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind()); unsigned Log2Size; unsigned RelocType = MachO::ARM_RELOC_VANILLA; - if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size)) + if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size)) { // If we failed to get fixup kind info, it's because there's no legal // relocation type for the fixup kind. This happens when it's a fixup that's // expected to always be resolvable at assembly time and not have any // relocations needed. - Asm.getContext().reportFatalError(Fixup.getLoc(), - "unsupported relocation on symbol"); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation on symbol"); + return; + } // If this is a difference or a defined symbol plus an offset, then we need a // scattered relocation entry. Differences always require scattered diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp index b680db5..dad50f2 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp @@ -27,8 +27,8 @@ ARMTargetStreamer::~ARMTargetStreamer() {} // The constant pool handling is shared by all ARMTargetStreamer // implementations. 
-const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr) { - return ConstantPools->addEntry(Streamer, Expr, 4); +const MCExpr *ARMTargetStreamer::addConstantPoolEntry(const MCExpr *Expr, SMLoc Loc) { + return ConstantPools->addEntry(Streamer, Expr, 4, Loc); } void ARMTargetStreamer::emitCurrentConstantPool() { diff --git a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp index b993b1b..83fa084 100644 --- a/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp +++ b/contrib/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp @@ -37,11 +37,11 @@ void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) { } } -MCStreamer *llvm::createARMWinCOFFStreamer(MCContext &Context, - MCAsmBackend &MAB, - raw_pwrite_stream &OS, - MCCodeEmitter *Emitter, - bool RelaxAll) { - return new ARMWinCOFFStreamer(Context, MAB, *Emitter, OS); +MCStreamer *llvm::createARMWinCOFFStreamer( + MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, bool RelaxAll, bool IncrementalLinkerCompatible) { + auto *S = new ARMWinCOFFStreamer(Context, MAB, *Emitter, OS); + S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); + return S; } diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp index 3b4358b..93e0ac4 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -13,6 +13,7 @@ #include "Thumb1FrameLowering.h" #include "ARMMachineFunctionInfo.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -84,7 +85,6 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - assert(&MBB == &MF.front() && "Shrink-wrapping not yet implemented"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); @@ -100,7 +100,11 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, assert(NumBytes >= ArgRegsSaveSize && "ArgRegsSaveSize is included in NumBytes"); const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; + unsigned FramePtr = RegInfo->getFrameRegister(MF); unsigned BasePtr = RegInfo->getBaseRegister(); int CFAOffset = 0; @@ -168,8 +172,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) { ++MBBI; - if (MBBI != MBB.end()) - dl = MBBI->getDebugLoc(); } // Determine starting offsets of spill areas. @@ -232,11 +234,10 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, } } - // Adjust FP so it point to the stack slot that contains the previous FP. 
if (HasFP) { - FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI) - + GPRCS1Size + ArgRegsSaveSize; + FramePtrOffsetInBlock += + MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size + ArgRegsSaveSize; AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr) .addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4) .setMIFlags(MachineInstr::FrameSetup)); @@ -321,11 +322,8 @@ static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) { void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert((MBBI->getOpcode() == ARM::tBX_RET || - MBBI->getOpcode() == ARM::tPOP_RET) && - "Can only insert epilog into returning blocks"); - DebugLoc dl = MBBI->getDebugLoc(); + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); MachineFrameInfo *MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); const ThumbRegisterInfo *RegInfo = @@ -377,9 +375,8 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, ARM::SP) .addReg(FramePtr)); } else { - if (MBBI->getOpcode() == ARM::tBX_RET && - &MBB.front() != MBBI && - std::prev(MBBI)->getOpcode() == ARM::tPOP) { + if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tBX_RET && + &MBB.front() != MBBI && std::prev(MBBI)->getOpcode() == ARM::tPOP) { MachineBasicBlock::iterator PMBBI = std::prev(MBBI); if (!tryFoldSPUpdateIntoPushPop(STI, MF, PMBBI, NumBytes)) emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes); @@ -388,66 +385,189 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF, } } - bool IsV4PopReturn = false; - for (const CalleeSavedInfo &CSI : MFI->getCalleeSavedInfo()) + if (needPopSpecialFixUp(MF)) { + bool Done = emitPopSpecialFixUp(MBB, /* DoIt */ true); + (void)Done; + assert(Done && "Emission of the special fixup failed!?"); + } +} + +bool Thumb1FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { + if (!needPopSpecialFixUp(*MBB.getParent())) + return true; + + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); + return emitPopSpecialFixUp(*TmpMBB, /* DoIt */ false); +} + +bool Thumb1FrameLowering::needPopSpecialFixUp(const MachineFunction &MF) const { + ARMFunctionInfo *AFI = + const_cast<MachineFunction *>(&MF)->getInfo<ARMFunctionInfo>(); + if (AFI->getArgRegsSaveSize()) + return true; + + // LR cannot be encoded with Thumb1, i.e., it requires a special fix-up. + for (const CalleeSavedInfo &CSI : MF.getFrameInfo()->getCalleeSavedInfo()) if (CSI.getReg() == ARM::LR) - IsV4PopReturn = true; - IsV4PopReturn &= STI.hasV4TOps() && !STI.hasV5TOps(); - - // Unlike T2 and ARM mode, the T1 pop instruction cannot restore - // to LR, and we can't pop the value directly to the PC since - // we need to update the SP after popping the value. So instead - // we have to emit: - // POP {r3} - // ADD sp, #offset - // BX r3 - // If this would clobber a return value, then generate this sequence instead: - // MOV ip, r3 - // POP {r3} - // ADD sp, #offset - // MOV lr, r3 - // MOV r3, ip - // BX lr - if (ArgRegsSaveSize || IsV4PopReturn) { - // Get the last instruction, tBX_RET - MBBI = MBB.getLastNonDebugInstr(); - assert (MBBI->getOpcode() == ARM::tBX_RET); - DebugLoc dl = MBBI->getDebugLoc(); - - if (AFI->getReturnRegsCount() <= 3) { - // Epilogue: pop saved LR to R3 and branch off it. 
- AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) - .addReg(ARM::R3, RegState::Define); - - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize); - - MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX)) - .addReg(ARM::R3, RegState::Kill); - AddDefaultPred(MIB); - MIB.copyImplicitOps(&*MBBI); - // erase the old tBX_RET instruction - MBB.erase(MBBI); - } else { - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) - .addReg(ARM::R12, RegState::Define) - .addReg(ARM::R3, RegState::Kill)); + return true; + + return false; +} + +bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, + bool DoIt) const { + MachineFunction &MF = *MBB.getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const ThumbRegisterInfo *RegInfo = + static_cast<const ThumbRegisterInfo *>(STI.getRegisterInfo()); - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) - .addReg(ARM::R3, RegState::Define); + // If MBBI is a return instruction, or is a tPOP followed by a return + // instruction in the successor BB, we may be able to directly restore + // LR in the PC. + // This is only possible with v5T ops (v4T can't change the Thumb bit via + // a POP PC instruction), and only if we do not need to emit any SP update. + // Otherwise, we need a temporary register to pop the value + // and copy that value into LR. + auto MBBI = MBB.getFirstTerminator(); + bool CanRestoreDirectly = STI.hasV5TOps() && !ArgRegsSaveSize; + if (CanRestoreDirectly) { + if (MBBI != MBB.end() && MBBI->getOpcode() != ARM::tB) + CanRestoreDirectly = (MBBI->getOpcode() == ARM::tBX_RET || + MBBI->getOpcode() == ARM::tPOP_RET); + else { + auto MBBI_prev = MBBI; + MBBI_prev--; + assert(MBBI_prev->getOpcode() == ARM::tPOP); + assert(MBB.succ_size() == 1); + if ((*MBB.succ_begin())->begin()->getOpcode() == ARM::tBX_RET) + MBBI = MBBI_prev; // Replace the final tPOP with a tPOP_RET. + else + CanRestoreDirectly = false; + } + } - emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize); + if (CanRestoreDirectly) { + if (!DoIt || MBBI->getOpcode() == ARM::tPOP_RET) + return true; + MachineInstrBuilder MIB = + AddDefaultPred( + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP_RET))); + // Copy implicit ops and popped registers, if any. + for (auto MO: MBBI->operands()) + if (MO.isReg() && (MO.isImplicit() || MO.isDef())) + MIB.addOperand(MO); + MIB.addReg(ARM::PC, RegState::Define); + // Erase the old instruction (tBX_RET or tPOP). + MBB.erase(MBBI); + return true; + } - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) - .addReg(ARM::LR, RegState::Define) - .addReg(ARM::R3, RegState::Kill)); + // Look for a temporary register to use. + // First, compute the liveness information. + LivePhysRegs UsedRegs(STI.getRegisterInfo()); + UsedRegs.addLiveOuts(&MBB, /*AddPristines*/ true); + // The semantic of pristines changed recently and now, + // the callee-saved registers that are touched in the function + // are not part of the pristines set anymore. + // Add those callee-saved now. + const TargetRegisterInfo *TRI = STI.getRegisterInfo(); + const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); + for (unsigned i = 0; CSRegs[i]; ++i) + UsedRegs.addReg(CSRegs[i]); + + DebugLoc dl = DebugLoc(); + if (MBBI != MBB.end()) { + dl = MBBI->getDebugLoc(); + auto InstUpToMBBI = MBB.end(); + while (InstUpToMBBI != MBBI) + // The pre-decrement is on purpose here. 
+ // We want to have the liveness right before MBBI. + UsedRegs.stepBackward(*--InstUpToMBBI); + } - AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) - .addReg(ARM::R3, RegState::Define) - .addReg(ARM::R12, RegState::Kill)); - // Keep the tBX_RET instruction + // Look for a register that can be directly use in the POP. + unsigned PopReg = 0; + // And some temporary register, just in case. + unsigned TemporaryReg = 0; + BitVector PopFriendly = + TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::tGPRRegClassID)); + assert(PopFriendly.any() && "No allocatable pop-friendly register?!"); + // Rebuild the GPRs from the high registers because they are removed + // form the GPR reg class for thumb1. + BitVector GPRsNoLRSP = + TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::hGPRRegClassID)); + GPRsNoLRSP |= PopFriendly; + GPRsNoLRSP.reset(ARM::LR); + GPRsNoLRSP.reset(ARM::SP); + GPRsNoLRSP.reset(ARM::PC); + for (int Register = GPRsNoLRSP.find_first(); Register != -1; + Register = GPRsNoLRSP.find_next(Register)) { + if (!UsedRegs.contains(Register)) { + // Remember the first pop-friendly register and exit. + if (PopFriendly.test(Register)) { + PopReg = Register; + TemporaryReg = 0; + break; + } + // Otherwise, remember that the register will be available to + // save a pop-friendly register. + TemporaryReg = Register; } } + + if (!DoIt && !PopReg && !TemporaryReg) + return false; + + assert((PopReg || TemporaryReg) && "Cannot get LR"); + + if (TemporaryReg) { + assert(!PopReg && "Unnecessary MOV is about to be inserted"); + PopReg = PopFriendly.find_first(); + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) + .addReg(TemporaryReg, RegState::Define) + .addReg(PopReg, RegState::Kill)); + } + + if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPOP_RET) { + // We couldn't use the direct restoration above, so + // perform the opposite conversion: tPOP_RET to tPOP. + MachineInstrBuilder MIB = + AddDefaultPred( + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII.get(ARM::tPOP))); + bool Popped = false; + for (auto MO: MBBI->operands()) + if (MO.isReg() && (MO.isImplicit() || MO.isDef()) && + MO.getReg() != ARM::PC) { + MIB.addOperand(MO); + if (!MO.isImplicit()) + Popped = true; + } + // Is there anything left to pop? + if (!Popped) + MBB.erase(MIB.getInstr()); + // Erase the old instruction. + MBB.erase(MBBI); + MBBI = AddDefaultPred(BuildMI(MBB, MBB.end(), dl, TII.get(ARM::tBX_RET))); + } + + assert(PopReg && "Do not know how to get LR"); + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP))) + .addReg(PopReg, RegState::Define); + + emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, ArgRegsSaveSize); + + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) + .addReg(ARM::LR, RegState::Define) + .addReg(PopReg, RegState::Kill)); + + if (TemporaryReg) + AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr)) + .addReg(PopReg, RegState::Define) + .addReg(TemporaryReg, RegState::Kill)); + + return true; } bool Thumb1FrameLowering:: @@ -461,8 +581,6 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, DebugLoc DL; const TargetInstrInfo &TII = *STI.getInstrInfo(); - if (MI != MBB.end()) DL = MI->getDebugLoc(); - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(ARM::tPUSH)); AddDefaultPred(MIB); for (unsigned i = CSI.size(); i != 0; --i) { @@ -501,31 +619,38 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB, const TargetInstrInfo &TII = *STI.getInstrInfo(); bool isVarArg = AFI->getArgRegsSaveSize() > 0; - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI != MBB.end() ? 
MI->getDebugLoc() : DebugLoc(); MachineInstrBuilder MIB = BuildMI(MF, DL, TII.get(ARM::tPOP)); AddDefaultPred(MIB); - bool NumRegs = false; + bool NeedsPop = false; for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); if (Reg == ARM::LR) { - // Special epilogue for vararg functions. See emitEpilogue - if (isVarArg) - continue; - // ARMv4T requires BX, see emitEpilogue - if (STI.hasV4TOps() && !STI.hasV5TOps()) + if (MBB.succ_empty()) { + // Special epilogue for vararg functions. See emitEpilogue + if (isVarArg) + continue; + // ARMv4T requires BX, see emitEpilogue + if (!STI.hasV5TOps()) + continue; + Reg = ARM::PC; + (*MIB).setDesc(TII.get(ARM::tPOP_RET)); + if (MI != MBB.end()) + MIB.copyImplicitOps(&*MI); + MI = MBB.erase(MI); + } else + // LR may only be popped into PC, as part of return sequence. + // If this isn't the return sequence, we'll need emitPopSpecialFixUp + // to restore LR the hard way. continue; - Reg = ARM::PC; - (*MIB).setDesc(TII.get(ARM::tPOP_RET)); - MIB.copyImplicitOps(&*MI); - MI = MBB.erase(MI); } MIB.addReg(Reg, getDefRegState(true)); - NumRegs = true; + NeedsPop = true; } // It's illegal to emit pop instruction without operands. - if (NumRegs) + if (NeedsPop) MBB.insert(MI, &*MIB); else MF.DeleteMachineInstr(MIB); diff --git a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h index 31d5732..812f983 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h +++ b/contrib/llvm/lib/Target/ARM/Thumb1FrameLowering.h @@ -45,6 +45,42 @@ public: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + + /// Check whether or not the given \p MBB can be used as a epilogue + /// for the target. + /// The epilogue will be inserted before the first terminator of that block. + /// This method is used by the shrink-wrapping pass to decide if + /// \p MBB will be correctly handled by the target. + bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; + +private: + /// Check if the frame lowering of \p MF needs a special fixup + /// code sequence for the epilogue. + /// Unlike T2 and ARM mode, the T1 pop instruction cannot restore + /// to LR, and we can't pop the value directly to the PC when + /// we need to update the SP after popping the value. So instead + /// we have to emit: + /// POP {r3} + /// ADD sp, #offset + /// BX r3 + /// If this would clobber a return value, then generate this sequence instead: + /// MOV ip, r3 + /// POP {r3} + /// ADD sp, #offset + /// MOV lr, r3 + /// MOV r3, ip + /// BX lr + bool needPopSpecialFixUp(const MachineFunction &MF) const; + + /// Emit the special fixup code sequence for the epilogue. + /// \see needPopSpecialFixUp for more details. + /// \p DoIt, tells this method whether or not to actually insert + /// the code sequence in \p MBB. I.e., when \p DoIt is false, + /// \p MBB is left untouched. + /// \returns For \p DoIt == true: True when the emission succeeded + /// false otherwise. For \p DoIt == false: True when the emission + /// would have been possible, false otherwise. 
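The doc comment above spells out the two instruction sequences the special fix-up falls back to when LR cannot be popped straight into PC. Purely as an illustration of those sequences (a hypothetical helper, not the pass itself), the sketch below prints the variant chosen depending on whether r3 still carries a return value, with r3 as the pop-friendly scratch register from the comment:

#include <cstdio>

// Illustrative only: 'offset' is the SP adjustment for the vararg area and
// 'r3Live' says whether r3 still holds a return value that must be preserved.
static void printEpilogueFixUp(unsigned offset, bool r3Live) {
  if (r3Live)
    std::printf("  mov ip, r3\n");   // park the return value out of the way
  std::printf("  pop {r3}\n");       // reload the saved LR into a low register
  if (offset)
    std::printf("  add sp, #%u\n", offset);
  if (r3Live) {
    std::printf("  mov lr, r3\n");   // move the return address back into LR
    std::printf("  mov r3, ip\n");   // restore the return value
    std::printf("  bx lr\n");
  } else {
    std::printf("  bx r3\n");        // branch through the scratch register
  }
}

int main() {
  printEpilogueFixUp(8, false);
  std::printf("--\n");
  printEpilogueFixUp(8, true);
  return 0;
}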
+ bool emitPopSpecialFixUp(MachineBasicBlock &MBB, bool DoIt) const; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index 216e776..530e1d3 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -84,11 +84,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSTRspi)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); @@ -112,11 +110,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO)); } diff --git a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp index 68736bc..bf0498d 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp @@ -256,8 +256,8 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) { LastITMI->findRegisterUseOperand(ARM::ITSTATE)->setIsKill(); // Finalize the bundle. 
- MachineBasicBlock::instr_iterator LI = LastITMI; - finalizeBundle(MBB, InsertPos.getInstrIterator(), std::next(LI)); + finalizeBundle(MBB, InsertPos.getInstrIterator(), + ++LastITMI->getIterator()); Modified = true; ++NumITs; diff --git a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index dc74f4e..4da769f 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -131,11 +131,9 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); if (RC == &ARM::GPRRegClass || RC == &ARM::tGPRRegClass || RC == &ARM::tcGPRRegClass || RC == &ARM::rGPRRegClass || @@ -171,11 +169,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); DebugLoc DL; if (I != MBB.end()) DL = I->getDebugLoc(); diff --git a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index d9ab824..bcd0e57 100644 --- a/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/contrib/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -125,7 +125,10 @@ namespace { { ARM::t2LDMIA, ARM::tLDMIA, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2LDMIA_RET,0, ARM::tPOP_RET, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2LDMIA_UPD,ARM::tLDMIA_UPD,ARM::tPOP,0, 0, 1, 1, 1,1, 0,1,0 }, - // ARM::t2STM (with no basereg writeback) has no Thumb1 equivalent + // ARM::t2STMIA (with no basereg writeback) has no Thumb1 equivalent. + // tSTMIA_UPD is a change in semantics which can only be used if the base + // register is killed. This difference is correctly handled elsewhere. 
+ { ARM::t2STMIA, ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2STMIA_UPD,ARM::tSTMIA_UPD, 0, 0, 0, 1, 1, 1,1, 0,1,0 }, { ARM::t2STMDB_UPD, 0, ARM::tPUSH, 0, 0, 1, 1, 1,1, 0,1,0 } }; @@ -210,12 +213,12 @@ Thumb2SizeReduce::Thumb2SizeReduce(std::function<bool(const Function &)> Ftor) for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) { unsigned FromOpc = ReduceTable[i].WideOpc; if (!ReduceOpcodeMap.insert(std::make_pair(FromOpc, i)).second) - assert(false && "Duplicated entries?"); + llvm_unreachable("Duplicated entries?"); } } static bool HasImplicitCPSRDef(const MCInstrDesc &MCID) { - for (const uint16_t *Regs = MCID.getImplicitDefs(); *Regs; ++Regs) + for (const MCPhysReg *Regs = MCID.getImplicitDefs(); *Regs; ++Regs) if (*Regs == ARM::CPSR) return true; return false; @@ -435,6 +438,14 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, isLdStMul = true; break; } + case ARM::t2STMIA: { + // If the base register is killed, we don't care what its value is after the + // instruction, so we can use an updating STMIA. + if (!MI->getOperand(0).isKill()) + return false; + + break; + } case ARM::t2LDMIA_RET: { unsigned BaseReg = MI->getOperand(1).getReg(); if (BaseReg != ARM::SP) @@ -492,6 +503,12 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI, // Add the 16-bit load / store instruction. DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(MBB, MI, dl, TII->get(Opc)); + + // tSTMIA_UPD takes a defining register operand. We've already checked that + // the register is killed, so mark it as dead here. + if (Entry.WideOpc == ARM::t2STMIA) + MIB.addReg(MI->getOperand(0).getReg(), RegState::Define | RegState::Dead); + if (!isLdStMul) { MIB.addOperand(MI->getOperand(0)); MIB.addOperand(MI->getOperand(1)); @@ -633,10 +650,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr)) return false; - if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs && - STI->avoidMOVsShifterOperand()) + if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand()) // Don't issue movs with shifter operand for some CPUs unless we - // are optimizing / minimizing for size. + // are optimizing for size. return false; unsigned Reg0 = MI->getOperand(0).getReg(); @@ -660,11 +676,13 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, } } else if (Reg0 != Reg1) { // Try to commute the operands to make it a 2-address instruction. - unsigned CommOpIdx1, CommOpIdx2; + unsigned CommOpIdx1 = 1; + unsigned CommOpIdx2 = TargetInstrInfo::CommuteAnyOperandIndex; if (!TII->findCommutedOpIndices(MI, CommOpIdx1, CommOpIdx2) || - CommOpIdx1 != 1 || MI->getOperand(CommOpIdx2).getReg() != Reg0) + MI->getOperand(CommOpIdx2).getReg() != Reg0) return false; - MachineInstr *CommutedMI = TII->commuteInstruction(MI); + MachineInstr *CommutedMI = + TII->commuteInstruction(MI, false, CommOpIdx1, CommOpIdx2); if (!CommutedMI) return false; } @@ -750,10 +768,9 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI, if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit)) return false; - if (!MinimizeSize && !OptimizeSize && Entry.AvoidMovs && - STI->avoidMOVsShifterOperand()) + if (!OptimizeSize && Entry.AvoidMovs && STI->avoidMOVsShifterOperand()) // Don't issue movs with shifter operand for some CPUs unless we - // are optimizing / minimizing for size. + // are optimizing for size. 
return false; unsigned Limit = ~0U; @@ -1012,9 +1029,9 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) { TII = static_cast<const Thumb2InstrInfo *>(STI->getInstrInfo()); - // Optimizing / minimizing size? - OptimizeSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); - MinimizeSize = MF.getFunction()->hasFnAttribute(Attribute::MinSize); + // Optimizing / minimizing size? Minimizing size implies optimizing for size. + OptimizeSize = MF.getFunction()->optForSize(); + MinimizeSize = MF.getFunction()->optForMinSize(); BlockInfo.clear(); BlockInfo.resize(MF.getNumBlockIDs()); diff --git a/contrib/llvm/lib/Target/AVR/AVR.td b/contrib/llvm/lib/Target/AVR/AVR.td new file mode 100644 index 0000000..9e80717 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVR.td @@ -0,0 +1,563 @@ +//===-- AVR.td - Describe the AVR Target Machine ----------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// This is the top level entry point for the AVR target. +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===---------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===---------------------------------------------------------------------===// +// AVR Subtarget Features. +//===---------------------------------------------------------------------===// + +// :TODO: Implement the skip errata, see `gcc/config/avr/avr-arch.h` for details +// :TODO: We define all devices with SRAM to have all variants of LD/ST/LDD/STD. +// In reality, avr1 (no SRAM) has one variant each of `LD` and `ST`. +// avr2 (with SRAM) adds the rest of the variants. +// :TODO: s/AVRTiny/Tiny + + +// A feature set aggregates features, grouping them. We don't want to create a +// new member in AVRSubtarget (to store a value) for each set because we do not +// care if the set is supported, only the subfeatures inside the set. We fix +// this by simply setting the same dummy member for all feature sets, which is +// then ignored. +class FeatureSet<string name, string desc, list<SubtargetFeature> i> + : SubtargetFeature<name, "m_FeatureSetDummy", "true", desc, i>; + +// A family of microcontrollers, defining a set of supported features. +class Family<string name, list<SubtargetFeature> i> + : FeatureSet<name, !strconcat("The device is a part of the ", + name, " family"), i>; + +// The device has SRAM, and supports the bare minimum of +// SRAM-relevant instructions. +// +// These are: +// LD - all 9 variants +// ST - all 9 variants +// LDD - two variants for Y and Z +// STD - two variants for Y and Z +// `LDS Rd, K` +// `STS k, Rr` +// `PUSH`/`POP` +def FeatureSRAM : SubtargetFeature<"sram", "m_hasSRAM", "true", + "The device has random access memory">; + +// The device supports the `JMP k` and `CALL k` instructions. +def FeatureJMPCALL : SubtargetFeature<"jmpcall", "m_hasJMPCALL", "true", + "The device supports the `JMP` and " + "`CALL` instructions">; + + +// The device supports the indirect branches `IJMP` and `ICALL`. 
+def FeatureIJMPCALL : SubtargetFeature<"ijmpcall", "m_hasIJMPCALL",
+                                       "true",
+                                       "The device supports the `IJMP`/`ICALL` "
+                                       "instructions">;
+
+// The device supports the extended indirect branches `EIJMP` and `EICALL`.
+def FeatureEIJMPCALL : SubtargetFeature<"eijmpcall", "m_hasEIJMPCALL",
+                                        "true", "The device supports the "
+                                        "`EIJMP`/`EICALL` instructions">;
+
+// The device supports the 16-bit `ADIW Rd, K` and `SBIW Rd, K` instructions.
+def FeatureADDSUBIW : SubtargetFeature<"addsubiw", "m_hasADDSUBIW",
+                                       "true", "Enable 16-bit register-immediate "
+                                       "addition and subtraction instructions">;
+
+// The device has an 8-bit stack pointer (SP) register.
+def FeatureSmallStack : SubtargetFeature<"smallstack", "m_hasSmallStack",
+                                         "true", "The device has an 8-bit "
+                                         "stack pointer">;
+
+// The device supports the 16-bit GPR pair MOVW instruction.
+def FeatureMOVW : SubtargetFeature<"movw", "m_hasMOVW", "true",
+                                   "The device supports the 16-bit MOVW "
+                                   "instruction">;
+
+// The device supports the `LPM` instruction, with implied destination being r0.
+def FeatureLPM : SubtargetFeature<"lpm", "m_hasLPM", "true",
+                                  "The device supports the `LPM` instruction">;
+
+// The device supports the `LPM Rd, Z[+]` instruction.
+def FeatureLPMX : SubtargetFeature<"lpmx", "m_hasLPMX", "true",
+                                   "The device supports the `LPM Rd, Z[+]` "
+                                   "instruction">;
+
+// The device supports the `ELPM` instruction.
+def FeatureELPM : SubtargetFeature<"elpm", "m_hasELPM", "true",
+                                   "The device supports the ELPM instruction">;
+
+// The device supports the `ELPM Rd, Z[+]` instructions.
+def FeatureELPMX : SubtargetFeature<"elpmx", "m_hasELPMX", "true",
+                                    "The device supports the `ELPM Rd, Z[+]` "
+                                    "instructions">;
+
+// The device supports the `SPM` instruction.
+def FeatureSPM : SubtargetFeature<"spm", "m_hasSPM", "true",
+                                  "The device supports the `SPM` instruction">;
+
+// The device supports the `SPM Z+` instruction.
+def FeatureSPMX : SubtargetFeature<"spmx", "m_hasSPMX", "true",
+                                   "The device supports the `SPM Z+` "
+                                   "instruction">;
+
+// The device supports the `DES k` instruction.
+def FeatureDES : SubtargetFeature<"des", "m_hasDES", "true",
+                                  "The device supports the `DES k` encryption "
+                                  "instruction">;
+
+// The device supports the Read-Write-Modify instructions
+// XCH, LAS, LAC, and LAT.
+def FeatureRMW : SubtargetFeature<"rmw", "m_supportsRMW", "true",
+                                  "The device supports the read-write-modify "
+                                  "instructions: XCH, LAS, LAC, LAT">;
+
+// The device supports the `[F]MUL[S][U]` family of instructions.
+def FeatureMultiplication : SubtargetFeature<"mul", "m_supportsMultiplication",
+                                             "true", "The device supports the "
+                                             "multiplication instructions">;
+
+// The device supports the `BREAK` instruction.
+def FeatureBREAK : SubtargetFeature<"break", "m_hasBREAK", "true",
+                                    "The device supports the `BREAK` debugging "
+                                    "instruction">;
+
+// The device has instruction encodings specific to the Tiny core.
+def FeatureTinyEncoding : SubtargetFeature<"tinyencoding", + "m_hasTinyEncoding", "true", + "The device has Tiny core specific " + "instruction encodings">; + +class ELFArch<string name> : SubtargetFeature<"", "ELFArch", + !strconcat("ELF::",name), "">; + +// ELF e_flags architecture values +def ELFArchAVR1 : ELFArch<"EF_AVR_ARCH_AVR1">; +def ELFArchAVR2 : ELFArch<"EF_AVR_ARCH_AVR2">; +def ELFArchAVR25 : ELFArch<"EF_AVR_ARCH_AVR25">; +def ELFArchAVR3 : ELFArch<"EF_AVR_ARCH_AVR3">; +def ELFArchAVR31 : ELFArch<"EF_AVR_ARCH_AVR31">; +def ELFArchAVR35 : ELFArch<"EF_AVR_ARCH_AVR35">; +def ELFArchAVR4 : ELFArch<"EF_AVR_ARCH_AVR4">; +def ELFArchAVR5 : ELFArch<"EF_AVR_ARCH_AVR5">; +def ELFArchAVR51 : ELFArch<"EF_AVR_ARCH_AVR51">; +def ELFArchAVR6 : ELFArch<"EF_AVR_ARCH_AVR6">; +def ELFArchAVRTiny : ELFArch<"EF_AVR_ARCH_AVRTINY">; +def ELFArchXMEGA1 : ELFArch<"EF_AVR_ARCH_XMEGA1">; +def ELFArchXMEGA2 : ELFArch<"EF_AVR_ARCH_XMEGA2">; +def ELFArchXMEGA3 : ELFArch<"EF_AVR_ARCH_XMEGA3">; +def ELFArchXMEGA4 : ELFArch<"EF_AVR_ARCH_XMEGA4">; +def ELFArchXMEGA5 : ELFArch<"EF_AVR_ARCH_XMEGA5">; +def ELFArchXMEGA6 : ELFArch<"EF_AVR_ARCH_XMEGA6">; +def ELFArchXMEGA7 : ELFArch<"EF_AVR_ARCH_XMEGA7">; + +//===---------------------------------------------------------------------===// +// AVR Families +//===---------------------------------------------------------------------===// + +// The device has at least the bare minimum that **every** single AVR +// device should have. +def FamilyAVR0 : Family<"avr0", []>; + +def FamilyAVR1 : Family<"avr1", [FamilyAVR0, FeatureLPM]>; + +def FamilyAVR2 : Family<"avr2", + [FamilyAVR1, FeatureIJMPCALL, FeatureADDSUBIW, + FeatureSRAM]>; + +def FamilyAVR25 : Family<"avr25", + [FamilyAVR2, FeatureMOVW, FeatureLPMX, + FeatureSPM, FeatureBREAK]>; + +def FamilyAVR3 : Family<"avr3", + [FamilyAVR2, FeatureJMPCALL]>; + +def FamilyAVR31 : Family<"avr31", + [FamilyAVR3, FeatureELPM]>; + +def FamilyAVR35 : Family<"avr35", + [FamilyAVR3, FeatureMOVW, FeatureLPMX, + FeatureSPM, FeatureBREAK]>; + +def FamilyAVR4 : Family<"avr4", + [FamilyAVR2, FeatureMultiplication, + FeatureMOVW, FeatureLPMX, FeatureSPM, + FeatureBREAK]>; + +def FamilyAVR5 : Family<"avr5", + [FamilyAVR3, FeatureMultiplication, + FeatureMOVW, FeatureLPMX, FeatureSPM, + FeatureBREAK]>; + +def FamilyAVR51 : Family<"avr51", + [FamilyAVR5, FeatureELPM, FeatureELPMX]>; + +def FamilyAVR6 : Family<"avr6", + [FamilyAVR51]>; + +def FamilyAVRTiny : Family<"avrtiny", + [FamilyAVR0, FeatureBREAK, FeatureSRAM, + FeatureTinyEncoding]>; + +def FamilyXMEGA : Family<"xmega", + [FamilyAVR51, FeatureEIJMPCALL, FeatureSPMX, + FeatureDES]>; + +def FamilyXMEGAU : Family<"xmegau", + [FamilyXMEGA, FeatureRMW]>; + +def FeatureSetSpecial : FeatureSet<"special", + "Enable use of the entire instruction " + "set - used for debugging", + [FeatureSRAM, FeatureJMPCALL, + FeatureIJMPCALL, FeatureEIJMPCALL, + FeatureADDSUBIW, FeatureMOVW, + FeatureLPM, FeatureLPMX, FeatureELPM, + FeatureELPMX, FeatureSPM, FeatureSPMX, + FeatureDES, FeatureRMW, + FeatureMultiplication, FeatureBREAK]>; + +//===---------------------------------------------------------------------===// +// AVR microcontrollers supported. 
+//===---------------------------------------------------------------------===// + +class Device<string Name, Family Fam, ELFArch Arch, + list<SubtargetFeature> ExtraFeatures = []> + : Processor<Name, NoItineraries, !listconcat([Fam,Arch],ExtraFeatures)>; + +// Generic MCUs +// Note that several versions of GCC has strange ELF architecture +// settings for backwards compatibility - see `gas/config/tc-avr.c` +// in AVR binutils. We do not replicate this. +def : Device<"avr1", FamilyAVR1, ELFArchAVR1>; +def : Device<"avr2", FamilyAVR2, ELFArchAVR2>; +def : Device<"avr25", FamilyAVR25, ELFArchAVR25>; +def : Device<"avr3", FamilyAVR3, ELFArchAVR3>; +def : Device<"avr31", FamilyAVR31, ELFArchAVR31>; +def : Device<"avr35", FamilyAVR35, ELFArchAVR35>; +def : Device<"avr4", FamilyAVR4, ELFArchAVR4>; +def : Device<"avr5", FamilyAVR5, ELFArchAVR5>; +def : Device<"avr51", FamilyAVR51, ELFArchAVR51>; +def : Device<"avr6", FamilyAVR6, ELFArchAVR6>; +def : Device<"avrxmega1", FamilyXMEGA, ELFArchXMEGA1>; +def : Device<"avrxmega2", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"avrxmega3", FamilyXMEGA, ELFArchXMEGA3>; +def : Device<"avrxmega4", FamilyXMEGA, ELFArchXMEGA4>; +def : Device<"avrxmega5", FamilyXMEGA, ELFArchXMEGA5>; +def : Device<"avrxmega6", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"avrxmega7", FamilyXMEGA, ELFArchXMEGA7>; +def : Device<"avrtiny", FamilyAVRTiny, ELFArchAVRTiny>; + +// Specific MCUs +def : Device<"at90s1200", FamilyAVR0, ELFArchAVR1>; +def : Device<"attiny11", FamilyAVR1, ELFArchAVR1>; +def : Device<"attiny12", FamilyAVR1, ELFArchAVR1>; +def : Device<"attiny15", FamilyAVR1, ELFArchAVR1>; +def : Device<"attiny28", FamilyAVR1, ELFArchAVR1>; +def : Device<"at90s2313", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s2323", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s2333", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s2343", FamilyAVR2, ELFArchAVR2>; +def : Device<"attiny22", FamilyAVR2, ELFArchAVR2>; +def : Device<"attiny26", FamilyAVR2, ELFArchAVR2, [FeatureLPMX]>; +def : Device<"at86rf401", FamilyAVR2, ELFArchAVR25, + [FeatureMOVW, FeatureLPMX]>; +def : Device<"at90s4414", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s4433", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s4434", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s8515", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90c8534", FamilyAVR2, ELFArchAVR2>; +def : Device<"at90s8535", FamilyAVR2, ELFArchAVR2>; +def : Device<"ata5272", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny13", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny13a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny2313", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny2313a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny24", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny24a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny4313", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny44", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny44a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny84", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny84a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny25", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny45", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny85", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny261", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny261a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny461", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny461a", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny861", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny861a", FamilyAVR25, 
ELFArchAVR25>; +def : Device<"attiny87", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny43u", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny48", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny88", FamilyAVR25, ELFArchAVR25>; +def : Device<"attiny828", FamilyAVR25, ELFArchAVR25>; +def : Device<"at43usb355", FamilyAVR3, ELFArchAVR3>; +def : Device<"at76c711", FamilyAVR3, ELFArchAVR3>; +def : Device<"atmega103", FamilyAVR31, ELFArchAVR31>; +def : Device<"at43usb320", FamilyAVR31, ELFArchAVR31>; +def : Device<"attiny167", FamilyAVR35, ELFArchAVR35>; +def : Device<"at90usb82", FamilyAVR35, ELFArchAVR35>; +def : Device<"at90usb162", FamilyAVR35, ELFArchAVR35>; +def : Device<"ata5505", FamilyAVR35, ELFArchAVR35>; +def : Device<"atmega8u2", FamilyAVR35, ELFArchAVR35>; +def : Device<"atmega16u2", FamilyAVR35, ELFArchAVR35>; +def : Device<"atmega32u2", FamilyAVR35, ELFArchAVR35>; +def : Device<"attiny1634", FamilyAVR35, ELFArchAVR35>; +def : Device<"atmega8", FamilyAVR4, ELFArchAVR4>; // FIXME: family may be wrong +def : Device<"ata6289", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega8a", FamilyAVR4, ELFArchAVR4>; +def : Device<"ata6285", FamilyAVR4, ELFArchAVR4>; +def : Device<"ata6286", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega48", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega48a", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega48pa", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega48p", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega88", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega88a", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega88p", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega88pa", FamilyAVR4, ELFArchAVR4>; +def : Device<"atmega8515", FamilyAVR2, ELFArchAVR4, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; +def : Device<"atmega8535", FamilyAVR2, ELFArchAVR4, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; +def : Device<"atmega8hva", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm1", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm2", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm2b", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm3", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm3b", FamilyAVR4, ELFArchAVR4>; +def : Device<"at90pwm81", FamilyAVR4, ELFArchAVR4>; +def : Device<"ata5790", FamilyAVR5, ELFArchAVR5>; +def : Device<"ata5795", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega161", FamilyAVR3, ELFArchAVR5, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; +def : Device<"atmega162", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega163", FamilyAVR3, ELFArchAVR5, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX, FeatureSPM]>; +def : Device<"atmega164a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega164p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega164pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega165", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega165a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega165p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega165pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega168", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega168a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega168p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega168pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega169", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega169a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega169p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega169pa", 
FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega323", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega324a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega324p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega324pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega325", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega325a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega325p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega325pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3250", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3250a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3250p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3250pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega328", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega328p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega329", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega329a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega329p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega329pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3290", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3290a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3290p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega3290pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega406", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega640", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644pa", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega645", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega645a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega645p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega649", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega649a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega649p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6450", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6450a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6450p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6490", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6490a", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega6490p", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64rfr2", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega644rfr2", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16hva", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16hva2", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16hvb", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16hvbrevb", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32hvb", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32hvbrevb", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64hve", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90can32", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90can64", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90pwm161", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90pwm216", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90pwm316", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32c1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64c1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16m1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32m1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega64m1", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega16u4", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega32u4", FamilyAVR5, 
ELFArchAVR5>; +def : Device<"atmega32u6", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90usb646", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90usb647", FamilyAVR5, ELFArchAVR5>; +def : Device<"at90scr100", FamilyAVR5, ELFArchAVR5>; +def : Device<"at94k", FamilyAVR3, ELFArchAVR5, + [FeatureMultiplication, FeatureMOVW, FeatureLPMX]>; +def : Device<"m3000", FamilyAVR5, ELFArchAVR5>; +def : Device<"atmega128", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega128a", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1280", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1281", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1284", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1284p", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega128rfa1", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega128rfr2", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega1284rfr2", FamilyAVR51, ELFArchAVR51>; +def : Device<"at90can128", FamilyAVR51, ELFArchAVR51>; +def : Device<"at90usb1286", FamilyAVR51, ELFArchAVR51>; +def : Device<"at90usb1287", FamilyAVR51, ELFArchAVR51>; +def : Device<"atmega2560", FamilyAVR6, ELFArchAVR6>; +def : Device<"atmega2561", FamilyAVR6, ELFArchAVR6>; +def : Device<"atmega256rfr2", FamilyAVR6, ELFArchAVR6>; +def : Device<"atmega2564rfr2", FamilyAVR6, ELFArchAVR6>; +def : Device<"atxmega16a4", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega16a4u", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega16c4", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega16d4", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega32a4", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega32a4u", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega32c4", FamilyXMEGAU, ELFArchXMEGA2>; +def : Device<"atxmega32d4", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega32e5", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega16e5", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega8e5", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega32x1", FamilyXMEGA, ELFArchXMEGA2>; +def : Device<"atxmega64a3", FamilyXMEGA, ELFArchXMEGA4>; +def : Device<"atxmega64a3u", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64a4u", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64b1", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64b3", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64c3", FamilyXMEGAU, ELFArchXMEGA4>; +def : Device<"atxmega64d3", FamilyXMEGA, ELFArchXMEGA4>; +def : Device<"atxmega64d4", FamilyXMEGA, ELFArchXMEGA4>; +def : Device<"atxmega64a1", FamilyXMEGA, ELFArchXMEGA5>; +def : Device<"atxmega64a1u", FamilyXMEGAU, ELFArchXMEGA5>; +def : Device<"atxmega128a3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega128a3u", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega128b1", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega128b3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega128c3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega128d3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega128d4", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega192a3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega192a3u", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega192c3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega192d3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega256a3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega256a3u", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega256a3b", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega256a3bu", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega256c3", FamilyXMEGAU, ELFArchXMEGA6>; +def : 
Device<"atxmega256d3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega384c3", FamilyXMEGAU, ELFArchXMEGA6>; +def : Device<"atxmega384d3", FamilyXMEGA, ELFArchXMEGA6>; +def : Device<"atxmega128a1", FamilyXMEGA, ELFArchXMEGA7>; +def : Device<"atxmega128a1u", FamilyXMEGAU, ELFArchXMEGA7>; +def : Device<"atxmega128a4u", FamilyXMEGAU, ELFArchXMEGA7>; +def : Device<"attiny4", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny5", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny9", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny10", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny20", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny40", FamilyAVRTiny, ELFArchAVRTiny>; + +//===---------------------------------------------------------------------===// +// Register File Description +//===---------------------------------------------------------------------===// + +include "AVRRegisterInfo.td" + +//===---------------------------------------------------------------------===// +// Instruction Descriptions +//===---------------------------------------------------------------------===// + +//include "AVRInstrInfo.td" + +//def AVRInstrInfo : InstrInfo; + +//===---------------------------------------------------------------------===// +// Calling Conventions +//===---------------------------------------------------------------------===// + +include "AVRCallingConv.td" + +//===---------------------------------------------------------------------===// +// Assembly Printers +//===---------------------------------------------------------------------===// + +// def AVRAsmWriter : AsmWriter { +// string AsmWriterClassName = "InstPrinter"; +// bit isMCAsmWriter = 1; +// } + +//===---------------------------------------------------------------------===// +// Assembly Parsers +//===---------------------------------------------------------------------===// + +// def AVRAsmParser : AsmParser { +// let ShouldEmitMatchRegisterName = 1; +// let ShouldEmitMatchRegisterAltName = 1; +// } + +// def AVRAsmParserVariant : AsmParserVariant { +// int Variant = 0; +// +// // Recognize hard coded registers. +// string RegisterPrefix = "$"; +// } + +//===---------------------------------------------------------------------===// +// Target Declaration +//===---------------------------------------------------------------------===// + +def AVR : Target { +// let InstructionSet = AVRInstrInfo; +// let AssemblyWriters = [AVRAsmWriter]; +// +// let AssemblyParsers = [AVRAsmParser]; +// let AssemblyParserVariants = [AVRAsmParserVariant]; +} + diff --git a/contrib/llvm/lib/Target/AVR/AVRCallingConv.td b/contrib/llvm/lib/Target/AVR/AVRCallingConv.td new file mode 100644 index 0000000..d8cb3fe --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRCallingConv.td @@ -0,0 +1,65 @@ +//===-- AVRCallingConv.td - Calling Conventions for AVR ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This describes the calling conventions for AVR architecture. +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AVR Return Value Calling Convention +//===----------------------------------------------------------------------===// + +def RetCC_AVR : CallingConv +<[ + // i8 is returned in R24. 
+ CCIfType<[i8], CCAssignToReg<[R24]>>, + + // i16 are returned in R25:R24, R23:R22, R21:R20 and R19:R18. + CCIfType<[i16], CCAssignToReg<[R25R24, R23R22, R21R20, R19R18]>> +]>; + +// Special return value calling convention for runtime functions. +def RetCC_AVR_RT : CallingConv +<[ + CCIfType<[i8], CCAssignToReg<[R24,R25]>>, + CCIfType<[i16], CCAssignToReg<[R23R22, R25R24]>> +]>; + +//===----------------------------------------------------------------------===// +// AVR Argument Calling Conventions +//===----------------------------------------------------------------------===// + +// The calling conventions are implemented in custom C++ code + +// Calling convention for variadic functions. +def ArgCC_AVR_Vararg : CallingConv +<[ + // i16 are always passed through the stack with an alignment of 1. + CCAssignToStack<2, 1> +]>; + +// Special argument calling convention for +// multiplication runtime functions. +def ArgCC_AVR_RT_MUL : CallingConv +<[ + CCIfType<[i16], CCAssignToReg<[R27R26,R19R18]>> +]>; + +// Special argument calling convention for +// division runtime functions. +def ArgCC_AVR_RT_DIV : CallingConv +<[ + CCIfType<[i8], CCAssignToReg<[R24,R22]>>, + CCIfType<[i16], CCAssignToReg<[R25R24, R23R22]>> +]>; + +//===----------------------------------------------------------------------===// +// Callee-saved register lists. +//===----------------------------------------------------------------------===// + +def CSR_Normal : CalleeSavedRegs<(add R29, R28, (sequence "R%u", 17, 2))>; +def CSR_Interrupts : CalleeSavedRegs<(add (sequence "R%u", 31, 0))>; diff --git a/contrib/llvm/lib/Target/AVR/AVRConfig.h b/contrib/llvm/lib/Target/AVR/AVRConfig.h new file mode 100644 index 0000000..65588bc --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRConfig.h @@ -0,0 +1,15 @@ +//===-- AVRConfig.h - AVR Backend Configuration Header ----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_CONFIG_H +#define LLVM_AVR_CONFIG_H + +#define LLVM_AVR_GCC_COMPAT + +#endif // LLVM_AVR_CONFIG_H diff --git a/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h b/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h new file mode 100644 index 0000000..6571d5d --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h @@ -0,0 +1,73 @@ +//===-- AVRMachineFuctionInfo.h - AVR machine function info -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares AVR-specific per-machine-function information. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_AVR_MACHINE_FUNCTION_INFO_H +#define LLVM_AVR_MACHINE_FUNCTION_INFO_H + +#include "AVRConfig.h" + +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + +/** + * Contains AVR-specific information for each MachineFunction. + */ +class AVRMachineFunctionInfo : public MachineFunctionInfo { + /// Indicates if a register has been spilled by the register + /// allocator. + bool HasSpills; + + /// Indicates if there are any fixed size allocas present. + /// Note that if there are only variable sized allocas this is set to false. 
+ bool HasAllocas; + + /// Indicates if arguments passed using the stack are being + /// used inside the function. + bool HasStackArgs; + + /// Size of the callee-saved register portion of the + /// stack frame in bytes. + unsigned CalleeSavedFrameSize; + + /// FrameIndex for start of varargs area. + int VarArgsFrameIndex; + +public: + AVRMachineFunctionInfo() + : HasSpills(false), HasAllocas(false), HasStackArgs(false), + CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {} + + explicit AVRMachineFunctionInfo(MachineFunction &MF) + : HasSpills(false), HasAllocas(false), HasStackArgs(false), + CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {} + + bool getHasSpills() const { return HasSpills; } + void setHasSpills(bool B) { HasSpills = B; } + + bool getHasAllocas() const { return HasAllocas; } + void setHasAllocas(bool B) { HasAllocas = B; } + + bool getHasStackArgs() const { return HasStackArgs; } + void setHasStackArgs(bool B) { HasStackArgs = B; } + + unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } + void setCalleeSavedFrameSize(unsigned Bytes) { CalleeSavedFrameSize = Bytes; } + + int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } + void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; } +}; + +} // end llvm namespace + +#endif // LLVM_AVR_MACHINE_FUNCTION_INFO_H diff --git a/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td new file mode 100644 index 0000000..32650fc --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRRegisterInfo.td @@ -0,0 +1,216 @@ +//===-- AVRRegisterInfo.td - AVR Register defs -------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Declarations that describe the AVR register file +//===----------------------------------------------------------------------===// + +// 8-bit General purpose register definition. +class AVRReg<bits<16> num, + string name, + list<Register> subregs = [], + list<string> altNames = []> + : RegisterWithSubRegs<name, subregs> +{ + field bits<16> Num = num; + + let HWEncoding = num; + let Namespace = "AVR"; + let SubRegs = subregs; + let AltNames = altNames; +} + +// Subregister indices. 
+let Namespace = "AVR" in +{ + def sub_lo : SubRegIndex<8>; + def sub_hi : SubRegIndex<8, 8>; +} + +let Namespace = "AVR" in { + def ptr : RegAltNameIndex; +} + + +//===----------------------------------------------------------------------===// +// 8-bit general purpose registers +//===----------------------------------------------------------------------===// + +def R0 : AVRReg<0, "r0">, DwarfRegNum<[0]>; +def R1 : AVRReg<1, "r1">, DwarfRegNum<[1]>; +def R2 : AVRReg<2, "r2">, DwarfRegNum<[2]>; +def R3 : AVRReg<3, "r3">, DwarfRegNum<[3]>; +def R4 : AVRReg<4, "r4">, DwarfRegNum<[4]>; +def R5 : AVRReg<5, "r5">, DwarfRegNum<[5]>; +def R6 : AVRReg<6, "r6">, DwarfRegNum<[6]>; +def R7 : AVRReg<7, "r7">, DwarfRegNum<[7]>; +def R8 : AVRReg<8, "r8">, DwarfRegNum<[8]>; +def R9 : AVRReg<9, "r9">, DwarfRegNum<[9]>; +def R10 : AVRReg<10, "r10">, DwarfRegNum<[10]>; +def R11 : AVRReg<11, "r11">, DwarfRegNum<[11]>; +def R12 : AVRReg<12, "r12">, DwarfRegNum<[12]>; +def R13 : AVRReg<13, "r13">, DwarfRegNum<[13]>; +def R14 : AVRReg<14, "r14">, DwarfRegNum<[14]>; +def R15 : AVRReg<15, "r15">, DwarfRegNum<[15]>; +def R16 : AVRReg<16, "r16">, DwarfRegNum<[16]>; +def R17 : AVRReg<17, "r17">, DwarfRegNum<[17]>; +def R18 : AVRReg<18, "r18">, DwarfRegNum<[18]>; +def R19 : AVRReg<19, "r19">, DwarfRegNum<[19]>; +def R20 : AVRReg<20, "r20">, DwarfRegNum<[20]>; +def R21 : AVRReg<21, "r21">, DwarfRegNum<[21]>; +def R22 : AVRReg<22, "r22">, DwarfRegNum<[22]>; +def R23 : AVRReg<23, "r23">, DwarfRegNum<[23]>; +def R24 : AVRReg<24, "r24">, DwarfRegNum<[24]>; +def R25 : AVRReg<25, "r25">, DwarfRegNum<[25]>; +def R26 : AVRReg<26, "r26">, DwarfRegNum<[26]>; +def R27 : AVRReg<27, "r27">, DwarfRegNum<[27]>; +def R28 : AVRReg<28, "r28">, DwarfRegNum<[28]>; +def R29 : AVRReg<29, "r29">, DwarfRegNum<[29]>; +def R30 : AVRReg<30, "r30">, DwarfRegNum<[30]>; +def R31 : AVRReg<31, "r31">, DwarfRegNum<[31]>; +def SPL : AVRReg<32, "SPL">, DwarfRegNum<[32]>; +def SPH : AVRReg<33, "SPH">, DwarfRegNum<[33]>; + +let SubRegIndices = [sub_lo, sub_hi], +CoveredBySubRegs = 1 in +{ + // 16 bit GPR pairs. + def SP : AVRReg<32, "SP", [SPL, SPH]>, DwarfRegNum<[32]>; + + // The pointer registers (X,Y,Z) are a special case because they + // are printed as a `high:low` pair when a DREG is expected, + // but printed using `X`, `Y`, `Z` when a pointer register is expected. 
+ let RegAltNameIndices = [ptr] in { + def R31R30 : AVRReg<30, "r31:r30", [R30, R31], ["Z"]>, DwarfRegNum<[30]>; + def R29R28 : AVRReg<28, "r29:r28", [R28, R29], ["Y"]>, DwarfRegNum<[28]>; + def R27R26 : AVRReg<26, "r27:r26", [R26, R27], ["X"]>, DwarfRegNum<[26]>; + } + def R25R24 : AVRReg<24, "r25:r24", [R24, R25]>, DwarfRegNum<[24]>; + def R23R22 : AVRReg<22, "r23:r22", [R22, R23]>, DwarfRegNum<[22]>; + def R21R20 : AVRReg<20, "r21:r20", [R20, R21]>, DwarfRegNum<[20]>; + def R19R18 : AVRReg<18, "r19:r18", [R18, R19]>, DwarfRegNum<[18]>; + def R17R16 : AVRReg<16, "r17:r16", [R16, R17]>, DwarfRegNum<[16]>; + def R15R14 : AVRReg<14, "r15:r14", [R14, R15]>, DwarfRegNum<[14]>; + def R13R12 : AVRReg<12, "r13:r12", [R12, R13]>, DwarfRegNum<[12]>; + def R11R10 : AVRReg<10, "r11:r10", [R10, R11]>, DwarfRegNum<[10]>; + def R9R8 : AVRReg<8, "r9:r8", [R8, R9]>, DwarfRegNum<[8]>; + def R7R6 : AVRReg<6, "r7:r6", [R6, R7]>, DwarfRegNum<[6]>; + def R5R4 : AVRReg<4, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>; + def R3R2 : AVRReg<2, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>; + def R1R0 : AVRReg<0, "r1:r0", [R0, R1]>, DwarfRegNum<[0]>; +} + +//===----------------------------------------------------------------------===// +// Register Classes +//===----------------------------------------------------------------------===// + +//:TODO: use proper set instructions instead of using always "add" + +// Main 8-bit register class. +def GPR8 : RegisterClass<"AVR", [i8], 8, + ( + // Return value and argument registers. + add R24, R25, R18, R19, R20, R21, R22, R23, + // Scratch registers. + R30, R31, R26, R27, + // Callee saved registers. + R28, R29, R17, R16, R15, R14, R13, R12, R11, R10, + R9, R8, R7, R6, R5, R4, R3, R2, R0, R1 + )>; + +// Simple lower registers r0..r15 +def GPR8lo : RegisterClass<"AVR", [i8], 8, + ( + add R15, R14, R13, R12, R11, R10, R9, R8, R7, R6, R5, R4, R3, R2, R0, R1 + )>; + +// 8-bit register class for instructions which take immediates. +def LD8 : RegisterClass<"AVR", [i8], 8, + ( + // Return value and arguments. + add R24, R25, R18, R19, R20, R21, R22, R23, + // Scratch registers. + R30, R31, R26, R27, + // Callee saved registers. + R28, R29, R17, R16 + )>; + +// Simple lower registers r16..r23 +def LD8lo : RegisterClass<"AVR", [i8], 8, + ( + add R23, R22, R21, R20, R19, R18, R17, R16 + )>; + +// Main 16-bit pair register class. +def DREGS : RegisterClass<"AVR", [i16], 8, + ( + // Return value and arguments. + add R25R24, R19R18, R21R20, R23R22, + // Scratch registers. + R31R30, R27R26, + // Callee saved registers. + R29R28, R17R16, R15R14, R13R12, R11R10, + R9R8, R7R6, R5R4, R3R2, R1R0 + )>; + +// 16-bit register class for immediate instructions. +def DLDREGS : RegisterClass<"AVR", [i16], 8, + ( + // Return value and arguments. + add R25R24, R19R18, R21R20, R23R22, + // Scratch registers. + R31R30, R27R26, + // Callee saved registers. + R29R28, R17R16 + )>; + +// 16-bit register class for the adiw/sbiw instructions. +def IWREGS : RegisterClass<"AVR", [i16], 8, + ( + // Return value and arguments. + add R25R24, + // Scratch registers. + R31R30, R27R26, + // Callee saved registers. + R29R28 + )>; + +// 16-bit register class for the ld and st instructions. +// AKA X,Y, and Z +def PTRREGS : RegisterClass<"AVR", [i16], 8, + ( + add R27R26, // X + R29R28, // Y + R31R30 // Z + ), ptr>; + +// 16-bit register class for the ldd and std instructions. +// AKA Y and Z. 
+def PTRDISPREGS : RegisterClass<"AVR", [i16], 8, + ( + add R31R30, R29R28 + ), ptr>; + +// We have a bunch of instructions with an explicit Z register argument. We +// model this using a register class containing only the Z register. +// :TODO: Rename to 'ZREG'. +def ZREGS : RegisterClass<"AVR", [i16], 8, (add R31R30)>; + +// Register class used for the stack read pseudo instruction. +def GPRSP: RegisterClass<"AVR", [i16], 8, (add SP)>; + +//:TODO: if we remove this we get an error in tablegen +//:TODO: this is just a hack, remove it once add16 works! +// Status register. +def SREG : AVRReg<14, "FLAGS">, DwarfRegNum<[88]>; +def CCR : RegisterClass<"AVR", [i8], 8, (add SREG)> +{ + let CopyCost = -1; // Don't allow copying of status registers +} + diff --git a/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp new file mode 100644 index 0000000..a91dce8 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/AVRTargetMachine.cpp @@ -0,0 +1,4 @@ + +extern "C" void LLVMInitializeAVRTarget() { + +} diff --git a/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp b/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp new file mode 100644 index 0000000..c0e0d20 --- /dev/null +++ b/contrib/llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp @@ -0,0 +1,25 @@ +//===-- AVRTargetInfo.cpp - AVR Target Implementation ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Module.h" +#include "llvm/Support/TargetRegistry.h" + +namespace llvm { +Target TheAVRTarget; +} + +extern "C" void LLVMInitializeAVRTargetInfo() { + llvm::RegisterTarget<llvm::Triple::avr> X( + llvm::TheAVRTarget, "avr", "Atmel AVR Microcontroller"); +} + +// FIXME: Temporary stub - this function must be defined for linking +// to succeed. Remove once this function is properly implemented. +extern "C" void LLVMInitializeAVRTargetMC() { +} diff --git a/contrib/llvm/lib/Target/BPF/BPF.td b/contrib/llvm/lib/Target/BPF/BPF.td index a4ce90a..8493b0f 100644 --- a/contrib/llvm/lib/Target/BPF/BPF.td +++ b/contrib/llvm/lib/Target/BPF/BPF.td @@ -25,7 +25,14 @@ def BPFInstPrinter : AsmWriter { bit isMCAsmWriter = 1; } +def BPFAsmParserVariant : AsmParserVariant { + int Variant = 0; + string Name = "BPF"; + string BreakCharacters = "."; +} + def BPF : Target { let InstructionSet = BPFInstrInfo; let AssemblyWriters = [BPFInstPrinter]; + let AssemblyParserVariants = [BPFAsmParserVariant]; } diff --git a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp index 7341828..6a5b37e 100644 --- a/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp +++ b/contrib/llvm/lib/Target/BPF/BPFISelLowering.cpp @@ -547,8 +547,7 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // to set, the condition code register to branch on, the true/false values to // select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator I = BB; - ++I; + MachineFunction::iterator I = ++BB->getIterator(); // ThisMBB: // ... 
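Several hunks in this import, including the Thumb2ITBlockPass and BPFISelLowering changes above, replace the old two-step "MachineFunction::iterator I = BB; ++I;" sequence with "++BB->getIterator()", since this LLVM release drops the implicit conversion from a block pointer to its list iterator and instead exposes the iterator through ilist_node::getIterator(). The sketch below illustrates the idiom with a self-contained list; the Block type, Blocks parameter, and getNextBlock helper are hypothetical stand-ins (not part of this diff), and it assumes LLVM 3.8-era ADT headers.

#include "llvm/ADT/ilist.h"
#include "llvm/ADT/ilist_node.h"

// Hypothetical node type standing in for MachineBasicBlock.
struct Block : llvm::ilist_node<Block> {
  int Id;
  explicit Block(int Id) : Id(Id) {}
};

// Returns the block that follows BB in its parent list, or nullptr at the end.
// Old style relied on the implicit pointer-to-iterator conversion:
//   llvm::ilist<Block>::iterator I = BB; ++I;
// New style asks the node for its own iterator and advances it in one step.
static Block *getNextBlock(llvm::ilist<Block> &Blocks, Block *BB) {
  llvm::ilist<Block>::iterator I = ++BB->getIterator();
  return I == Blocks.end() ? nullptr : &*I;
}

The same single-expression form is what the Thumb2ITBlockPass hunk uses in ++LastITMI->getIterator() when finalizing an IT bundle.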
diff --git a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h index adcaff6..4276d08 100644 --- a/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h +++ b/contrib/llvm/lib/Target/BPF/InstPrinter/BPFInstPrinter.h @@ -17,8 +17,6 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { -class MCOperand; - class BPFInstPrinter : public MCInstPrinter { public: BPFInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index 36f9926..8c358ca 100644 --- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -68,16 +68,23 @@ void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) { assert(Value == 0); - return; - } - assert(Fixup.getKind() == FK_PCRel_2); - Value = (uint16_t)((Value - 8) / 8); - if (IsLittleEndian) { - Data[Fixup.getOffset() + 2] = Value & 0xFF; - Data[Fixup.getOffset() + 3] = Value >> 8; + } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) { + unsigned Size = Fixup.getKind() == FK_Data_4 ? 4 : 8; + + for (unsigned i = 0; i != Size; ++i) { + unsigned Idx = IsLittleEndian ? i : Size - i; + Data[Fixup.getOffset() + Idx] = uint8_t(Value >> (i * 8)); + } } else { - Data[Fixup.getOffset() + 2] = Value >> 8; - Data[Fixup.getOffset() + 3] = Value & 0xFF; + assert(Fixup.getKind() == FK_PCRel_2); + Value = (uint16_t)((Value - 8) / 8); + if (IsLittleEndian) { + Data[Fixup.getOffset() + 2] = Value & 0xFF; + Data[Fixup.getOffset() + 3] = Value >> 8; + } else { + Data[Fixup.getOffset() + 2] = Value >> 8; + Data[Fixup.getOffset() + 3] = Value & 0xFF; + } } } diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp index 05ba618..87cdd5e 100644 --- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFELFObjectWriter.cpp @@ -44,6 +44,10 @@ unsigned BPFELFObjectWriter::GetRelocType(const MCValue &Target, return ELF::R_X86_64_64; case FK_SecRel_4: return ELF::R_X86_64_PC32; + case FK_Data_8: + return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64; + case FK_Data_4: + return IsPCRel ? 
ELF::R_X86_64_PC32 : ELF::R_X86_64_32; } } diff --git a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h index d63bbf4..1f440fe 100644 --- a/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h +++ b/contrib/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h @@ -34,6 +34,8 @@ public: UsesELFSectionDirectiveForBSS = true; HasSingleParameterDotFile = false; HasDotTypeDotSizeDirective = false; + + SupportsDebugInformation = true; } }; } diff --git a/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp b/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp index 272688e..5ea6551 100644 --- a/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp +++ b/contrib/llvm/lib/Target/CppBackend/CPPBackend.cpp @@ -551,7 +551,8 @@ void CppWriter::printAttributes(const AttributeSet &PAL, void CppWriter::printType(Type* Ty) { // We don't print definitions for primitive types if (Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() || - Ty->isLabelTy() || Ty->isMetadataTy() || Ty->isVoidTy()) + Ty->isLabelTy() || Ty->isMetadataTy() || Ty->isVoidTy() || + Ty->isTokenTy()) return; // If we already defined this type, we don't need to define it again. @@ -1355,23 +1356,18 @@ void CppWriter::printInstruction(const Instruction *I, } case Instruction::GetElementPtr: { const GetElementPtrInst* gep = cast<GetElementPtrInst>(I); - if (gep->getNumOperands() <= 2) { - Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create(" - << opNames[0]; - if (gep->getNumOperands() == 2) - Out << ", " << opNames[1]; - } else { - Out << "std::vector<Value*> " << iName << "_indices;"; - nl(Out); - for (unsigned i = 1; i < gep->getNumOperands(); ++i ) { - Out << iName << "_indices.push_back(" - << opNames[i] << ");"; - nl(Out); + Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create(" + << getCppName(gep->getSourceElementType()) << ", " << opNames[0] << ", {"; + in(); + for (unsigned i = 1; i < gep->getNumOperands(); ++i ) { + if (i != 1) { + Out << ", "; } - Out << "Instruction* " << iName << " = GetElementPtrInst::Create(" - << opNames[0] << ", " << iName << "_indices"; + nl(Out); + Out << opNames[i]; } - Out << ", \""; + out(); + nl(Out) << "}, \""; printEscapedString(gep->getName()); Out << "\", " << bbname << ");"; break; @@ -1803,13 +1799,12 @@ void CppWriter::printFunctionBody(const Function *F) { << "->arg_begin();"; nl(Out); } - for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - AI != AE; ++AI) { - Out << "Value* " << getCppName(AI) << " = args++;"; + for (const Argument &AI : F->args()) { + Out << "Value* " << getCppName(&AI) << " = args++;"; nl(Out); - if (AI->hasName()) { - Out << getCppName(AI) << "->setName(\""; - printEscapedString(AI->getName()); + if (AI.hasName()) { + Out << getCppName(&AI) << "->setName(\""; + printEscapedString(AI.getName()); Out << "\");"; nl(Out); } @@ -1818,29 +1813,25 @@ void CppWriter::printFunctionBody(const Function *F) { // Create all the basic blocks nl(Out); - for (Function::const_iterator BI = F->begin(), BE = F->end(); - BI != BE; ++BI) { - std::string bbname(getCppName(BI)); + for (const BasicBlock &BI : *F) { + std::string bbname(getCppName(&BI)); Out << "BasicBlock* " << bbname << " = BasicBlock::Create(mod->getContext(), \""; - if (BI->hasName()) - printEscapedString(BI->getName()); - Out << "\"," << getCppName(BI->getParent()) << ",0);"; + if (BI.hasName()) + printEscapedString(BI.getName()); + Out << "\"," << getCppName(BI.getParent()) << ",0);"; nl(Out); } 
// Output all of its basic blocks... for the function - for (Function::const_iterator BI = F->begin(), BE = F->end(); - BI != BE; ++BI) { - std::string bbname(getCppName(BI)); - nl(Out) << "// Block " << BI->getName() << " (" << bbname << ")"; + for (const BasicBlock &BI : *F) { + std::string bbname(getCppName(&BI)); + nl(Out) << "// Block " << BI.getName() << " (" << bbname << ")"; nl(Out); // Output all of the instructions in the basic block... - for (BasicBlock::const_iterator I = BI->begin(), E = BI->end(); - I != E; ++I) { - printInstruction(I,bbname); - } + for (const Instruction &I : BI) + printInstruction(&I, bbname); } // Loop over the ForwardRefs and resolve them now that all instructions @@ -1883,7 +1874,7 @@ void CppWriter::printInline(const std::string& fname, printFunctionUses(F); printFunctionBody(F); is_inline = false; - Out << "return " << getCppName(F->begin()) << ";"; + Out << "return " << getCppName(&F->front()) << ";"; nl(Out) << "}"; nl(Out); } @@ -1896,17 +1887,14 @@ void CppWriter::printModuleBody() { // Functions can call each other and global variables can reference them so // define all the functions first before emitting their function bodies. nl(Out) << "// Function Declarations"; nl(Out); - for (Module::const_iterator I = TheModule->begin(), E = TheModule->end(); - I != E; ++I) - printFunctionHead(I); + for (const Function &I : *TheModule) + printFunctionHead(&I); // Process the global variables declarations. We can't initialze them until // after the constants are printed so just print a header for each global nl(Out) << "// Global Variable Declarations\n"; nl(Out); - for (Module::const_global_iterator I = TheModule->global_begin(), - E = TheModule->global_end(); I != E; ++I) { - printVariableHead(I); - } + for (const GlobalVariable &I : TheModule->globals()) + printVariableHead(&I); // Print out all the constants definitions. Constants don't recurse except // through GlobalValues. All GlobalValues have been declared at this point @@ -1918,21 +1906,18 @@ void CppWriter::printModuleBody() { // been emitted. These definitions just couple the gvars with their constant // initializers. nl(Out) << "// Global Variable Definitions"; nl(Out); - for (Module::const_global_iterator I = TheModule->global_begin(), - E = TheModule->global_end(); I != E; ++I) { - printVariableBody(I); - } + for (const GlobalVariable &I : TheModule->globals()) + printVariableBody(&I); // Finally, we can safely put out all of the function bodies. nl(Out) << "// Function Definitions"; nl(Out); - for (Module::const_iterator I = TheModule->begin(), E = TheModule->end(); - I != E; ++I) { - if (!I->isDeclaration()) { - nl(Out) << "// Function: " << I->getName() << " (" << getCppName(I) + for (const Function &I : *TheModule) { + if (!I.isDeclaration()) { + nl(Out) << "// Function: " << I.getName() << " (" << getCppName(&I) << ")"; nl(Out) << "{"; nl(Out,1); - printFunctionBody(I); + printFunctionBody(&I); nl(Out,-1) << "}"; nl(Out); } diff --git a/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp new file mode 100644 index 0000000..a8622a9 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -0,0 +1,2152 @@ +//===-- HexagonAsmParser.cpp - Parse Hexagon asm to MCInst instructions----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mcasmparser" + +#include "Hexagon.h" +#include "HexagonRegisterInfo.h" +#include "HexagonTargetStreamer.h" +#include "MCTargetDesc/HexagonBaseInfo.h" +#include "MCTargetDesc/HexagonMCELFStreamer.h" +#include "MCTargetDesc/HexagonMCChecker.h" +#include "MCTargetDesc/HexagonMCExpr.h" +#include "MCTargetDesc/HexagonMCShuffler.h" +#include "MCTargetDesc/HexagonMCTargetDesc.h" +#include "MCTargetDesc/HexagonMCAsmInfo.h" +#include "MCTargetDesc/HexagonShuffler.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCParser/MCAsmLexer.h" +#include "llvm/MC/MCParser/MCAsmParser.h" +#include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSectionELF.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCTargetAsmParser.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +#include <sstream> + +using namespace llvm; + +static cl::opt<bool> EnableFutureRegs("mfuture-regs", + cl::desc("Enable future registers")); + +static cl::opt<bool> WarnMissingParenthesis("mwarn-missing-parenthesis", +cl::desc("Warn for missing parenthesis around predicate registers"), +cl::init(true)); +static cl::opt<bool> ErrorMissingParenthesis("merror-missing-parenthesis", +cl::desc("Error for missing parenthesis around predicate registers"), +cl::init(false)); +static cl::opt<bool> WarnSignedMismatch("mwarn-sign-mismatch", +cl::desc("Warn for mismatching a signed and unsigned value"), +cl::init(true)); +static cl::opt<bool> WarnNoncontigiousRegister("mwarn-noncontigious-register", +cl::desc("Warn for register names that arent contigious"), +cl::init(true)); +static cl::opt<bool> ErrorNoncontigiousRegister("merror-noncontigious-register", +cl::desc("Error for register names that aren't contigious"), +cl::init(false)); + + +namespace { +struct HexagonOperand; + +class HexagonAsmParser : public MCTargetAsmParser { + + HexagonTargetStreamer &getTargetStreamer() { + MCTargetStreamer &TS = *Parser.getStreamer().getTargetStreamer(); + return static_cast<HexagonTargetStreamer &>(TS); + } + + MCAsmParser &Parser; + MCAssembler *Assembler; + MCInstrInfo const &MCII; + MCInst MCB; + bool InBrackets; + + MCAsmParser &getParser() const { return Parser; } + MCAssembler *getAssembler() const { return Assembler; } + MCAsmLexer &getLexer() const { return Parser.getLexer(); } + + bool equalIsAsmAssignment() override { return false; } + bool isLabel(AsmToken &Token) override; + + void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); } + bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); } + bool ParseDirectiveFalign(unsigned Size, SMLoc L); + + virtual bool ParseRegister(unsigned &RegNo, + SMLoc &StartLoc, + SMLoc &EndLoc) override; + bool ParseDirectiveSubsection(SMLoc L); + bool ParseDirectiveValue(unsigned Size, SMLoc L); + bool ParseDirectiveComm(bool IsLocal, SMLoc L); + bool RegisterMatchesArch(unsigned MatchNum) const; + + bool matchBundleOptions(); + bool 
handleNoncontigiousRegister(bool Contigious, SMLoc &Loc); + bool finishBundle(SMLoc IDLoc, MCStreamer &Out); + void canonicalizeImmediates(MCInst &MCI); + bool matchOneInstruction(MCInst &MCB, SMLoc IDLoc, + OperandVector &InstOperands, uint64_t &ErrorInfo, + bool MatchingInlineAsm, bool &MustExtend); + + bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, MCStreamer &Out, + uint64_t &ErrorInfo, bool MatchingInlineAsm) override; + + unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override; + void OutOfRange(SMLoc IDLoc, long long Val, long long Max); + int processInstruction(MCInst &Inst, OperandVector const &Operands, + SMLoc IDLoc, bool &MustExtend); + + // Check if we have an assembler and, if so, set the ELF e_header flags. + void chksetELFHeaderEFlags(unsigned flags) { + if (getAssembler()) + getAssembler()->setELFHeaderEFlags(flags); + } + +/// @name Auto-generated Match Functions +/// { + +#define GET_ASSEMBLER_HEADER +#include "HexagonGenAsmMatcher.inc" + + /// } + +public: + HexagonAsmParser(const MCSubtargetInfo &_STI, MCAsmParser &_Parser, + const MCInstrInfo &MII, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, _STI), Parser(_Parser), + MCII (MII), MCB(HexagonMCInstrInfo::createBundle()), InBrackets(false) { + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + + MCAsmParserExtension::Initialize(_Parser); + + Assembler = nullptr; + // FIXME: need better way to detect AsmStreamer (upstream removed getKind()) + if (!Parser.getStreamer().hasRawTextSupport()) { + MCELFStreamer *MES = static_cast<MCELFStreamer *>(&Parser.getStreamer()); + Assembler = &MES->getAssembler(); + } + } + + bool mustExtend(OperandVector &Operands); + bool splitIdentifier(OperandVector &Operands); + bool parseOperand(OperandVector &Operands); + bool parseInstruction(OperandVector &Operands); + bool implicitExpressionLocation(OperandVector &Operands); + bool parseExpressionOrOperand(OperandVector &Operands); + bool parseExpression(MCExpr const *& Expr); + virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + SMLoc NameLoc, OperandVector &Operands) override + { + llvm_unreachable("Unimplemented"); + } + virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name, + AsmToken ID, OperandVector &Operands) override; + + virtual bool ParseDirective(AsmToken DirectiveID) override; +}; + +/// HexagonOperand - Instances of this class represent a parsed Hexagon machine +/// instruction. +struct HexagonOperand : public MCParsedAsmOperand { + enum KindTy { Token, Immediate, Register } Kind; + + SMLoc StartLoc, EndLoc; + + struct TokTy { + const char *Data; + unsigned Length; + }; + + struct RegTy { + unsigned RegNum; + }; + + struct ImmTy { + const MCExpr *Val; + bool MustExtend; + }; + + struct InstTy { + OperandVector *SubInsts; + }; + + union { + struct TokTy Tok; + struct RegTy Reg; + struct ImmTy Imm; + }; + + HexagonOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {} + +public: + HexagonOperand(const HexagonOperand &o) : MCParsedAsmOperand() { + Kind = o.Kind; + StartLoc = o.StartLoc; + EndLoc = o.EndLoc; + switch (Kind) { + case Register: + Reg = o.Reg; + break; + case Immediate: + Imm = o.Imm; + break; + case Token: + Tok = o.Tok; + break; + } + } + + /// getStartLoc - Get the location of the first token of this operand. + SMLoc getStartLoc() const { return StartLoc; } + + /// getEndLoc - Get the location of the last token of this operand. 
+ SMLoc getEndLoc() const { return EndLoc; } + + unsigned getReg() const { + assert(Kind == Register && "Invalid access!"); + return Reg.RegNum; + } + + const MCExpr *getImm() const { + assert(Kind == Immediate && "Invalid access!"); + return Imm.Val; + } + + bool isToken() const { return Kind == Token; } + bool isImm() const { return Kind == Immediate; } + bool isMem() const { llvm_unreachable("No isMem"); } + bool isReg() const { return Kind == Register; } + + bool CheckImmRange(int immBits, int zeroBits, bool isSigned, + bool isRelocatable, bool Extendable) const { + if (Kind == Immediate) { + const MCExpr *myMCExpr = getImm(); + if (Imm.MustExtend && !Extendable) + return false; + int64_t Res; + if (myMCExpr->evaluateAsAbsolute(Res)) { + int bits = immBits + zeroBits; + // Field bit range is zerobits + bits + // zeroBits must be 0 + if (Res & ((1 << zeroBits) - 1)) + return false; + if (isSigned) { + if (Res < (1LL << (bits - 1)) && Res >= -(1LL << (bits - 1))) + return true; + } else { + if (bits == 64) + return true; + if (Res >= 0) + return ((uint64_t)Res < (uint64_t)(1ULL << bits)) ? true : false; + else { + const int64_t high_bit_set = 1ULL << 63; + const uint64_t mask = (high_bit_set >> (63 - bits)); + return (((uint64_t)Res & mask) == mask) ? true : false; + } + } + } else if (myMCExpr->getKind() == MCExpr::SymbolRef && isRelocatable) + return true; + else if (myMCExpr->getKind() == MCExpr::Binary || + myMCExpr->getKind() == MCExpr::Unary) + return true; + } + return false; + } + + bool isf32Ext() const { return false; } + bool iss32Imm() const { return CheckImmRange(32, 0, true, true, false); } + bool iss8Imm() const { return CheckImmRange(8, 0, true, false, false); } + bool iss8Imm64() const { return CheckImmRange(8, 0, true, true, false); } + bool iss7Imm() const { return CheckImmRange(7, 0, true, false, false); } + bool iss6Imm() const { return CheckImmRange(6, 0, true, false, false); } + bool iss4Imm() const { return CheckImmRange(4, 0, true, false, false); } + bool iss4_0Imm() const { return CheckImmRange(4, 0, true, false, false); } + bool iss4_1Imm() const { return CheckImmRange(4, 1, true, false, false); } + bool iss4_2Imm() const { return CheckImmRange(4, 2, true, false, false); } + bool iss4_3Imm() const { return CheckImmRange(4, 3, true, false, false); } + bool iss4_6Imm() const { return CheckImmRange(4, 0, true, false, false); } + bool iss3_6Imm() const { return CheckImmRange(3, 0, true, false, false); } + bool iss3Imm() const { return CheckImmRange(3, 0, true, false, false); } + + bool isu64Imm() const { return CheckImmRange(64, 0, false, true, true); } + bool isu32Imm() const { return CheckImmRange(32, 0, false, true, false); } + bool isu26_6Imm() const { return CheckImmRange(26, 6, false, true, false); } + bool isu16Imm() const { return CheckImmRange(16, 0, false, true, false); } + bool isu16_0Imm() const { return CheckImmRange(16, 0, false, true, false); } + bool isu16_1Imm() const { return CheckImmRange(16, 1, false, true, false); } + bool isu16_2Imm() const { return CheckImmRange(16, 2, false, true, false); } + bool isu16_3Imm() const { return CheckImmRange(16, 3, false, true, false); } + bool isu11_3Imm() const { return CheckImmRange(11, 3, false, false, false); } + bool isu6_0Imm() const { return CheckImmRange(6, 0, false, false, false); } + bool isu6_1Imm() const { return CheckImmRange(6, 1, false, false, false); } + bool isu6_2Imm() const { return CheckImmRange(6, 2, false, false, false); } + bool isu6_3Imm() const { return CheckImmRange(6, 3, false, 
false, false); } + bool isu10Imm() const { return CheckImmRange(10, 0, false, false, false); } + bool isu9Imm() const { return CheckImmRange(9, 0, false, false, false); } + bool isu8Imm() const { return CheckImmRange(8, 0, false, false, false); } + bool isu7Imm() const { return CheckImmRange(7, 0, false, false, false); } + bool isu6Imm() const { return CheckImmRange(6, 0, false, false, false); } + bool isu5Imm() const { return CheckImmRange(5, 0, false, false, false); } + bool isu4Imm() const { return CheckImmRange(4, 0, false, false, false); } + bool isu3Imm() const { return CheckImmRange(3, 0, false, false, false); } + bool isu2Imm() const { return CheckImmRange(2, 0, false, false, false); } + bool isu1Imm() const { return CheckImmRange(1, 0, false, false, false); } + + bool ism6Imm() const { return CheckImmRange(6, 0, false, false, false); } + bool isn8Imm() const { return CheckImmRange(8, 0, false, false, false); } + + bool iss16Ext() const { return CheckImmRange(16 + 26, 0, true, true, true); } + bool iss12Ext() const { return CheckImmRange(12 + 26, 0, true, true, true); } + bool iss10Ext() const { return CheckImmRange(10 + 26, 0, true, true, true); } + bool iss9Ext() const { return CheckImmRange(9 + 26, 0, true, true, true); } + bool iss8Ext() const { return CheckImmRange(8 + 26, 0, true, true, true); } + bool iss7Ext() const { return CheckImmRange(7 + 26, 0, true, true, true); } + bool iss6Ext() const { return CheckImmRange(6 + 26, 0, true, true, true); } + bool iss11_0Ext() const { + return CheckImmRange(11 + 26, 0, true, true, true); + } + bool iss11_1Ext() const { + return CheckImmRange(11 + 26, 1, true, true, true); + } + bool iss11_2Ext() const { + return CheckImmRange(11 + 26, 2, true, true, true); + } + bool iss11_3Ext() const { + return CheckImmRange(11 + 26, 3, true, true, true); + } + + bool isu6Ext() const { return CheckImmRange(6 + 26, 0, false, true, true); } + bool isu7Ext() const { return CheckImmRange(7 + 26, 0, false, true, true); } + bool isu8Ext() const { return CheckImmRange(8 + 26, 0, false, true, true); } + bool isu9Ext() const { return CheckImmRange(9 + 26, 0, false, true, true); } + bool isu10Ext() const { return CheckImmRange(10 + 26, 0, false, true, true); } + bool isu6_0Ext() const { return CheckImmRange(6 + 26, 0, false, true, true); } + bool isu6_1Ext() const { return CheckImmRange(6 + 26, 1, false, true, true); } + bool isu6_2Ext() const { return CheckImmRange(6 + 26, 2, false, true, true); } + bool isu6_3Ext() const { return CheckImmRange(6 + 26, 3, false, true, true); } + bool isu32MustExt() const { return isImm() && Imm.MustExtend; } + + void addRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createReg(getReg())); + } + + void addImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + Inst.addOperand(MCOperand::createExpr(getImm())); + } + + void addSignedImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + MCExpr const *Expr = getImm(); + int64_t Value; + if (!Expr->evaluateAsAbsolute(Value)) { + Inst.addOperand(MCOperand::createExpr(Expr)); + return; + } + int64_t Extended = SignExtend64 (Value, 32); + if ((Extended < 0) == (Value < 0)) { + Inst.addOperand(MCOperand::createExpr(Expr)); + return; + } + // Flip bit 33 to signal signed unsigned mismatch + Extended ^= 0x100000000; + Inst.addOperand(MCOperand::createImm(Extended)); + } + + void addf32ExtOperands(MCInst &Inst, unsigned N) 
const { + addImmOperands(Inst, N); + } + + void adds32ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds8ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds8Imm64Operands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds6ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4_0ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4_1ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4_2ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds4_3ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds3ImmOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + + void addu64ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu32ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu26_6ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16_0ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16_1ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16_2ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu16_3ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu11_3ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu10ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu9ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu8ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu7ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_0ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_1ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_2ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_3ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu5ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu4ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu3ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu2ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu1ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + + void addm6ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addn8ImmOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + + void adds16ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds12ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds10ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + 
void adds9ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds8ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds6ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds11_0ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds11_1ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds11_2ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + void adds11_3ExtOperands(MCInst &Inst, unsigned N) const { + addSignedImmOperands(Inst, N); + } + + void addu6ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu7ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu8ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu9ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu10ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_0ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_1ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_2ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu6_3ExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + void addu32MustExtOperands(MCInst &Inst, unsigned N) const { + addImmOperands(Inst, N); + } + + void adds4_6ImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue() * 64)); + } + + void adds3_6ImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm()); + Inst.addOperand(MCOperand::createImm(CE->getValue() * 64)); + } + + StringRef getToken() const { + assert(Kind == Token && "Invalid access!"); + return StringRef(Tok.Data, Tok.Length); + } + + virtual void print(raw_ostream &OS) const; + + static std::unique_ptr<HexagonOperand> CreateToken(StringRef Str, SMLoc S) { + HexagonOperand *Op = new HexagonOperand(Token); + Op->Tok.Data = Str.data(); + Op->Tok.Length = Str.size(); + Op->StartLoc = S; + Op->EndLoc = S; + return std::unique_ptr<HexagonOperand>(Op); + } + + static std::unique_ptr<HexagonOperand> CreateReg(unsigned RegNum, SMLoc S, + SMLoc E) { + HexagonOperand *Op = new HexagonOperand(Register); + Op->Reg.RegNum = RegNum; + Op->StartLoc = S; + Op->EndLoc = E; + return std::unique_ptr<HexagonOperand>(Op); + } + + static std::unique_ptr<HexagonOperand> CreateImm(const MCExpr *Val, SMLoc S, + SMLoc E) { + HexagonOperand *Op = new HexagonOperand(Immediate); + Op->Imm.Val = Val; + Op->Imm.MustExtend = false; + Op->StartLoc = S; + Op->EndLoc = E; + return std::unique_ptr<HexagonOperand>(Op); + } +}; + +} // end anonymous namespace. 
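The s*/u* operand predicates above all funnel into CheckImmRange, which models each immediate operand class as immBits significant bits on top of zeroBits low bits that must be zero (scaled offsets such as u6_2 or s4_3). The following is a minimal standalone sketch of that field test only; fitsImmField and main are hypothetical names, and the relocatable/symbolic-expression and must-extend cases handled by the real CheckImmRange are omitted.

    #include <cassert>
    #include <cstdint>

    // Sketch only: does Val fit an operand field of immBits significant bits
    // whose low zeroBits bits are required to be zero?
    static bool fitsImmField(std::int64_t Val, int immBits, int zeroBits,
                             bool isSigned) {
      if (Val & ((1LL << zeroBits) - 1))   // scaled operand: low bits must be zero
        return false;
      int bits = immBits + zeroBits;       // total encodable width
      if (isSigned)
        return Val >= -(1LL << (bits - 1)) && Val < (1LL << (bits - 1));
      return bits >= 64 || (Val >= 0 && (std::uint64_t)Val < (1ULL << bits));
    }

    int main() {
      assert(fitsImmField(28, 4, 2, false));   // u4_2-style: 28 == 7 << 2
      assert(!fitsImmField(30, 4, 2, false));  // low two bits are not zero
      assert(fitsImmField(-8, 4, 0, true));    // s4-style range is [-8, 7]
      assert(!fitsImmField(8, 4, 0, true));    // out of signed range
      return 0;
    }

The zeroBits parameter is what lets a small encoding describe word- or doubleword-scaled offsets, and the extender predicates (iss12Ext and friends) simply widen immBits by 26 to account for the extra bits supplied by a constant-extender word in the same packet.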
+ +void HexagonOperand::print(raw_ostream &OS) const { + switch (Kind) { + case Immediate: + getImm()->print(OS, nullptr); + break; + case Register: + OS << "<register R"; + OS << getReg() << ">"; + break; + case Token: + OS << "'" << getToken() << "'"; + break; + } +} + +/// @name Auto-generated Match Functions +static unsigned MatchRegisterName(StringRef Name); + +bool HexagonAsmParser::finishBundle(SMLoc IDLoc, MCStreamer &Out) { + DEBUG(dbgs() << "Bundle:"); + DEBUG(MCB.dump_pretty(dbgs())); + DEBUG(dbgs() << "--\n"); + + // Check the bundle for errors. + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + HexagonMCChecker Check(MCII, getSTI(), MCB, MCB, *RI); + + bool CheckOk = HexagonMCInstrInfo::canonicalizePacket(MCII, getSTI(), + getContext(), MCB, + &Check); + + while (Check.getNextErrInfo() == true) { + unsigned Reg = Check.getErrRegister(); + Twine R(RI->getName(Reg)); + + uint64_t Err = Check.getError(); + if (Err != HexagonMCErrInfo::CHECK_SUCCESS) { + if (HexagonMCErrInfo::CHECK_ERROR_BRANCHES & Err) + Error(IDLoc, + "unconditional branch cannot precede another branch in packet"); + + if (HexagonMCErrInfo::CHECK_ERROR_NEWP & Err || + HexagonMCErrInfo::CHECK_ERROR_NEWV & Err) + Error(IDLoc, "register `" + R + + "' used with `.new' " + "but not validly modified in the same packet"); + + if (HexagonMCErrInfo::CHECK_ERROR_REGISTERS & Err) + Error(IDLoc, "register `" + R + "' modified more than once"); + + if (HexagonMCErrInfo::CHECK_ERROR_READONLY & Err) + Error(IDLoc, "cannot write to read-only register `" + R + "'"); + + if (HexagonMCErrInfo::CHECK_ERROR_LOOP & Err) + Error(IDLoc, "loop-setup and some branch instructions " + "cannot be in the same packet"); + + if (HexagonMCErrInfo::CHECK_ERROR_ENDLOOP & Err) { + Twine N(HexagonMCInstrInfo::isInnerLoop(MCB) ? 
'0' : '1'); + Error(IDLoc, "packet marked with `:endloop" + N + "' " + + "cannot contain instructions that modify register " + + "`" + R + "'"); + } + + if (HexagonMCErrInfo::CHECK_ERROR_SOLO & Err) + Error(IDLoc, + "instruction cannot appear in packet with other instructions"); + + if (HexagonMCErrInfo::CHECK_ERROR_NOSLOTS & Err) + Error(IDLoc, "too many slots used in packet"); + + if (Err & HexagonMCErrInfo::CHECK_ERROR_SHUFFLE) { + uint64_t Erm = Check.getShuffleError(); + + if (HexagonShuffler::SHUFFLE_ERROR_INVALID == Erm) + Error(IDLoc, "invalid instruction packet"); + else if (HexagonShuffler::SHUFFLE_ERROR_STORES == Erm) + Error(IDLoc, "invalid instruction packet: too many stores"); + else if (HexagonShuffler::SHUFFLE_ERROR_LOADS == Erm) + Error(IDLoc, "invalid instruction packet: too many loads"); + else if (HexagonShuffler::SHUFFLE_ERROR_BRANCHES == Erm) + Error(IDLoc, "too many branches in packet"); + else if (HexagonShuffler::SHUFFLE_ERROR_NOSLOTS == Erm) + Error(IDLoc, "invalid instruction packet: out of slots"); + else if (HexagonShuffler::SHUFFLE_ERROR_SLOTS == Erm) + Error(IDLoc, "invalid instruction packet: slot error"); + else if (HexagonShuffler::SHUFFLE_ERROR_ERRATA2 == Erm) + Error(IDLoc, "v60 packet violation"); + else if (HexagonShuffler::SHUFFLE_ERROR_STORE_LOAD_CONFLICT == Erm) + Error(IDLoc, "slot 0 instruction does not allow slot 1 store"); + else + Error(IDLoc, "unknown error in instruction packet"); + } + } + + unsigned Warn = Check.getWarning(); + if (Warn != HexagonMCErrInfo::CHECK_SUCCESS) { + if (HexagonMCErrInfo::CHECK_WARN_CURRENT & Warn) + Warning(IDLoc, "register `" + R + "' used with `.cur' " + "but not used in the same packet"); + else if (HexagonMCErrInfo::CHECK_WARN_TEMPORARY & Warn) + Warning(IDLoc, "register `" + R + "' used with `.tmp' " + "but not used in the same packet"); + } + } + + if (CheckOk) { + MCB.setLoc(IDLoc); + if (HexagonMCInstrInfo::bundleSize(MCB) == 0) { + assert(!HexagonMCInstrInfo::isInnerLoop(MCB)); + assert(!HexagonMCInstrInfo::isOuterLoop(MCB)); + // Empty packets are valid yet aren't emitted + return false; + } + Out.EmitInstruction(MCB, getSTI()); + } else { + // If compounding and duplexing didn't reduce the size below + // 4 or less we have a packet that is too big. + if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE) { + Error(IDLoc, "invalid instruction packet: out of slots"); + return true; // Error + } + } + + return false; // No error +} + +bool HexagonAsmParser::matchBundleOptions() { + MCAsmParser &Parser = getParser(); + MCAsmLexer &Lexer = getLexer(); + while (true) { + if (!Parser.getTok().is(AsmToken::Colon)) + return false; + Lexer.Lex(); + StringRef Option = Parser.getTok().getString(); + if (Option.compare_lower("endloop0") == 0) + HexagonMCInstrInfo::setInnerLoop(MCB); + else if (Option.compare_lower("endloop1") == 0) + HexagonMCInstrInfo::setOuterLoop(MCB); + else if (Option.compare_lower("mem_noshuf") == 0) + HexagonMCInstrInfo::setMemReorderDisabled(MCB); + else if (Option.compare_lower("mem_shuf") == 0) + HexagonMCInstrInfo::setMemStoreReorderEnabled(MCB); + else + return true; + Lexer.Lex(); + } +} + +// For instruction aliases, immediates are generated rather than +// MCConstantExpr. Convert them for uniform MCExpr. 
+// Also check for signed/unsigned mismatches and warn +void HexagonAsmParser::canonicalizeImmediates(MCInst &MCI) { + MCInst NewInst; + NewInst.setOpcode(MCI.getOpcode()); + for (MCOperand &I : MCI) + if (I.isImm()) { + int64_t Value (I.getImm()); + if ((Value & 0x100000000) != (Value & 0x80000000)) { + // Detect flipped bit 33 wrt bit 32 and signal warning + Value ^= 0x100000000; + if (WarnSignedMismatch) + Warning (MCI.getLoc(), "Signed/Unsigned mismatch"); + } + NewInst.addOperand(MCOperand::createExpr( + MCConstantExpr::create(Value, getContext()))); + } + else + NewInst.addOperand(I); + MCI = NewInst; +} + +bool HexagonAsmParser::matchOneInstruction(MCInst &MCI, SMLoc IDLoc, + OperandVector &InstOperands, + uint64_t &ErrorInfo, + bool MatchingInlineAsm, + bool &MustExtend) { + // Perform matching with tablegen asmmatcher generated function + int result = + MatchInstructionImpl(InstOperands, MCI, ErrorInfo, MatchingInlineAsm); + if (result == Match_Success) { + MCI.setLoc(IDLoc); + MustExtend = mustExtend(InstOperands); + canonicalizeImmediates(MCI); + result = processInstruction(MCI, InstOperands, IDLoc, MustExtend); + + DEBUG(dbgs() << "Insn:"); + DEBUG(MCI.dump_pretty(dbgs())); + DEBUG(dbgs() << "\n\n"); + + MCI.setLoc(IDLoc); + } + + // Create instruction operand for bundle instruction + // Break this into a separate function Code here is less readable + // Think about how to get an instruction error to report correctly. + // SMLoc will return the "{" + switch (result) { + default: + break; + case Match_Success: + return false; + case Match_MissingFeature: + return Error(IDLoc, "invalid instruction"); + case Match_MnemonicFail: + return Error(IDLoc, "unrecognized instruction"); + case Match_InvalidOperand: + SMLoc ErrorLoc = IDLoc; + if (ErrorInfo != ~0U) { + if (ErrorInfo >= InstOperands.size()) + return Error(IDLoc, "too few operands for instruction"); + + ErrorLoc = (static_cast<HexagonOperand *>(InstOperands[ErrorInfo].get())) + ->getStartLoc(); + if (ErrorLoc == SMLoc()) + ErrorLoc = IDLoc; + } + return Error(ErrorLoc, "invalid operand for instruction"); + } + llvm_unreachable("Implement any new match types added!"); +} + +bool HexagonAsmParser::mustExtend(OperandVector &Operands) { + unsigned Count = 0; + for (std::unique_ptr<MCParsedAsmOperand> &i : Operands) + if (i->isImm()) + if (static_cast<HexagonOperand *>(i.get())->Imm.MustExtend) + ++Count; + // Multiple extenders should have been filtered by iss9Ext et. al. 
+ assert(Count < 2 && "Multiple extenders"); + return Count == 1; +} + +bool HexagonAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, + OperandVector &Operands, + MCStreamer &Out, + uint64_t &ErrorInfo, + bool MatchingInlineAsm) { + if (!InBrackets) { + MCB.clear(); + MCB.addOperand(MCOperand::createImm(0)); + } + HexagonOperand &FirstOperand = static_cast<HexagonOperand &>(*Operands[0]); + if (FirstOperand.isToken() && FirstOperand.getToken() == "{") { + assert(Operands.size() == 1 && "Brackets should be by themselves"); + if (InBrackets) { + getParser().Error(IDLoc, "Already in a packet"); + return true; + } + InBrackets = true; + return false; + } + if (FirstOperand.isToken() && FirstOperand.getToken() == "}") { + assert(Operands.size() == 1 && "Brackets should be by themselves"); + if (!InBrackets) { + getParser().Error(IDLoc, "Not in a packet"); + return true; + } + InBrackets = false; + if (matchBundleOptions()) + return true; + return finishBundle(IDLoc, Out); + } + MCInst *SubInst = new (getParser().getContext()) MCInst; + bool MustExtend = false; + if (matchOneInstruction(*SubInst, IDLoc, Operands, ErrorInfo, + MatchingInlineAsm, MustExtend)) + return true; + HexagonMCInstrInfo::extendIfNeeded( + getParser().getContext(), MCII, MCB, *SubInst, + HexagonMCInstrInfo::isExtended(MCII, *SubInst) || MustExtend); + MCB.addOperand(MCOperand::createInst(SubInst)); + if (!InBrackets) + return finishBundle(IDLoc, Out); + return false; +} + +/// ParseDirective parses the Hexagon specific directives +bool HexagonAsmParser::ParseDirective(AsmToken DirectiveID) { + StringRef IDVal = DirectiveID.getIdentifier(); + if ((IDVal.lower() == ".word") || (IDVal.lower() == ".4byte")) + return ParseDirectiveValue(4, DirectiveID.getLoc()); + if (IDVal.lower() == ".short" || IDVal.lower() == ".hword" || + IDVal.lower() == ".half") + return ParseDirectiveValue(2, DirectiveID.getLoc()); + if (IDVal.lower() == ".falign") + return ParseDirectiveFalign(256, DirectiveID.getLoc()); + if ((IDVal.lower() == ".lcomm") || (IDVal.lower() == ".lcommon")) + return ParseDirectiveComm(true, DirectiveID.getLoc()); + if ((IDVal.lower() == ".comm") || (IDVal.lower() == ".common")) + return ParseDirectiveComm(false, DirectiveID.getLoc()); + if (IDVal.lower() == ".subsection") + return ParseDirectiveSubsection(DirectiveID.getLoc()); + + return true; +} +bool HexagonAsmParser::ParseDirectiveSubsection(SMLoc L) { + const MCExpr *Subsection = 0; + int64_t Res; + + assert((getLexer().isNot(AsmToken::EndOfStatement)) && + "Invalid subsection directive"); + getParser().parseExpression(Subsection); + + if (!Subsection->evaluateAsAbsolute(Res)) + return Error(L, "Cannot evaluate subsection number"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + + // 0-8192 is the hard-coded range in MCObjectStreamper.cpp, this keeps the + // negative subsections together and in the same order but at the opposite + // end of the section. Only legacy hexagon-gcc created assembly code + // used negative subsections. 
+ if ((Res < 0) && (Res > -8193)) + Subsection = MCConstantExpr::create(8192 + Res, this->getContext()); + + getStreamer().SubSection(Subsection); + return false; +} + +/// ::= .falign [expression] +bool HexagonAsmParser::ParseDirectiveFalign(unsigned Size, SMLoc L) { + + int64_t MaxBytesToFill = 15; + + // if there is an arguement + if (getLexer().isNot(AsmToken::EndOfStatement)) { + const MCExpr *Value; + SMLoc ExprLoc = L; + + // Make sure we have a number (false is returned if expression is a number) + if (getParser().parseExpression(Value) == false) { + // Make sure this is a number that is in range + const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(Size, IntValue) && !isIntN(Size, IntValue)) + return Error(ExprLoc, "literal value out of range (256) for falign"); + MaxBytesToFill = IntValue; + Lex(); + } else { + return Error(ExprLoc, "not a valid expression for falign directive"); + } + } + + getTargetStreamer().emitFAlign(16, MaxBytesToFill); + Lex(); + + return false; +} + +/// ::= .word [ expression (, expression)* ] +bool HexagonAsmParser::ParseDirectiveValue(unsigned Size, SMLoc L) { + if (getLexer().isNot(AsmToken::EndOfStatement)) { + + for (;;) { + const MCExpr *Value; + SMLoc ExprLoc = L; + if (getParser().parseExpression(Value)) + return true; + + // Special case constant expressions to match code generator. + if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) { + assert(Size <= 8 && "Invalid size"); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue)) + return Error(ExprLoc, "literal value out of range for directive"); + getStreamer().EmitIntValue(IntValue, Size); + } else + getStreamer().EmitValue(Value, Size); + + if (getLexer().is(AsmToken::EndOfStatement)) + break; + + // FIXME: Improve diagnostic. + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in directive"); + Lex(); + } + } + + Lex(); + return false; +} + +// This is largely a copy of AsmParser's ParseDirectiveComm extended to +// accept a 3rd argument, AccessAlignment which indicates the smallest +// memory access made to the symbol, expressed in bytes. If no +// AccessAlignment is specified it defaults to the Alignment Value. +// Hexagon's .lcomm: +// .lcomm Symbol, Length, Alignment, AccessAlignment +bool HexagonAsmParser::ParseDirectiveComm(bool IsLocal, SMLoc Loc) { + // FIXME: need better way to detect if AsmStreamer (upstream removed + // getKind()) + if (getStreamer().hasRawTextSupport()) + return true; // Only object file output requires special treatment. + + StringRef Name; + if (getParser().parseIdentifier(Name)) + return TokError("expected identifier in directive"); + // Handle the identifier as the key symbol. 
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name); + + if (getLexer().isNot(AsmToken::Comma)) + return TokError("unexpected token in directive"); + Lex(); + + int64_t Size; + SMLoc SizeLoc = getLexer().getLoc(); + if (getParser().parseAbsoluteExpression(Size)) + return true; + + int64_t ByteAlignment = 1; + SMLoc ByteAlignmentLoc; + if (getLexer().is(AsmToken::Comma)) { + Lex(); + ByteAlignmentLoc = getLexer().getLoc(); + if (getParser().parseAbsoluteExpression(ByteAlignment)) + return true; + if (!isPowerOf2_64(ByteAlignment)) + return Error(ByteAlignmentLoc, "alignment must be a power of 2"); + } + + int64_t AccessAlignment = 0; + if (getLexer().is(AsmToken::Comma)) { + // The optional access argument specifies the size of the smallest memory + // access to be made to the symbol, expressed in bytes. + SMLoc AccessAlignmentLoc; + Lex(); + AccessAlignmentLoc = getLexer().getLoc(); + if (getParser().parseAbsoluteExpression(AccessAlignment)) + return true; + + if (!isPowerOf2_64(AccessAlignment)) + return Error(AccessAlignmentLoc, "access alignment must be a power of 2"); + } + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.comm' or '.lcomm' directive"); + + Lex(); + + // NOTE: a size of zero for a .comm should create a undefined symbol + // but a size of .lcomm creates a bss symbol of size zero. + if (Size < 0) + return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't " + "be less than zero"); + + // NOTE: The alignment in the directive is a power of 2 value, the assembler + // may internally end up wanting an alignment in bytes. + // FIXME: Diagnose overflow. + if (ByteAlignment < 0) + return Error(ByteAlignmentLoc, "invalid '.comm' or '.lcomm' directive " + "alignment, can't be less than zero"); + + if (!Sym->isUndefined()) + return Error(Loc, "invalid symbol redefinition"); + + HexagonMCELFStreamer &HexagonELFStreamer = + static_cast<HexagonMCELFStreamer &>(getStreamer()); + if (IsLocal) { + HexagonELFStreamer.HexagonMCEmitLocalCommonSymbol(Sym, Size, ByteAlignment, + AccessAlignment); + return false; + } + + HexagonELFStreamer.HexagonMCEmitCommonSymbol(Sym, Size, ByteAlignment, + AccessAlignment); + return false; +} + +// validate register against architecture +bool HexagonAsmParser::RegisterMatchesArch(unsigned MatchNum) const { + return true; +} + +// extern "C" void LLVMInitializeHexagonAsmLexer(); + +/// Force static initialization. 
+extern "C" void LLVMInitializeHexagonAsmParser() { + RegisterMCAsmParser<HexagonAsmParser> X(TheHexagonTarget); +} + +#define GET_MATCHER_IMPLEMENTATION +#define GET_REGISTER_MATCHER +#include "HexagonGenAsmMatcher.inc" + +namespace { +bool previousEqual(OperandVector &Operands, size_t Index, StringRef String) { + if (Index >= Operands.size()) + return false; + MCParsedAsmOperand &Operand = *Operands[Operands.size() - Index - 1]; + if (!Operand.isToken()) + return false; + return static_cast<HexagonOperand &>(Operand).getToken().equals_lower(String); +} +bool previousIsLoop(OperandVector &Operands, size_t Index) { + return previousEqual(Operands, Index, "loop0") || + previousEqual(Operands, Index, "loop1") || + previousEqual(Operands, Index, "sp1loop0") || + previousEqual(Operands, Index, "sp2loop0") || + previousEqual(Operands, Index, "sp3loop0"); +} +} + +bool HexagonAsmParser::splitIdentifier(OperandVector &Operands) { + AsmToken const &Token = getParser().getTok(); + StringRef String = Token.getString(); + SMLoc Loc = Token.getLoc(); + getLexer().Lex(); + do { + std::pair<StringRef, StringRef> HeadTail = String.split('.'); + if (!HeadTail.first.empty()) + Operands.push_back(HexagonOperand::CreateToken(HeadTail.first, Loc)); + if (!HeadTail.second.empty()) + Operands.push_back(HexagonOperand::CreateToken( + String.substr(HeadTail.first.size(), 1), Loc)); + String = HeadTail.second; + } while (!String.empty()); + return false; +} + +bool HexagonAsmParser::parseOperand(OperandVector &Operands) { + unsigned Register; + SMLoc Begin; + SMLoc End; + MCAsmLexer &Lexer = getLexer(); + if (!ParseRegister(Register, Begin, End)) { + if (!ErrorMissingParenthesis) + switch (Register) { + default: + break; + case Hexagon::P0: + case Hexagon::P1: + case Hexagon::P2: + case Hexagon::P3: + if (previousEqual(Operands, 0, "if")) { + if (WarnMissingParenthesis) + Warning (Begin, "Missing parenthesis around predicate register"); + static char const *LParen = "("; + static char const *RParen = ")"; + Operands.push_back(HexagonOperand::CreateToken(LParen, Begin)); + Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End)); + AsmToken MaybeDotNew = Lexer.getTok(); + if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) && + MaybeDotNew.getString().equals_lower(".new")) + splitIdentifier(Operands); + Operands.push_back(HexagonOperand::CreateToken(RParen, Begin)); + return false; + } + if (previousEqual(Operands, 0, "!") && + previousEqual(Operands, 1, "if")) { + if (WarnMissingParenthesis) + Warning (Begin, "Missing parenthesis around predicate register"); + static char const *LParen = "("; + static char const *RParen = ")"; + Operands.insert(Operands.end () - 1, + HexagonOperand::CreateToken(LParen, Begin)); + Operands.push_back(HexagonOperand::CreateReg(Register, Begin, End)); + AsmToken MaybeDotNew = Lexer.getTok(); + if (MaybeDotNew.is(AsmToken::TokenKind::Identifier) && + MaybeDotNew.getString().equals_lower(".new")) + splitIdentifier(Operands); + Operands.push_back(HexagonOperand::CreateToken(RParen, Begin)); + return false; + } + break; + } + Operands.push_back(HexagonOperand::CreateReg( + Register, Begin, End)); + return false; + } + return splitIdentifier(Operands); +} + +bool HexagonAsmParser::isLabel(AsmToken &Token) { + MCAsmLexer &Lexer = getLexer(); + AsmToken const &Second = Lexer.getTok(); + AsmToken Third = Lexer.peekTok(); + StringRef String = Token.getString(); + if (Token.is(AsmToken::TokenKind::LCurly) || + Token.is(AsmToken::TokenKind::RCurly)) + return false; + if 
(!Token.is(AsmToken::TokenKind::Identifier)) + return true; + if (!MatchRegisterName(String.lower())) + return true; + (void)Second; + assert(Second.is(AsmToken::Colon)); + StringRef Raw (String.data(), Third.getString().data() - String.data() + + Third.getString().size()); + std::string Collapsed = Raw; + Collapsed.erase(std::remove_if(Collapsed.begin(), Collapsed.end(), isspace), + Collapsed.end()); + StringRef Whole = Collapsed; + std::pair<StringRef, StringRef> DotSplit = Whole.split('.'); + if (!MatchRegisterName(DotSplit.first.lower())) + return true; + return false; +} + +bool HexagonAsmParser::handleNoncontigiousRegister(bool Contigious, SMLoc &Loc) { + if (!Contigious && ErrorNoncontigiousRegister) { + Error(Loc, "Register name is not contigious"); + return true; + } + if (!Contigious && WarnNoncontigiousRegister) + Warning(Loc, "Register name is not contigious"); + return false; +} + +bool HexagonAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) { + MCAsmLexer &Lexer = getLexer(); + StartLoc = getLexer().getLoc(); + SmallVector<AsmToken, 5> Lookahead; + StringRef RawString(Lexer.getTok().getString().data(), 0); + bool Again = Lexer.is(AsmToken::Identifier); + bool NeededWorkaround = false; + while (Again) { + AsmToken const &Token = Lexer.getTok(); + RawString = StringRef(RawString.data(), + Token.getString().data() - RawString.data () + + Token.getString().size()); + Lookahead.push_back(Token); + Lexer.Lex(); + bool Contigious = Lexer.getTok().getString().data() == + Lookahead.back().getString().data() + + Lookahead.back().getString().size(); + bool Type = Lexer.is(AsmToken::Identifier) || Lexer.is(AsmToken::Dot) || + Lexer.is(AsmToken::Integer) || Lexer.is(AsmToken::Real) || + Lexer.is(AsmToken::Colon); + bool Workaround = Lexer.is(AsmToken::Colon) || + Lookahead.back().is(AsmToken::Colon); + Again = (Contigious && Type) || (Workaround && Type); + NeededWorkaround = NeededWorkaround || (Again && !(Contigious && Type)); + } + std::string Collapsed = RawString; + Collapsed.erase(std::remove_if(Collapsed.begin(), Collapsed.end(), isspace), + Collapsed.end()); + StringRef FullString = Collapsed; + std::pair<StringRef, StringRef> DotSplit = FullString.split('.'); + unsigned DotReg = MatchRegisterName(DotSplit.first.lower()); + if (DotReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) { + if (DotSplit.second.empty()) { + RegNo = DotReg; + EndLoc = Lexer.getLoc(); + if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc)) + return true; + return false; + } else { + RegNo = DotReg; + size_t First = RawString.find('.'); + StringRef DotString (RawString.data() + First, RawString.size() - First); + Lexer.UnLex(AsmToken(AsmToken::Identifier, DotString)); + EndLoc = Lexer.getLoc(); + if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc)) + return true; + return false; + } + } + std::pair<StringRef, StringRef> ColonSplit = StringRef(FullString).split(':'); + unsigned ColonReg = MatchRegisterName(ColonSplit.first.lower()); + if (ColonReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) { + Lexer.UnLex(Lookahead.back()); + Lookahead.pop_back(); + Lexer.UnLex(Lookahead.back()); + Lookahead.pop_back(); + RegNo = ColonReg; + EndLoc = Lexer.getLoc(); + if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc)) + return true; + return false; + } + while (!Lookahead.empty()) { + Lexer.UnLex(Lookahead.back()); + Lookahead.pop_back(); + } + return true; +} + +bool HexagonAsmParser::implicitExpressionLocation(OperandVector &Operands) { + if 
(previousEqual(Operands, 0, "call")) + return true; + if (previousEqual(Operands, 0, "jump")) + if (!getLexer().getTok().is(AsmToken::Colon)) + return true; + if (previousEqual(Operands, 0, "(") && previousIsLoop(Operands, 1)) + return true; + if (previousEqual(Operands, 1, ":") && previousEqual(Operands, 2, "jump") && + (previousEqual(Operands, 0, "nt") || previousEqual(Operands, 0, "t"))) + return true; + return false; +} + +bool HexagonAsmParser::parseExpression(MCExpr const *& Expr) { + llvm::SmallVector<AsmToken, 4> Tokens; + MCAsmLexer &Lexer = getLexer(); + bool Done = false; + static char const * Comma = ","; + do { + Tokens.emplace_back (Lexer.getTok()); + Lexer.Lex(); + switch (Tokens.back().getKind()) + { + case AsmToken::TokenKind::Hash: + if (Tokens.size () > 1) + if ((Tokens.end () - 2)->getKind() == AsmToken::TokenKind::Plus) { + Tokens.insert(Tokens.end() - 2, + AsmToken(AsmToken::TokenKind::Comma, Comma)); + Done = true; + } + break; + case AsmToken::TokenKind::RCurly: + case AsmToken::TokenKind::EndOfStatement: + case AsmToken::TokenKind::Eof: + Done = true; + break; + default: + break; + } + } while (!Done); + while (!Tokens.empty()) { + Lexer.UnLex(Tokens.back()); + Tokens.pop_back(); + } + return getParser().parseExpression(Expr); +} + +bool HexagonAsmParser::parseExpressionOrOperand(OperandVector &Operands) { + if (implicitExpressionLocation(Operands)) { + MCAsmParser &Parser = getParser(); + SMLoc Loc = Parser.getLexer().getLoc(); + std::unique_ptr<HexagonOperand> Expr = + HexagonOperand::CreateImm(nullptr, Loc, Loc); + MCExpr const *& Val = Expr->Imm.Val; + Operands.push_back(std::move(Expr)); + return parseExpression(Val); + } + return parseOperand(Operands); +} + +/// Parse an instruction. +bool HexagonAsmParser::parseInstruction(OperandVector &Operands) { + MCAsmParser &Parser = getParser(); + MCAsmLexer &Lexer = getLexer(); + while (true) { + AsmToken const &Token = Parser.getTok(); + switch (Token.getKind()) { + case AsmToken::EndOfStatement: { + Lexer.Lex(); + return false; + } + case AsmToken::LCurly: { + if (!Operands.empty()) + return true; + Operands.push_back( + HexagonOperand::CreateToken(Token.getString(), Token.getLoc())); + Lexer.Lex(); + return false; + } + case AsmToken::RCurly: { + if (Operands.empty()) { + Operands.push_back( + HexagonOperand::CreateToken(Token.getString(), Token.getLoc())); + Lexer.Lex(); + } + return false; + } + case AsmToken::Comma: { + Lexer.Lex(); + continue; + } + case AsmToken::EqualEqual: + case AsmToken::ExclaimEqual: + case AsmToken::GreaterEqual: + case AsmToken::GreaterGreater: + case AsmToken::LessEqual: + case AsmToken::LessLess: { + Operands.push_back(HexagonOperand::CreateToken( + Token.getString().substr(0, 1), Token.getLoc())); + Operands.push_back(HexagonOperand::CreateToken( + Token.getString().substr(1, 1), Token.getLoc())); + Lexer.Lex(); + continue; + } + case AsmToken::Hash: { + bool MustNotExtend = false; + bool ImplicitExpression = implicitExpressionLocation(Operands); + std::unique_ptr<HexagonOperand> Expr = HexagonOperand::CreateImm( + nullptr, Lexer.getLoc(), Lexer.getLoc()); + if (!ImplicitExpression) + Operands.push_back( + HexagonOperand::CreateToken(Token.getString(), Token.getLoc())); + Lexer.Lex(); + bool MustExtend = false; + bool HiOnly = false; + bool LoOnly = false; + if (Lexer.is(AsmToken::Hash)) { + Lexer.Lex(); + MustExtend = true; + } else if (ImplicitExpression) + MustNotExtend = true; + AsmToken const &Token = Parser.getTok(); + if (Token.is(AsmToken::Identifier)) { + StringRef String 
= Token.getString(); + AsmToken IDToken = Token; + if (String.lower() == "hi") { + HiOnly = true; + } else if (String.lower() == "lo") { + LoOnly = true; + } + if (HiOnly || LoOnly) { + AsmToken LParen = Lexer.peekTok(); + if (!LParen.is(AsmToken::LParen)) { + HiOnly = false; + LoOnly = false; + } else { + Lexer.Lex(); + } + } + } + if (parseExpression(Expr->Imm.Val)) + return true; + int64_t Value; + MCContext &Context = Parser.getContext(); + assert(Expr->Imm.Val != nullptr); + if (Expr->Imm.Val->evaluateAsAbsolute(Value)) { + if (HiOnly) + Expr->Imm.Val = MCBinaryExpr::createLShr( + Expr->Imm.Val, MCConstantExpr::create(16, Context), Context); + if (HiOnly || LoOnly) + Expr->Imm.Val = MCBinaryExpr::createAnd( + Expr->Imm.Val, MCConstantExpr::create(0xffff, Context), Context); + } + if (MustNotExtend) + Expr->Imm.Val = HexagonNoExtendOperand::Create(Expr->Imm.Val, Context); + Expr->Imm.MustExtend = MustExtend; + Operands.push_back(std::move(Expr)); + continue; + } + default: + break; + } + if (parseExpressionOrOperand(Operands)) + return true; + } +} + +bool HexagonAsmParser::ParseInstruction(ParseInstructionInfo &Info, + StringRef Name, + AsmToken ID, + OperandVector &Operands) { + getLexer().UnLex(ID); + return parseInstruction(Operands); +} + +namespace { +MCInst makeCombineInst(int opCode, MCOperand &Rdd, + MCOperand &MO1, MCOperand &MO2) { + MCInst TmpInst; + TmpInst.setOpcode(opCode); + TmpInst.addOperand(Rdd); + TmpInst.addOperand(MO1); + TmpInst.addOperand(MO2); + + return TmpInst; +} +} + +// Define this matcher function after the auto-generated include so we +// have the match class enum definitions. +unsigned HexagonAsmParser::validateTargetOperandClass(MCParsedAsmOperand &AsmOp, + unsigned Kind) { + HexagonOperand *Op = static_cast<HexagonOperand *>(&AsmOp); + + switch (Kind) { + case MCK_0: { + int64_t Value; + return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == 0 + ? Match_Success + : Match_InvalidOperand; + } + case MCK_1: { + int64_t Value; + return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == 1 + ? Match_Success + : Match_InvalidOperand; + } + case MCK__MINUS_1: { + int64_t Value; + return Op->isImm() && Op->Imm.Val->evaluateAsAbsolute(Value) && Value == -1 + ? 
Match_Success + : Match_InvalidOperand; + } + } + if (Op->Kind == HexagonOperand::Token && Kind != InvalidMatchClass) { + StringRef myStringRef = StringRef(Op->Tok.Data, Op->Tok.Length); + if (matchTokenString(myStringRef.lower()) == (MatchClassKind)Kind) + return Match_Success; + if (matchTokenString(myStringRef.upper()) == (MatchClassKind)Kind) + return Match_Success; + } + + DEBUG(dbgs() << "Unmatched Operand:"); + DEBUG(Op->dump()); + DEBUG(dbgs() << "\n"); + + return Match_InvalidOperand; +} + +void HexagonAsmParser::OutOfRange(SMLoc IDLoc, long long Val, long long Max) { + std::string errStr; + raw_string_ostream ES(errStr); + ES << "value " << Val << "(" << format_hex(Val, 0) << ") out of range: "; + if (Max >= 0) + ES << "0-" << Max; + else + ES << Max << "-" << (-Max - 1); + Error(IDLoc, ES.str().c_str()); +} + +int HexagonAsmParser::processInstruction(MCInst &Inst, + OperandVector const &Operands, + SMLoc IDLoc, bool &MustExtend) { + MCContext &Context = getParser().getContext(); + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + std::string r = "r"; + std::string v = "v"; + std::string Colon = ":"; + + bool is32bit = false; // used to distinguish between CONST32 and CONST64 + switch (Inst.getOpcode()) { + default: + break; + + case Hexagon::M4_mpyrr_addr: + case Hexagon::S4_addi_asl_ri: + case Hexagon::S4_addi_lsr_ri: + case Hexagon::S4_andi_asl_ri: + case Hexagon::S4_andi_lsr_ri: + case Hexagon::S4_ori_asl_ri: + case Hexagon::S4_ori_lsr_ri: + case Hexagon::S4_or_andix: + case Hexagon::S4_subi_asl_ri: + case Hexagon::S4_subi_lsr_ri: { + MCOperand &Ry = Inst.getOperand(0); + MCOperand &src = Inst.getOperand(2); + if (RI->getEncodingValue(Ry.getReg()) != RI->getEncodingValue(src.getReg())) + return Match_InvalidOperand; + break; + } + + case Hexagon::C2_cmpgei: { + MCOperand &MO = Inst.getOperand(2); + MO.setExpr(MCBinaryExpr::createSub( + MO.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::C2_cmpgti); + break; + } + + case Hexagon::C2_cmpgeui: { + MCOperand &MO = Inst.getOperand(2); + int64_t Value; + bool Success = MO.getExpr()->evaluateAsAbsolute(Value); + (void)Success; + assert(Success && "Assured by matcher"); + if (Value == 0) { + MCInst TmpInst; + MCOperand &Pd = Inst.getOperand(0); + MCOperand &Rt = Inst.getOperand(1); + TmpInst.setOpcode(Hexagon::C2_cmpeq); + TmpInst.addOperand(Pd); + TmpInst.addOperand(Rt); + TmpInst.addOperand(Rt); + Inst = TmpInst; + } else { + MO.setExpr(MCBinaryExpr::createSub( + MO.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::C2_cmpgtui); + } + break; + } + case Hexagon::J2_loop1r: + case Hexagon::J2_loop1i: + case Hexagon::J2_loop0r: + case Hexagon::J2_loop0i: { + MCOperand &MO = Inst.getOperand(0); + // Loop has different opcodes for extended vs not extended, but we should + // not use the other opcode as it is a legacy artifact of TD files. + int64_t Value; + if (MO.getExpr()->evaluateAsAbsolute(Value)) { + // if the operand can fit within a 7:2 field + if (Value < (1 << 8) && Value >= -(1 << 8)) { + SMLoc myLoc = Operands[2]->getStartLoc(); + // # is left in startLoc in the case of ## + // If '##' found then force extension. + if (*myLoc.getPointer() == '#') { + MustExtend = true; + break; + } + } else { + // If immediate and out of 7:2 range. 
+ MustExtend = true; + } + } + break; + } + + // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)" + case Hexagon::A2_tfrp: { + MCOperand &MO = Inst.getOperand(1); + unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + MO.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. + std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst.setOpcode(Hexagon::A2_combinew); + break; + } + + case Hexagon::A2_tfrpt: + case Hexagon::A2_tfrpf: { + MCOperand &MO = Inst.getOperand(2); + unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + MO.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. + std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt) + ? Hexagon::C2_ccombinewt + : Hexagon::C2_ccombinewf); + break; + } + case Hexagon::A2_tfrptnew: + case Hexagon::A2_tfrpfnew: { + MCOperand &MO = Inst.getOperand(2); + unsigned int RegPairNum = RI->getEncodingValue(MO.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + MO.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. + std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + Inst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew) + ? Hexagon::C2_ccombinewnewt + : Hexagon::C2_ccombinewnewf); + break; + } + + // Translate a "$Rx = CONST32(#imm)" to "$Rx = memw(gp+#LABEL) " + case Hexagon::CONST32: + case Hexagon::CONST32_Float_Real: + case Hexagon::CONST32_Int_Real: + case Hexagon::FCONST32_nsdata: + is32bit = true; + // Translate a "$Rx:y = CONST64(#imm)" to "$Rx:y = memd(gp+#LABEL) " + case Hexagon::CONST64_Float_Real: + case Hexagon::CONST64_Int_Real: + + // FIXME: need better way to detect AsmStreamer (upstream removed getKind()) + if (!Parser.getStreamer().hasRawTextSupport()) { + MCELFStreamer *MES = static_cast<MCELFStreamer *>(&Parser.getStreamer()); + MCOperand &MO_1 = Inst.getOperand(1); + MCOperand &MO_0 = Inst.getOperand(0); + + // push section onto section stack + MES->PushSection(); + + std::string myCharStr; + MCSectionELF *mySection; + + // check if this as an immediate or a symbol + int64_t Value; + bool Absolute = MO_1.getExpr()->evaluateAsAbsolute(Value); + if (Absolute) { + // Create a new section - one for each constant + // Some or all of the zeros are replaced with the given immediate. 
+ if (is32bit) { + std::string myImmStr = utohexstr(static_cast<uint32_t>(Value)); + myCharStr = StringRef(".gnu.linkonce.l4.CONST_00000000") + .drop_back(myImmStr.size()) + .str() + + myImmStr; + } else { + std::string myImmStr = utohexstr(Value); + myCharStr = StringRef(".gnu.linkonce.l8.CONST_0000000000000000") + .drop_back(myImmStr.size()) + .str() + + myImmStr; + } + + mySection = getContext().getELFSection(myCharStr, ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE); + } else if (MO_1.isExpr()) { + // .lita - for expressions + myCharStr = ".lita"; + mySection = getContext().getELFSection(myCharStr, ELF::SHT_PROGBITS, + ELF::SHF_ALLOC | ELF::SHF_WRITE); + } else + llvm_unreachable("unexpected type of machine operand!"); + + MES->SwitchSection(mySection); + unsigned byteSize = is32bit ? 4 : 8; + getStreamer().EmitCodeAlignment(byteSize, byteSize); + + MCSymbol *Sym; + + // for symbols, get rid of prepended ".gnu.linkonce.lx." + + // emit symbol if needed + if (Absolute) { + Sym = getContext().getOrCreateSymbol(StringRef(myCharStr.c_str() + 16)); + if (Sym->isUndefined()) { + getStreamer().EmitLabel(Sym); + getStreamer().EmitSymbolAttribute(Sym, MCSA_Global); + getStreamer().EmitIntValue(Value, byteSize); + } + } else if (MO_1.isExpr()) { + const char *StringStart = 0; + const char *StringEnd = 0; + if (*Operands[4]->getStartLoc().getPointer() == '#') { + StringStart = Operands[5]->getStartLoc().getPointer(); + StringEnd = Operands[6]->getStartLoc().getPointer(); + } else { // no pound + StringStart = Operands[4]->getStartLoc().getPointer(); + StringEnd = Operands[5]->getStartLoc().getPointer(); + } + + unsigned size = StringEnd - StringStart; + std::string DotConst = ".CONST_"; + Sym = getContext().getOrCreateSymbol(DotConst + + StringRef(StringStart, size)); + + if (Sym->isUndefined()) { + // case where symbol is not yet defined: emit symbol + getStreamer().EmitLabel(Sym); + getStreamer().EmitSymbolAttribute(Sym, MCSA_Local); + getStreamer().EmitValue(MO_1.getExpr(), 4); + } + } else + llvm_unreachable("unexpected type of machine operand!"); + + MES->PopSection(); + + if (Sym) { + MCInst TmpInst; + if (is32bit) // 32 bit + TmpInst.setOpcode(Hexagon::L2_loadrigp); + else // 64 bit + TmpInst.setOpcode(Hexagon::L2_loadrdgp); + + TmpInst.addOperand(MO_0); + TmpInst.addOperand( + MCOperand::createExpr(MCSymbolRefExpr::create(Sym, getContext()))); + Inst = TmpInst; + } + } + break; + + // Translate a "$Rdd = #-imm" to "$Rdd = combine(#[-1,0], #-imm)" + case Hexagon::A2_tfrpi: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &MO = Inst.getOperand(1); + int64_t Value; + int sVal = (MO.getExpr()->evaluateAsAbsolute(Value) && Value < 0) ? 
-1 : 0; + MCOperand imm(MCOperand::createExpr(MCConstantExpr::create(sVal, Context))); + Inst = makeCombineInst(Hexagon::A2_combineii, Rdd, imm, MO); + break; + } + + // Translate a "$Rdd = [#]#imm" to "$Rdd = combine(#, [#]#imm)" + case Hexagon::TFRI64_V4: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &MO = Inst.getOperand(1); + int64_t Value; + if (MO.getExpr()->evaluateAsAbsolute(Value)) { + unsigned long long u64 = Value; + signed int s8 = (u64 >> 32) & 0xFFFFFFFF; + if (s8 < -128 || s8 > 127) + OutOfRange(IDLoc, s8, -128); + MCOperand imm(MCOperand::createExpr( + MCConstantExpr::create(s8, Context))); // upper 32 + MCOperand imm2(MCOperand::createExpr( + MCConstantExpr::create(u64 & 0xFFFFFFFF, Context))); // lower 32 + Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, imm2); + } else { + MCOperand imm(MCOperand::createExpr( + MCConstantExpr::create(0, Context))); // upper 32 + Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, imm, MO); + } + break; + } + + // Handle $Rdd = combine(##imm, #imm)" + case Hexagon::TFRI64_V2_ext: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &MO1 = Inst.getOperand(1); + MCOperand &MO2 = Inst.getOperand(2); + int64_t Value; + if (MO2.getExpr()->evaluateAsAbsolute(Value)) { + int s8 = Value; + if (s8 < -128 || s8 > 127) + OutOfRange(IDLoc, s8, -128); + } + Inst = makeCombineInst(Hexagon::A2_combineii, Rdd, MO1, MO2); + break; + } + + // Handle $Rdd = combine(#imm, ##imm)" + case Hexagon::A4_combineii: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &MO1 = Inst.getOperand(1); + int64_t Value; + if (MO1.getExpr()->evaluateAsAbsolute(Value)) { + int s8 = Value; + if (s8 < -128 || s8 > 127) + OutOfRange(IDLoc, s8, -128); + } + MCOperand &MO2 = Inst.getOperand(2); + Inst = makeCombineInst(Hexagon::A4_combineii, Rdd, MO1, MO2); + break; + } + + case Hexagon::S2_tableidxb_goodsyntax: { + Inst.setOpcode(Hexagon::S2_tableidxb); + break; + } + + case Hexagon::S2_tableidxh_goodsyntax: { + MCInst TmpInst; + MCOperand &Rx = Inst.getOperand(0); + MCOperand &_dst_ = Inst.getOperand(1); + MCOperand &Rs = Inst.getOperand(2); + MCOperand &Imm4 = Inst.getOperand(3); + MCOperand &Imm6 = Inst.getOperand(4); + Imm6.setExpr(MCBinaryExpr::createSub( + Imm6.getExpr(), MCConstantExpr::create(1, Context), Context)); + TmpInst.setOpcode(Hexagon::S2_tableidxh); + TmpInst.addOperand(Rx); + TmpInst.addOperand(_dst_); + TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm4); + TmpInst.addOperand(Imm6); + Inst = TmpInst; + break; + } + + case Hexagon::S2_tableidxw_goodsyntax: { + MCInst TmpInst; + MCOperand &Rx = Inst.getOperand(0); + MCOperand &_dst_ = Inst.getOperand(1); + MCOperand &Rs = Inst.getOperand(2); + MCOperand &Imm4 = Inst.getOperand(3); + MCOperand &Imm6 = Inst.getOperand(4); + Imm6.setExpr(MCBinaryExpr::createSub( + Imm6.getExpr(), MCConstantExpr::create(2, Context), Context)); + TmpInst.setOpcode(Hexagon::S2_tableidxw); + TmpInst.addOperand(Rx); + TmpInst.addOperand(_dst_); + TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm4); + TmpInst.addOperand(Imm6); + Inst = TmpInst; + break; + } + + case Hexagon::S2_tableidxd_goodsyntax: { + MCInst TmpInst; + MCOperand &Rx = Inst.getOperand(0); + MCOperand &_dst_ = Inst.getOperand(1); + MCOperand &Rs = Inst.getOperand(2); + MCOperand &Imm4 = Inst.getOperand(3); + MCOperand &Imm6 = Inst.getOperand(4); + Imm6.setExpr(MCBinaryExpr::createSub( + Imm6.getExpr(), MCConstantExpr::create(3, Context), Context)); + TmpInst.setOpcode(Hexagon::S2_tableidxd); + TmpInst.addOperand(Rx); + TmpInst.addOperand(_dst_); + 
TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm4); + TmpInst.addOperand(Imm6); + Inst = TmpInst; + break; + } + + case Hexagon::M2_mpyui: { + Inst.setOpcode(Hexagon::M2_mpyi); + break; + } + case Hexagon::M2_mpysmi: { + MCInst TmpInst; + MCOperand &Rd = Inst.getOperand(0); + MCOperand &Rs = Inst.getOperand(1); + MCOperand &Imm = Inst.getOperand(2); + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (!MustExtend) { + if (Value < 0 && Value > -256) { + Imm.setExpr(MCConstantExpr::create(Value * -1, Context)); + TmpInst.setOpcode(Hexagon::M2_mpysin); + } else if (Value < 256 && Value >= 0) + TmpInst.setOpcode(Hexagon::M2_mpysip); + else + return Match_InvalidOperand; + } else { + if (Value >= 0) + TmpInst.setOpcode(Hexagon::M2_mpysip); + else + return Match_InvalidOperand; + } + TmpInst.addOperand(Rd); + TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm); + Inst = TmpInst; + break; + } + + case Hexagon::S2_asr_i_r_rnd_goodsyntax: { + MCOperand &Imm = Inst.getOperand(2); + MCInst TmpInst; + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (Value == 0) { // convert to $Rd = $Rs + TmpInst.setOpcode(Hexagon::A2_tfr); + MCOperand &Rd = Inst.getOperand(0); + MCOperand &Rs = Inst.getOperand(1); + TmpInst.addOperand(Rd); + TmpInst.addOperand(Rs); + } else { + Imm.setExpr(MCBinaryExpr::createSub( + Imm.getExpr(), MCConstantExpr::create(1, Context), Context)); + TmpInst.setOpcode(Hexagon::S2_asr_i_r_rnd); + MCOperand &Rd = Inst.getOperand(0); + MCOperand &Rs = Inst.getOperand(1); + TmpInst.addOperand(Rd); + TmpInst.addOperand(Rs); + TmpInst.addOperand(Imm); + } + Inst = TmpInst; + break; + } + + case Hexagon::S2_asr_i_p_rnd_goodsyntax: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &Rss = Inst.getOperand(1); + MCOperand &Imm = Inst.getOperand(2); + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (Value == 0) { // convert to $Rdd = combine ($Rs[0], $Rs[1]) + MCInst TmpInst; + unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + Rss.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. 
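+      // Worked example (editorial illustration, not part of the original
+      // patch): a double register encodes as its even base register, so for
+      // Rss = r3:2 getEncodingValue() yields 2, R1 above becomes "r3" (the
+      // high word) and R2 below becomes "r2" (the low word); the zero-shift
+      // rounding form therefore lowers to "$Rdd = combine(r3, r2)", i.e. a
+      // plain copy of the source pair.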
+ std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + TmpInst.setOpcode(Hexagon::A2_combinew); + TmpInst.addOperand(Rdd); + TmpInst.addOperand(Rss); + TmpInst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst = TmpInst; + } else { + Imm.setExpr(MCBinaryExpr::createSub( + Imm.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::S2_asr_i_p_rnd); + } + break; + } + + case Hexagon::A4_boundscheck: { + MCOperand &Rs = Inst.getOperand(1); + unsigned int RegNum = RI->getEncodingValue(Rs.getReg()); + if (RegNum & 1) { // Odd mapped to raw:hi, regpair is rodd:odd-1, like r3:2 + Inst.setOpcode(Hexagon::A4_boundscheck_hi); + std::string Name = + r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rs.setReg(MatchRegisterName(RegPair)); + } else { // raw:lo + Inst.setOpcode(Hexagon::A4_boundscheck_lo); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rs.setReg(MatchRegisterName(RegPair)); + } + break; + } + + case Hexagon::A2_addsp: { + MCOperand &Rs = Inst.getOperand(1); + unsigned int RegNum = RI->getEncodingValue(Rs.getReg()); + if (RegNum & 1) { // Odd mapped to raw:hi + Inst.setOpcode(Hexagon::A2_addsph); + std::string Name = + r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rs.setReg(MatchRegisterName(RegPair)); + } else { // Even mapped raw:lo + Inst.setOpcode(Hexagon::A2_addspl); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rs.setReg(MatchRegisterName(RegPair)); + } + break; + } + + case Hexagon::M2_vrcmpys_s1: { + MCOperand &Rt = Inst.getOperand(2); + unsigned int RegNum = RI->getEncodingValue(Rt.getReg()); + if (RegNum & 1) { // Odd mapped to sat:raw:hi + Inst.setOpcode(Hexagon::M2_vrcmpys_s1_h); + std::string Name = + r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } else { // Even mapped sat:raw:lo + Inst.setOpcode(Hexagon::M2_vrcmpys_s1_l); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } + break; + } + + case Hexagon::M2_vrcmpys_acc_s1: { + MCInst TmpInst; + MCOperand &Rxx = Inst.getOperand(0); + MCOperand &Rss = Inst.getOperand(2); + MCOperand &Rt = Inst.getOperand(3); + unsigned int RegNum = RI->getEncodingValue(Rt.getReg()); + if (RegNum & 1) { // Odd mapped to sat:raw:hi + TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_h); + std::string Name = + r + llvm::utostr_32(RegNum) + Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } else { // Even mapped sat:raw:lo + TmpInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_l); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } + // Registers are in different positions + TmpInst.addOperand(Rxx); + TmpInst.addOperand(Rxx); + TmpInst.addOperand(Rss); + TmpInst.addOperand(Rt); + Inst = TmpInst; + break; + } + + case Hexagon::M2_vrcmpys_s1rp: { + MCOperand &Rt = Inst.getOperand(2); + unsigned int RegNum = RI->getEncodingValue(Rt.getReg()); + if (RegNum & 1) { // Odd mapped to rnd:sat:raw:hi + Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_h); + std::string Name = + r + llvm::utostr_32(RegNum) + 
Colon + llvm::utostr_32(RegNum - 1); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } else { // Even mapped rnd:sat:raw:lo + Inst.setOpcode(Hexagon::M2_vrcmpys_s1rp_l); + std::string Name = + r + llvm::utostr_32(RegNum + 1) + Colon + llvm::utostr_32(RegNum); + StringRef RegPair = Name; + Rt.setReg(MatchRegisterName(RegPair)); + } + break; + } + + case Hexagon::S5_asrhub_rnd_sat_goodsyntax: { + MCOperand &Imm = Inst.getOperand(2); + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (Value == 0) + Inst.setOpcode(Hexagon::S2_vsathub); + else { + Imm.setExpr(MCBinaryExpr::createSub( + Imm.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::S5_asrhub_rnd_sat); + } + break; + } + + case Hexagon::S5_vasrhrnd_goodsyntax: { + MCOperand &Rdd = Inst.getOperand(0); + MCOperand &Rss = Inst.getOperand(1); + MCOperand &Imm = Inst.getOperand(2); + int64_t Value; + bool Absolute = Imm.getExpr()->evaluateAsAbsolute(Value); + assert(Absolute); + (void)Absolute; + if (Value == 0) { + MCInst TmpInst; + unsigned int RegPairNum = RI->getEncodingValue(Rss.getReg()); + std::string R1 = r + llvm::utostr_32(RegPairNum + 1); + StringRef Reg1(R1); + Rss.setReg(MatchRegisterName(Reg1)); + // Add a new operand for the second register in the pair. + std::string R2 = r + llvm::utostr_32(RegPairNum); + StringRef Reg2(R2); + TmpInst.setOpcode(Hexagon::A2_combinew); + TmpInst.addOperand(Rdd); + TmpInst.addOperand(Rss); + TmpInst.addOperand(MCOperand::createReg(MatchRegisterName(Reg2))); + Inst = TmpInst; + } else { + Imm.setExpr(MCBinaryExpr::createSub( + Imm.getExpr(), MCConstantExpr::create(1, Context), Context)); + Inst.setOpcode(Hexagon::S5_vasrhrnd); + } + break; + } + + case Hexagon::A2_not: { + MCInst TmpInst; + MCOperand &Rd = Inst.getOperand(0); + MCOperand &Rs = Inst.getOperand(1); + TmpInst.setOpcode(Hexagon::A2_subri); + TmpInst.addOperand(Rd); + TmpInst.addOperand( + MCOperand::createExpr(MCConstantExpr::create(-1, Context))); + TmpInst.addOperand(Rs); + Inst = TmpInst; + break; + } + } // switch + + return Match_Success; +} diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp index cb7e633..ea96eb0 100644 --- a/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp +++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.cpp @@ -868,7 +868,7 @@ void BT::visitNonBranch(const MachineInstr *MI) { continue; bool Changed = false; - if (!Eval || !ResMap.has(RD.Reg)) { + if (!Eval || ResMap.count(RD.Reg) == 0) { // Set to "ref" (aka "bottom"). uint16_t DefBW = ME.getRegBitWidth(RD); RegisterCell RefC = RegisterCell::self(RD.Reg, DefBW); @@ -951,11 +951,11 @@ void BT::visitBranchesFrom(const MachineInstr *BI) { // be processed. for (succ_iterator I = B.succ_begin(), E = B.succ_end(); I != E; ++I) { const MachineBasicBlock *SB = *I; - if (SB->isLandingPad()) + if (SB->isEHPad()) Targets.insert(SB); } if (FallsThrough) { - MachineFunction::const_iterator BIt = &B; + MachineFunction::const_iterator BIt = B.getIterator(); MachineFunction::const_iterator Next = std::next(BIt); if (Next != MF.end()) Targets.insert(&*Next); @@ -1005,7 +1005,7 @@ void BT::put(RegisterRef RR, const RegisterCell &RC) { // Replace all references to bits from OldRR with the corresponding bits // in NewRR. 
void BT::subst(RegisterRef OldRR, RegisterRef NewRR) { - assert(Map.has(OldRR.Reg) && "OldRR not present in map"); + assert(Map.count(OldRR.Reg) > 0 && "OldRR not present in map"); BitMask OM = ME.mask(OldRR.Reg, OldRR.Sub); BitMask NM = ME.mask(NewRR.Reg, NewRR.Sub); uint16_t OMB = OM.first(), OME = OM.last(); @@ -1104,9 +1104,9 @@ void BT::run() { } // If block end has been reached, add the fall-through edge to the queue. if (It == End) { - MachineFunction::const_iterator BIt = &B; + MachineFunction::const_iterator BIt = B.getIterator(); MachineFunction::const_iterator Next = std::next(BIt); - if (Next != MF.end()) { + if (Next != MF.end() && B.isSuccessor(&*Next)) { int ThisN = B.getNumber(); int NextN = Next->getNumber(); FlowQ.push(CFGEdge(ThisN, NextN)); diff --git a/contrib/llvm/lib/Target/Hexagon/BitTracker.h b/contrib/llvm/lib/Target/Hexagon/BitTracker.h index ed002a7..959c831 100644 --- a/contrib/llvm/lib/Target/Hexagon/BitTracker.h +++ b/contrib/llvm/lib/Target/Hexagon/BitTracker.h @@ -36,9 +36,7 @@ struct BitTracker { typedef SetVector<const MachineBasicBlock *> BranchTargetList; - struct CellMapType : public std::map<unsigned,RegisterCell> { - bool has(unsigned Reg) const; - }; + typedef std::map<unsigned, RegisterCell> CellMapType; BitTracker(const MachineEvaluator &E, MachineFunction &F); ~BitTracker(); @@ -79,7 +77,6 @@ private: // Abstraction of a reference to bit at position Pos from a register Reg. struct BitTracker::BitRef { BitRef(unsigned R = 0, uint16_t P = 0) : Reg(R), Pos(P) {} - BitRef(const BitRef &BR) : Reg(BR.Reg), Pos(BR.Pos) {} bool operator== (const BitRef &BR) const { // If Reg is 0, disregard Pos. return Reg == BR.Reg && (Reg == 0 || Pos == BR.Pos); @@ -146,7 +143,6 @@ struct BitTracker::BitValue { BitValue(ValueType T = Top) : Type(T) {} BitValue(bool B) : Type(B ? One : Zero) {} - BitValue(const BitValue &V) : Type(V.Type), RefI(V.RefI) {} BitValue(unsigned Reg, uint16_t Pos) : Type(Ref), RefI(Reg, Pos) {} bool operator== (const BitValue &V) const { @@ -279,11 +275,6 @@ struct BitTracker::RegisterCell { return !operator==(RC); } - const RegisterCell &operator=(const RegisterCell &RC) { - Bits = RC.Bits; - return *this; - } - // Generate a "ref" cell for the corresponding register. In the resulting // cell each bit will be described as being the same as the corresponding // bit in register Reg (i.e. the cell is "defined" by register Reg). @@ -344,11 +335,6 @@ BitTracker::RegisterCell::ref(const RegisterCell &C) { return RC; } - -inline bool BitTracker::CellMapType::has(unsigned Reg) const { - return find(Reg) != end(); -} - // A class to evaluate target's instructions and update the cell maps. // This is used internally by the bit tracker. 
A target that wants to // utilize this should implement the evaluation functions (noted below) diff --git a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 9cc1e94..4a9c341 100644 --- a/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/contrib/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -7,42 +7,45 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "hexagon-disassembler" + #include "Hexagon.h" #include "MCTargetDesc/HexagonBaseInfo.h" -#include "MCTargetDesc/HexagonMCInstrInfo.h" +#include "MCTargetDesc/HexagonMCChecker.h" #include "MCTargetDesc/HexagonMCTargetDesc.h" - -#include "llvm/MC/MCContext.h" +#include "MCTargetDesc/HexagonMCInstrInfo.h" +#include "MCTargetDesc/HexagonInstPrinter.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixedLenDisassembler.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LEB128.h" -#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/MemoryObject.h" #include "llvm/Support/raw_ostream.h" -#include <array> +#include "llvm/Support/TargetRegistry.h" #include <vector> using namespace llvm; using namespace Hexagon; -#define DEBUG_TYPE "hexagon-disassembler" - -// Pull DecodeStatus and its enum values into the global namespace. -typedef llvm::MCDisassembler::DecodeStatus DecodeStatus; +typedef MCDisassembler::DecodeStatus DecodeStatus; namespace { /// \brief Hexagon disassembler for all Hexagon platforms. class HexagonDisassembler : public MCDisassembler { public: + std::unique_ptr<MCInstrInfo const> const MCII; std::unique_ptr<MCInst *> CurrentBundle; - HexagonDisassembler(MCSubtargetInfo const &STI, MCContext &Ctx) - : MCDisassembler(STI, Ctx), CurrentBundle(new MCInst *) {} + HexagonDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, + MCInstrInfo const *MCII) + : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(new MCInst *) {} DecodeStatus getSingleInstruction(MCInst &Instr, MCInst &MCB, ArrayRef<uint8_t> Bytes, uint64_t Address, @@ -52,23 +55,57 @@ public: ArrayRef<uint8_t> Bytes, uint64_t Address, raw_ostream &VStream, raw_ostream &CStream) const override; + + void adjustExtendedInstructions(MCInst &MCI, MCInst const &MCB) const; + void addSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) const; }; } -static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, +// Forward declare these because the auto-generated code will reference them. +// Definitions are further down. 
+ +static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); +static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder); static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, - void const *Decoder); + const void *Decoder); + +static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn); +static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn, + void const *Decoder); static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op, raw_ostream &os); -static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst); +static unsigned getRegFromSubinstEncoding(unsigned encoded_reg); + +static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp, + uint64_t Address, const void *Decoder); static DecodeStatus s16ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, const void *Decoder); static DecodeStatus s12ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, @@ -95,129 +132,19 @@ static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, const void *Decoder); static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, const void *Decoder); - -static const uint16_t IntRegDecoderTable[] = { - Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, - Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9, - Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14, - Hexagon::R15, Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19, - Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23, Hexagon::R24, - Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29, - Hexagon::R30, Hexagon::R31}; - -static const uint16_t PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, - Hexagon::P2, Hexagon::P3}; - -static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, - const uint16_t Table[], size_t Size) { - if (RegNo < Size) { - Inst.addOperand(MCOperand::createReg(Table[RegNo])); - return MCDisassembler::Success; - } else - return MCDisassembler::Fail; -} - -static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - void const *Decoder) { - if (RegNo > 31) - return MCDisassembler::Fail; - - unsigned Register = IntRegDecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { - static const uint16_t CtrlRegDecoderTable[] = { - Hexagon::SA0, 
Hexagon::LC0, Hexagon::SA1, Hexagon::LC1, - Hexagon::P3_0, Hexagon::NoRegister, Hexagon::C6, Hexagon::C7, - Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP, - Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPCH}; - - if (RegNo >= sizeof(CtrlRegDecoderTable) / sizeof(CtrlRegDecoderTable[0])) - return MCDisassembler::Fail; - - if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister) - return MCDisassembler::Fail; - - unsigned Register = CtrlRegDecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - void const *Decoder) { - static const uint16_t CtrlReg64DecoderTable[] = { - Hexagon::C1_0, Hexagon::NoRegister, Hexagon::C3_2, - Hexagon::NoRegister, Hexagon::NoRegister, Hexagon::NoRegister, - Hexagon::C7_6, Hexagon::NoRegister, Hexagon::C9_8, - Hexagon::NoRegister, Hexagon::C11_10, Hexagon::NoRegister, - Hexagon::CS, Hexagon::NoRegister, Hexagon::UPC, - Hexagon::NoRegister}; - - if (RegNo >= sizeof(CtrlReg64DecoderTable) / sizeof(CtrlReg64DecoderTable[0])) - return MCDisassembler::Fail; - - if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister) - return MCDisassembler::Fail; - - unsigned Register = CtrlReg64DecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { - unsigned Register = 0; - switch (RegNo) { - case 0: - Register = Hexagon::M0; - break; - case 1: - Register = Hexagon::M1; - break; - default: - return MCDisassembler::Fail; - } - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} - -static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - const void *Decoder) { - static const uint16_t DoubleRegDecoderTable[] = { - Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3, - Hexagon::D4, Hexagon::D5, Hexagon::D6, Hexagon::D7, - Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11, - Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15}; - - return (DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable, - sizeof(DoubleRegDecoderTable))); -} - -static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t /*Address*/, - void const *Decoder) { - if (RegNo > 3) - return MCDisassembler::Fail; - - unsigned Register = PredRegDecoderTable[RegNo]; - Inst.addOperand(MCOperand::createReg(Register)); - return MCDisassembler::Success; -} +static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); +static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder); #include "HexagonGenDisassemblerTables.inc" -static MCDisassembler *createHexagonDisassembler(Target const &T, - MCSubtargetInfo const &STI, +static MCDisassembler *createHexagonDisassembler(const Target &T, + const MCSubtargetInfo &STI, MCContext &Ctx) { - return new HexagonDisassembler(STI, Ctx); + return new HexagonDisassembler(STI, Ctx, T.createMCInstrInfo()); } extern "C" void LLVMInitializeHexagonDisassembler() { @@ -235,8 +162,7 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Size = 0; *CurrentBundle = &MI; - MI.setOpcode(Hexagon::BUNDLE); - 
MI.addOperand(MCOperand::createImm(0)); + MI = HexagonMCInstrInfo::createBundle(); while (Result == Success && Complete == false) { if (Bytes.size() < HEXAGON_INSTR_SIZE) return MCDisassembler::Fail; @@ -246,7 +172,21 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Size += HEXAGON_INSTR_SIZE; Bytes = Bytes.slice(HEXAGON_INSTR_SIZE); } - return Result; + if(Result == MCDisassembler::Fail) + return Result; + HexagonMCChecker Checker (*MCII, STI, MI, MI, *getContext().getRegisterInfo()); + if(!Checker.check()) + return MCDisassembler::Fail; + return MCDisassembler::Success; +} + +namespace { +HexagonDisassembler const &disassembler(void const *Decoder) { + return *static_cast<HexagonDisassembler const *>(Decoder); +} +MCContext &contextFromDecoder(void const *Decoder) { + return disassembler(Decoder).getContext(); +} } DecodeStatus HexagonDisassembler::getSingleInstruction( @@ -255,8 +195,7 @@ DecodeStatus HexagonDisassembler::getSingleInstruction( assert(Bytes.size() >= HEXAGON_INSTR_SIZE); uint32_t Instruction = - llvm::support::endian::read<uint32_t, llvm::support::little, - llvm::support::unaligned>(Bytes.data()); + (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0); auto BundleSize = HexagonMCInstrInfo::bundleSize(MCB); if ((Instruction & HexagonII::INST_PARSE_MASK) == @@ -360,8 +299,8 @@ DecodeStatus HexagonDisassembler::getSingleInstruction( MILow->setOpcode(opLow); MCInst *MIHigh = new (getContext()) MCInst; MIHigh->setOpcode(opHigh); - AddSubinstOperands(MILow, opLow, instLow); - AddSubinstOperands(MIHigh, opHigh, instHigh); + addSubinstOperands(MILow, opLow, instLow); + addSubinstOperands(MIHigh, opHigh, instHigh); // see ConvertToSubInst() in // lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -378,102 +317,774 @@ DecodeStatus HexagonDisassembler::getSingleInstruction( // Calling the auto-generated decoder function. Result = decodeInstruction(DecoderTable32, MI, Instruction, Address, this, STI); + + // If a, "standard" insn isn't found check special cases. + if (MCDisassembler::Success != Result || + MI.getOpcode() == Hexagon::A4_ext) { + Result = decodeImmext(MI, Instruction, this); + if (MCDisassembler::Success != Result) { + Result = decodeSpecial(MI, Instruction); + } + } else { + // If the instruction is a compound instruction, register values will + // follow the duplex model, so the register values in the MCInst are + // incorrect. If the instruction is a compound, loop through the + // operands and change registers appropriately. 
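+      // Worked example (editorial illustration): compound/duplex register
+      // fields are 4 bits wide and can only name r0-r7 and r16-r23. The
+      // generic decoder turns the raw field value N into RN, so a raw value
+      // of 9 first decodes as r9; getRegFromSubinstEncoding(9) then remaps
+      // it to the register actually meant, r17 (raw values 8-15 map to
+      // r16-r23).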
+ if (llvm::HexagonMCInstrInfo::getType(*MCII, MI) == + HexagonII::TypeCOMPOUND) { + for (MCInst::iterator i = MI.begin(), last = MI.end(); i < last; ++i) { + if (i->isReg()) { + unsigned reg = i->getReg() - Hexagon::R0; + i->setReg(getRegFromSubinstEncoding(reg)); + } + } + } + } + } + + if (HexagonMCInstrInfo::isNewValue(*MCII, MI)) { + unsigned OpIndex = HexagonMCInstrInfo::getNewValueOp(*MCII, MI); + MCOperand &MCO = MI.getOperand(OpIndex); + assert(MCO.isReg() && "New value consumers must be registers"); + unsigned Register = + getContext().getRegisterInfo()->getEncodingValue(MCO.getReg()); + if ((Register & 0x6) == 0) + // HexagonPRM 10.11 Bit 1-2 == 0 is reserved + return MCDisassembler::Fail; + unsigned Lookback = (Register & 0x6) >> 1; + unsigned Offset = 1; + bool Vector = HexagonMCInstrInfo::isVector(*MCII, MI); + auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle); + auto i = Instructions.end() - 1; + for (auto n = Instructions.begin() - 1;; --i, ++Offset) { + if (i == n) + // Couldn't find producer + return MCDisassembler::Fail; + if (Vector && !HexagonMCInstrInfo::isVector(*MCII, *i->getInst())) + // Skip scalars when calculating distances for vectors + ++Lookback; + if (HexagonMCInstrInfo::isImmext(*i->getInst())) + ++Lookback; + if (Offset == Lookback) + break; + } + auto const &Inst = *i->getInst(); + bool SubregBit = (Register & 0x1) != 0; + if (SubregBit && HexagonMCInstrInfo::hasNewValue2(*MCII, Inst)) { + // If subreg bit is set we're selecting the second produced newvalue + unsigned Producer = + HexagonMCInstrInfo::getNewValueOperand2(*MCII, Inst).getReg(); + assert(Producer != Hexagon::NoRegister); + MCO.setReg(Producer); + } else if (HexagonMCInstrInfo::hasNewValue(*MCII, Inst)) { + unsigned Producer = + HexagonMCInstrInfo::getNewValueOperand(*MCII, Inst).getReg(); + if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15) + Producer = ((Producer - Hexagon::W0) << 1) + SubregBit + Hexagon::V0; + else if (SubregBit) + // Subreg bit should not be set for non-doublevector newvalue producers + return MCDisassembler::Fail; + assert(Producer != Hexagon::NoRegister); + MCO.setReg(Producer); + } else + return MCDisassembler::Fail; } + adjustExtendedInstructions(MI, MCB); + MCInst const *Extender = + HexagonMCInstrInfo::extenderForIndex(MCB, + HexagonMCInstrInfo::bundleSize(MCB)); + if(Extender != nullptr) { + MCInst const & Inst = HexagonMCInstrInfo::isDuplex(*MCII, MI) ? + *MI.getOperand(1).getInst() : MI; + if (!HexagonMCInstrInfo::isExtendable(*MCII, Inst) && + !HexagonMCInstrInfo::isExtended(*MCII, Inst)) + return MCDisassembler::Fail; + } return Result; } +void HexagonDisassembler::adjustExtendedInstructions(MCInst &MCI, + MCInst const &MCB) const { + if (!HexagonMCInstrInfo::hasExtenderForIndex( + MCB, HexagonMCInstrInfo::bundleSize(MCB))) { + unsigned opcode; + // This code is used by the disassembler to disambiguate between GP + // relative and absolute addressing instructions since they both have + // same encoding bits. However, an absolute addressing instruction must + // follow an immediate extender. Disassembler alwaus select absolute + // addressing instructions first and uses this code to change them into + // GP relative instruction in the absence of the corresponding immediate + // extender. 
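+    // Worked example (editorial illustration, assembly syntax approximate):
+    // "memw(##sym) = r0" (S2_storeriabs) and "memw(gp+#sym) = r0"
+    // (S2_storerigp) share a single encoding; only a preceding A4_ext
+    // immediate extender makes the absolute reading legal, so in its absence
+    // the opcode is rewritten to its GP-relative twin in the switch below.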
+ switch (MCI.getOpcode()) { + case Hexagon::S2_storerbabs: + opcode = Hexagon::S2_storerbgp; + break; + case Hexagon::S2_storerhabs: + opcode = Hexagon::S2_storerhgp; + break; + case Hexagon::S2_storerfabs: + opcode = Hexagon::S2_storerfgp; + break; + case Hexagon::S2_storeriabs: + opcode = Hexagon::S2_storerigp; + break; + case Hexagon::S2_storerbnewabs: + opcode = Hexagon::S2_storerbnewgp; + break; + case Hexagon::S2_storerhnewabs: + opcode = Hexagon::S2_storerhnewgp; + break; + case Hexagon::S2_storerinewabs: + opcode = Hexagon::S2_storerinewgp; + break; + case Hexagon::S2_storerdabs: + opcode = Hexagon::S2_storerdgp; + break; + case Hexagon::L4_loadrb_abs: + opcode = Hexagon::L2_loadrbgp; + break; + case Hexagon::L4_loadrub_abs: + opcode = Hexagon::L2_loadrubgp; + break; + case Hexagon::L4_loadrh_abs: + opcode = Hexagon::L2_loadrhgp; + break; + case Hexagon::L4_loadruh_abs: + opcode = Hexagon::L2_loadruhgp; + break; + case Hexagon::L4_loadri_abs: + opcode = Hexagon::L2_loadrigp; + break; + case Hexagon::L4_loadrd_abs: + opcode = Hexagon::L2_loadrdgp; + break; + default: + opcode = MCI.getOpcode(); + } + MCI.setOpcode(opcode); + } +} + +namespace llvm { +extern const MCInstrDesc HexagonInsts[]; +} + +static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo, + ArrayRef<MCPhysReg> Table) { + if (RegNo < Table.size()) { + Inst.addOperand(MCOperand::createReg(Table[RegNo])); + return MCDisassembler::Success; + } + + return MCDisassembler::Fail; +} + +static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + return DecodeIntRegsRegisterClass(Inst, RegNo, Address, Decoder); +} + +static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + static const MCPhysReg IntRegDecoderTable[] = { + Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4, + Hexagon::R5, Hexagon::R6, Hexagon::R7, Hexagon::R8, Hexagon::R9, + Hexagon::R10, Hexagon::R11, Hexagon::R12, Hexagon::R13, Hexagon::R14, + Hexagon::R15, Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19, + Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23, Hexagon::R24, + Hexagon::R25, Hexagon::R26, Hexagon::R27, Hexagon::R28, Hexagon::R29, + Hexagon::R30, Hexagon::R31}; + + return DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable); +} + +static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg VecRegDecoderTable[] = { + Hexagon::V0, Hexagon::V1, Hexagon::V2, Hexagon::V3, Hexagon::V4, + Hexagon::V5, Hexagon::V6, Hexagon::V7, Hexagon::V8, Hexagon::V9, + Hexagon::V10, Hexagon::V11, Hexagon::V12, Hexagon::V13, Hexagon::V14, + Hexagon::V15, Hexagon::V16, Hexagon::V17, Hexagon::V18, Hexagon::V19, + Hexagon::V20, Hexagon::V21, Hexagon::V22, Hexagon::V23, Hexagon::V24, + Hexagon::V25, Hexagon::V26, Hexagon::V27, Hexagon::V28, Hexagon::V29, + Hexagon::V30, Hexagon::V31}; + + return DecodeRegisterClass(Inst, RegNo, VecRegDecoderTable); +} + +static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg DoubleRegDecoderTable[] = { + Hexagon::D0, Hexagon::D1, Hexagon::D2, Hexagon::D3, + Hexagon::D4, Hexagon::D5, Hexagon::D6, Hexagon::D7, + Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11, + Hexagon::D12, Hexagon::D13, Hexagon::D14, Hexagon::D15}; + + return DecodeRegisterClass(Inst, RegNo >> 1, 
DoubleRegDecoderTable); +} + +static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg VecDblRegDecoderTable[] = { + Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3, + Hexagon::W4, Hexagon::W5, Hexagon::W6, Hexagon::W7, + Hexagon::W8, Hexagon::W9, Hexagon::W10, Hexagon::W11, + Hexagon::W12, Hexagon::W13, Hexagon::W14, Hexagon::W15}; + + return (DecodeRegisterClass(Inst, RegNo >> 1, VecDblRegDecoderTable)); +} + +static DecodeStatus DecodePredRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg PredRegDecoderTable[] = {Hexagon::P0, Hexagon::P1, + Hexagon::P2, Hexagon::P3}; + + return DecodeRegisterClass(Inst, RegNo, PredRegDecoderTable); +} + +static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg VecPredRegDecoderTable[] = {Hexagon::Q0, Hexagon::Q1, + Hexagon::Q2, Hexagon::Q3}; + + return DecodeRegisterClass(Inst, RegNo, VecPredRegDecoderTable); +} + +static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg CtrlRegDecoderTable[] = { + Hexagon::SA0, Hexagon::LC0, Hexagon::SA1, Hexagon::LC1, + Hexagon::P3_0, Hexagon::C5, Hexagon::C6, Hexagon::C7, + Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP, + Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPC + }; + + if (RegNo >= array_lengthof(CtrlRegDecoderTable)) + return MCDisassembler::Fail; + + if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister) + return MCDisassembler::Fail; + + unsigned Register = CtrlRegDecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + static const MCPhysReg CtrlReg64DecoderTable[] = { + Hexagon::C1_0, Hexagon::NoRegister, + Hexagon::C3_2, Hexagon::NoRegister, + Hexagon::C7_6, Hexagon::NoRegister, + Hexagon::C9_8, Hexagon::NoRegister, + Hexagon::C11_10, Hexagon::NoRegister, + Hexagon::CS, Hexagon::NoRegister, + Hexagon::UPC, Hexagon::NoRegister + }; + + if (RegNo >= array_lengthof(CtrlReg64DecoderTable)) + return MCDisassembler::Fail; + + if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister) + return MCDisassembler::Fail; + + unsigned Register = CtrlReg64DecoderTable[RegNo]; + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t /*Address*/, + const void *Decoder) { + unsigned Register = 0; + switch (RegNo) { + case 0: + Register = Hexagon::M0; + break; + case 1: + Register = Hexagon::M1; + break; + default: + return MCDisassembler::Fail; + } + Inst.addOperand(MCOperand::createReg(Register)); + return MCDisassembler::Success; +} + +namespace { +uint32_t fullValue(MCInstrInfo const &MCII, + MCInst &MCB, + MCInst &MI, + int64_t Value) { + MCInst const *Extender = HexagonMCInstrInfo::extenderForIndex( + MCB, HexagonMCInstrInfo::bundleSize(MCB)); + if(!Extender || MI.size() != HexagonMCInstrInfo::getExtendableOp(MCII, MI)) + return Value; + unsigned Alignment = HexagonMCInstrInfo::getExtentAlignment(MCII, MI); + uint32_t Lower6 = static_cast<uint32_t>(Value >> Alignment) & 0x3f; + int64_t Bits; + bool Success = 
Extender->getOperand(0).getExpr()->evaluateAsAbsolute(Bits); + assert(Success);(void)Success; + uint32_t Upper26 = static_cast<uint32_t>(Bits); + uint32_t Operand = Upper26 | Lower6; + return Operand; +} +template <size_t T> +void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) { + HexagonDisassembler const &Disassembler = disassembler(Decoder); + int64_t FullValue = fullValue(*Disassembler.MCII, + **Disassembler.CurrentBundle, + MI, SignExtend64<T>(tmp)); + int64_t Extended = SignExtend64<32>(FullValue); + HexagonMCInstrInfo::addConstant(MI, Extended, + Disassembler.getContext()); +} +} + +static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, + const void *Decoder) { + HexagonDisassembler const &Disassembler = disassembler(Decoder); + int64_t FullValue = fullValue(*Disassembler.MCII, + **Disassembler.CurrentBundle, + MI, tmp); + assert(FullValue >= 0 && "Negative in unsigned decoder"); + HexagonMCInstrInfo::addConstant(MI, FullValue, Disassembler.getContext()); + return MCDisassembler::Success; +} + static DecodeStatus s16ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<16>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<16>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s12ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<12>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<12>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<11>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<11>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<12>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + HexagonMCInstrInfo::addConstant(MI, SignExtend64<12>(tmp), contextFromDecoder(Decoder)); return MCDisassembler::Success; } static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<13>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<13>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<14>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<14>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s10ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<10>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<10>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s8ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<8>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<8>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<6>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<6>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t 
/*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<4>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<4>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<5>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<5>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<6>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<6>(MI, tmp, Decoder); return MCDisassembler::Success; } static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/, const void *Decoder) { - uint64_t imm = SignExtend64<7>(tmp); - MI.addOperand(MCOperand::createImm(imm)); + signedDecoder<7>(MI, tmp, Decoder); return MCDisassembler::Success; } +static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + signedDecoder<10>(MI, tmp, Decoder); + return MCDisassembler::Success; +} + +static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp, + uint64_t /*Address*/, const void *Decoder) { + signedDecoder<19>(MI, tmp, Decoder); + return MCDisassembler::Success; +} + +// custom decoder for various jump/call immediates +static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address, + const void *Decoder) { + HexagonDisassembler const &Disassembler = disassembler(Decoder); + unsigned Bits = HexagonMCInstrInfo::getExtentBits(*Disassembler.MCII, MI); + // r13_2 is not extendable, so if there are no extent bits, it's r13_2 + if (Bits == 0) + Bits = 15; + uint32_t FullValue = fullValue(*Disassembler.MCII, + **Disassembler.CurrentBundle, + MI, SignExtend64(tmp, Bits)); + int64_t Extended = SignExtend64<32>(FullValue) + Address; + if (!Disassembler.tryAddingSymbolicOperand(MI, Extended, Address, true, + 0, 4)) + HexagonMCInstrInfo::addConstant(MI, Extended, Disassembler.getContext()); + return MCDisassembler::Success; +} + +// Addressing mode dependent load store opcode map. +// - If an insn is preceded by an extender the address is absolute. +// - memw(##symbol) = r0 +// - If an insn is not preceded by an extender the address is GP relative. +// - memw(gp + #symbol) = r0 +// Please note that the instructions must be ordered in the descending order +// of their opcode. 
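+// Editorial sketch (hypothetical helper, not part of this patch): a row in
+// the maps below matches when every bit of its encoding word is set in the
+// instruction, so listing rows in descending opcode order lets the most
+// specific pattern win first -- e.g. 0x48a01000 (S2_storerinewabs) must be
+// tried before 0x48a00000 (S2_storerbnewabs), whose bits it is a superset of.
+//
+//   static unsigned matchOpcodeMap(uint32_t insn,
+//                                  const unsigned (*Map)[2], size_t Rows) {
+//     for (size_t i = 0; i != Rows; ++i)
+//       if ((insn & Map[i][1]) == Map[i][1])
+//         return Map[i][0]; // first, i.e. most specific, match
+//     return 0;             // no special-case opcode
+//   }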
+// HexagonII::INST_ICLASS_ST +static const unsigned int StoreConditionalOpcodeData[][2] = { + {S4_pstorerdfnew_abs, 0xafc02084}, + {S4_pstorerdtnew_abs, 0xafc02080}, + {S4_pstorerdf_abs, 0xafc00084}, + {S4_pstorerdt_abs, 0xafc00080}, + {S4_pstorerinewfnew_abs, 0xafa03084}, + {S4_pstorerinewtnew_abs, 0xafa03080}, + {S4_pstorerhnewfnew_abs, 0xafa02884}, + {S4_pstorerhnewtnew_abs, 0xafa02880}, + {S4_pstorerbnewfnew_abs, 0xafa02084}, + {S4_pstorerbnewtnew_abs, 0xafa02080}, + {S4_pstorerinewf_abs, 0xafa01084}, + {S4_pstorerinewt_abs, 0xafa01080}, + {S4_pstorerhnewf_abs, 0xafa00884}, + {S4_pstorerhnewt_abs, 0xafa00880}, + {S4_pstorerbnewf_abs, 0xafa00084}, + {S4_pstorerbnewt_abs, 0xafa00080}, + {S4_pstorerifnew_abs, 0xaf802084}, + {S4_pstoreritnew_abs, 0xaf802080}, + {S4_pstorerif_abs, 0xaf800084}, + {S4_pstorerit_abs, 0xaf800080}, + {S4_pstorerhfnew_abs, 0xaf402084}, + {S4_pstorerhtnew_abs, 0xaf402080}, + {S4_pstorerhf_abs, 0xaf400084}, + {S4_pstorerht_abs, 0xaf400080}, + {S4_pstorerbfnew_abs, 0xaf002084}, + {S4_pstorerbtnew_abs, 0xaf002080}, + {S4_pstorerbf_abs, 0xaf000084}, + {S4_pstorerbt_abs, 0xaf000080}}; +// HexagonII::INST_ICLASS_LD + +// HexagonII::INST_ICLASS_LD_ST_2 +static unsigned int LoadStoreOpcodeData[][2] = {{L4_loadrd_abs, 0x49c00000}, + {L4_loadri_abs, 0x49800000}, + {L4_loadruh_abs, 0x49600000}, + {L4_loadrh_abs, 0x49400000}, + {L4_loadrub_abs, 0x49200000}, + {L4_loadrb_abs, 0x49000000}, + {S2_storerdabs, 0x48c00000}, + {S2_storerinewabs, 0x48a01000}, + {S2_storerhnewabs, 0x48a00800}, + {S2_storerbnewabs, 0x48a00000}, + {S2_storeriabs, 0x48800000}, + {S2_storerfabs, 0x48600000}, + {S2_storerhabs, 0x48400000}, + {S2_storerbabs, 0x48000000}}; +static const size_t NumCondS = array_lengthof(StoreConditionalOpcodeData); +static const size_t NumLS = array_lengthof(LoadStoreOpcodeData); + +static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) { + + unsigned MachineOpcode = 0; + unsigned LLVMOpcode = 0; + + if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_ST) { + for (size_t i = 0; i < NumCondS; ++i) { + if ((insn & StoreConditionalOpcodeData[i][1]) == + StoreConditionalOpcodeData[i][1]) { + MachineOpcode = StoreConditionalOpcodeData[i][1]; + LLVMOpcode = StoreConditionalOpcodeData[i][0]; + break; + } + } + } + if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_LD_ST_2) { + for (size_t i = 0; i < NumLS; ++i) { + if ((insn & LoadStoreOpcodeData[i][1]) == LoadStoreOpcodeData[i][1]) { + MachineOpcode = LoadStoreOpcodeData[i][1]; + LLVMOpcode = LoadStoreOpcodeData[i][0]; + break; + } + } + } + + if (MachineOpcode) { + unsigned Value = 0; + unsigned shift = 0; + MI.setOpcode(LLVMOpcode); + // Remove the parse bits from the insn. 
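+    // Editorial note (assumption about the encoding): bits 15:14 of every
+    // 32-bit Hexagon word form the packet-parse field covered by
+    // INST_PARSE_MASK; they mark packet boundaries rather than operand data,
+    // which is why they are cleared before the operand fields are extracted
+    // below.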
+ insn &= ~HexagonII::INST_PARSE_MASK; + + switch (LLVMOpcode) { + default: + return MCDisassembler::Fail; + break; + + case Hexagon::S4_pstorerdf_abs: + case Hexagon::S4_pstorerdt_abs: + case Hexagon::S4_pstorerdfnew_abs: + case Hexagon::S4_pstorerdtnew_abs: { + // op: Pv + Value = insn & UINT64_C(3); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 + Value = (insn >> 12) & UINT64_C(48); + Value |= (insn >> 3) & UINT64_C(15); + MI.addOperand(MCOperand::createImm(Value)); + // op: Rtt + Value = (insn >> 8) & UINT64_C(31); + DecodeDoubleRegsRegisterClass(MI, Value, 0, 0); + break; + } + + case Hexagon::S4_pstorerbnewf_abs: + case Hexagon::S4_pstorerbnewt_abs: + case Hexagon::S4_pstorerbnewfnew_abs: + case Hexagon::S4_pstorerbnewtnew_abs: + case Hexagon::S4_pstorerhnewf_abs: + case Hexagon::S4_pstorerhnewt_abs: + case Hexagon::S4_pstorerhnewfnew_abs: + case Hexagon::S4_pstorerhnewtnew_abs: + case Hexagon::S4_pstorerinewf_abs: + case Hexagon::S4_pstorerinewt_abs: + case Hexagon::S4_pstorerinewfnew_abs: + case Hexagon::S4_pstorerinewtnew_abs: { + // op: Pv + Value = insn & UINT64_C(3); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 + Value = (insn >> 12) & UINT64_C(48); + Value |= (insn >> 3) & UINT64_C(15); + MI.addOperand(MCOperand::createImm(Value)); + // op: Nt + Value = (insn >> 8) & UINT64_C(7); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + break; + } + + case Hexagon::S4_pstorerbf_abs: + case Hexagon::S4_pstorerbt_abs: + case Hexagon::S4_pstorerbfnew_abs: + case Hexagon::S4_pstorerbtnew_abs: + case Hexagon::S4_pstorerhf_abs: + case Hexagon::S4_pstorerht_abs: + case Hexagon::S4_pstorerhfnew_abs: + case Hexagon::S4_pstorerhtnew_abs: + case Hexagon::S4_pstorerif_abs: + case Hexagon::S4_pstorerit_abs: + case Hexagon::S4_pstorerifnew_abs: + case Hexagon::S4_pstoreritnew_abs: { + // op: Pv + Value = insn & UINT64_C(3); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 + Value = (insn >> 12) & UINT64_C(48); + Value |= (insn >> 3) & UINT64_C(15); + MI.addOperand(MCOperand::createImm(Value)); + // op: Rt + Value = (insn >> 8) & UINT64_C(31); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + break; + } + + case Hexagon::L4_ploadrdf_abs: + case Hexagon::L4_ploadrdt_abs: + case Hexagon::L4_ploadrdfnew_abs: + case Hexagon::L4_ploadrdtnew_abs: { + // op: Rdd + Value = insn & UINT64_C(31); + DecodeDoubleRegsRegisterClass(MI, Value, 0, 0); + // op: Pt + Value = ((insn >> 9) & UINT64_C(3)); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 + Value = ((insn >> 15) & UINT64_C(62)); + Value |= ((insn >> 8) & UINT64_C(1)); + MI.addOperand(MCOperand::createImm(Value)); + break; + } + + case Hexagon::L4_ploadrbf_abs: + case Hexagon::L4_ploadrbt_abs: + case Hexagon::L4_ploadrbfnew_abs: + case Hexagon::L4_ploadrbtnew_abs: + case Hexagon::L4_ploadrhf_abs: + case Hexagon::L4_ploadrht_abs: + case Hexagon::L4_ploadrhfnew_abs: + case Hexagon::L4_ploadrhtnew_abs: + case Hexagon::L4_ploadrubf_abs: + case Hexagon::L4_ploadrubt_abs: + case Hexagon::L4_ploadrubfnew_abs: + case Hexagon::L4_ploadrubtnew_abs: + case Hexagon::L4_ploadruhf_abs: + case Hexagon::L4_ploadruht_abs: + case Hexagon::L4_ploadruhfnew_abs: + case Hexagon::L4_ploadruhtnew_abs: + case Hexagon::L4_ploadrif_abs: + case Hexagon::L4_ploadrit_abs: + case Hexagon::L4_ploadrifnew_abs: + case Hexagon::L4_ploadritnew_abs: + // op: Rd + Value = insn & UINT64_C(31); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + // op: Pt + Value = (insn >> 9) & UINT64_C(3); + DecodePredRegsRegisterClass(MI, Value, 0, 0); + // op: u6 
+ Value = (insn >> 15) & UINT64_C(62); + Value |= (insn >> 8) & UINT64_C(1); + MI.addOperand(MCOperand::createImm(Value)); + break; + + // op: g16_2 + case (Hexagon::L4_loadri_abs): + ++shift; + // op: g16_1 + case Hexagon::L4_loadrh_abs: + case Hexagon::L4_loadruh_abs: + ++shift; + // op: g16_0 + case Hexagon::L4_loadrb_abs: + case Hexagon::L4_loadrub_abs: { + // op: Rd + Value |= insn & UINT64_C(31); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(511); + MI.addOperand(MCOperand::createImm(Value << shift)); + break; + } + + case Hexagon::L4_loadrd_abs: { + Value = insn & UINT64_C(31); + DecodeDoubleRegsRegisterClass(MI, Value, 0, 0); + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(511); + MI.addOperand(MCOperand::createImm(Value << 3)); + break; + } + + case Hexagon::S2_storerdabs: { + // op: g16_3 + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(256); + Value |= insn & UINT64_C(255); + MI.addOperand(MCOperand::createImm(Value << 3)); + // op: Rtt + Value = (insn >> 8) & UINT64_C(31); + DecodeDoubleRegsRegisterClass(MI, Value, 0, 0); + break; + } + + // op: g16_2 + case Hexagon::S2_storerinewabs: + ++shift; + // op: g16_1 + case Hexagon::S2_storerhnewabs: + ++shift; + // op: g16_0 + case Hexagon::S2_storerbnewabs: { + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(256); + Value |= insn & UINT64_C(255); + MI.addOperand(MCOperand::createImm(Value << shift)); + // op: Nt + Value = (insn >> 8) & UINT64_C(7); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + break; + } + + // op: g16_2 + case Hexagon::S2_storeriabs: + ++shift; + // op: g16_1 + case Hexagon::S2_storerhabs: + case Hexagon::S2_storerfabs: + ++shift; + // op: g16_0 + case Hexagon::S2_storerbabs: { + Value = (insn >> 11) & UINT64_C(49152); + Value |= (insn >> 7) & UINT64_C(15872); + Value |= (insn >> 5) & UINT64_C(256); + Value |= insn & UINT64_C(255); + MI.addOperand(MCOperand::createImm(Value << shift)); + // op: Rt + Value = (insn >> 8) & UINT64_C(31); + DecodeIntRegsRegisterClass(MI, Value, 0, 0); + break; + } + } + return MCDisassembler::Success; + } + return MCDisassembler::Fail; +} + +static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn, + void const *Decoder) { + + // Instruction Class for a constant a extender: bits 31:28 = 0x0000 + if ((~insn & 0xf0000000) == 0xf0000000) { + unsigned Value; + // 27:16 High 12 bits of 26-bit extender. + Value = (insn & 0x0fff0000) << 4; + // 13:0 Low 14 bits of 26-bit extender. 
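+    // Worked example (editorial illustration): a constant extender carries
+    // the upper 26 bits of a 32-bit value, reassembled here already placed
+    // at bit positions 31:6. For K = 0x12345678 the A4_ext operand becomes
+    // K & 0xffffffc0 = 0x12345640, and the extended instruction's own
+    // immediate field supplies K & 0x3f = 0x38 (ignoring any extent
+    // alignment shift); fullValue() above ORs the two halves back together.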
+ Value |= ((insn & 0x3fff) << 6); + MI.setOpcode(Hexagon::A4_ext); + HexagonMCInstrInfo::addConstant(MI, Value, contextFromDecoder(Decoder)); + return MCDisassembler::Success; + } + return MCDisassembler::Fail; +} + // These values are from HexagonGenMCCodeEmitter.inc and HexagonIsetDx.td enum subInstBinaryValues { V4_SA1_addi_BITS = 0x0000, @@ -731,6 +1342,8 @@ static unsigned getRegFromSubinstEncoding(unsigned encoded_reg) { return Hexagon::R0 + encoded_reg; else if (encoded_reg < 16) return Hexagon::R0 + encoded_reg + 8; + + // patently false value return Hexagon::NoRegister; } @@ -739,10 +1352,13 @@ static unsigned getDRegFromSubinstEncoding(unsigned encoded_dreg) { return Hexagon::D0 + encoded_dreg; else if (encoded_dreg < 8) return Hexagon::D0 + encoded_dreg + 4; + + // patently false value return Hexagon::NoRegister; } -static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { +void HexagonDisassembler::addSubinstOperands(MCInst *MI, unsigned opcode, + unsigned inst) const { int64_t operand; MCOperand Op; switch (opcode) { @@ -762,8 +1378,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { case Hexagon::V4_SS2_allocframe: // u 8-4{5_3} operand = ((inst & 0x1f0) >> 4) << 3; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL1_loadri_io: // Rd 3-0, Rs 7-4, u 11-8{4_2} @@ -774,8 +1389,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0xf00) >> 6; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL1_loadrub_io: // Rd 3-0, Rs 7-4, u 11-8 @@ -786,8 +1400,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0xf00) >> 8; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL2_loadrb_io: // Rd 3-0, Rs 7-4, u 10-8 @@ -798,8 +1411,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0x700) >> 8; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL2_loadrh_io: case Hexagon::V4_SL2_loadruh_io: @@ -811,8 +1423,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x700) >> 8) << 1; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL2_loadrd_sp: // Rdd 2-0, u 7-3{5_3} @@ -820,8 +1431,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x0f8) >> 3) << 3; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SL2_loadri_sp: // Rd 3-0, u 8-4{5_2} @@ -829,8 +1439,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x1f0) >> 4) << 2; - Op = MCOperand::createImm(operand); - 
MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_addi: // Rx 3-0 (x2), s7 10-4 @@ -839,8 +1448,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { MI->addOperand(Op); MI->addOperand(Op); operand = SignExtend64<7>((inst & 0x7f0) >> 4); - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_addrx: // Rx 3-0 (x2), Rs 7-4 @@ -873,8 +1481,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x3f0) >> 4) << 2; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_seti: // Rd 3-0, u 9-4 @@ -882,8 +1489,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0x3f0) >> 4; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_clrf: case Hexagon::V4_SA1_clrfnew: @@ -901,8 +1507,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = inst & 0x3; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_combine0i: case Hexagon::V4_SA1_combine1i: @@ -913,8 +1518,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0x060) >> 5; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SA1_combinerz: case Hexagon::V4_SA1_combinezr: @@ -932,8 +1536,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0xf00) >> 8; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getRegFromSubinstEncoding(inst & 0xf); Op = MCOperand::createReg(operand); MI->addOperand(Op); @@ -944,8 +1547,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0xf00) >> 8) << 2; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getRegFromSubinstEncoding(inst & 0xf); Op = MCOperand::createReg(operand); MI->addOperand(Op); @@ -957,8 +1559,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = inst & 0xf; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SS2_storewi0: case Hexagon::V4_SS2_storewi1: @@ -967,25 +1568,23 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = (inst & 0xf) << 2; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); break; case Hexagon::V4_SS2_stored_sp: // s 
8-3{6_3}, Rtt 2-0 operand = SignExtend64<9>(((inst & 0x1f8) >> 3) << 3); - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getDRegFromSubinstEncoding(inst & 0x7); Op = MCOperand::createReg(operand); MI->addOperand(Op); + break; case Hexagon::V4_SS2_storeh_io: // Rs 7-4, u 10-8{3_1}, Rt 3-0 operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4); Op = MCOperand::createReg(operand); MI->addOperand(Op); operand = ((inst & 0x700) >> 8) << 1; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getRegFromSubinstEncoding(inst & 0xf); Op = MCOperand::createReg(operand); MI->addOperand(Op); @@ -993,8 +1592,7 @@ static void AddSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) { case Hexagon::V4_SS2_storew_sp: // u 8-4{5_2}, Rd 3-0 operand = ((inst & 0x1f0) >> 4) << 2; - Op = MCOperand::createImm(operand); - MI->addOperand(Op); + HexagonMCInstrInfo::addConstant(*MI, operand, getContext()); operand = getRegFromSubinstEncoding(inst & 0xf); Op = MCOperand::createReg(operand); MI->addOperand(Op); diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.h b/contrib/llvm/lib/Target/Hexagon/Hexagon.h index d360be2..ed7d957 100644 --- a/contrib/llvm/lib/Target/Hexagon/Hexagon.h +++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.h @@ -47,15 +47,8 @@ #include "llvm/Target/TargetMachine.h" namespace llvm { - class MachineInstr; - class MCInst; - class MCInstrInfo; - class HexagonAsmPrinter; class HexagonTargetMachine; - void HexagonLowerToMC(const MachineInstr *MI, MCInst &MCI, - HexagonAsmPrinter &AP); - /// \brief Creates a Hexagon-specific Target Transformation Info pass. ImmutablePass *createHexagonTargetTransformInfoPass(const HexagonTargetMachine *TM); } // end namespace llvm; diff --git a/contrib/llvm/lib/Target/Hexagon/Hexagon.td b/contrib/llvm/lib/Target/Hexagon/Hexagon.td index 53a687c..1189cfd 100644 --- a/contrib/llvm/lib/Target/Hexagon/Hexagon.td +++ b/contrib/llvm/lib/Target/Hexagon/Hexagon.td @@ -24,14 +24,32 @@ include "llvm/Target/Target.td" // Hexagon Architectures def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "V4", "Hexagon V4">; def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "V5", "Hexagon V5">; +def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "V55", "Hexagon V55">; +def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "V60", "Hexagon V60">; + +// Hexagon ISA Extensions +def ExtensionHVX: SubtargetFeature<"hvx", "UseHVXOps", + "true", "Hexagon HVX instructions">; +def ExtensionHVXDbl: SubtargetFeature<"hvx-double", "UseHVXDblOps", + "true", "Hexagon HVX Double instructions">; //===----------------------------------------------------------------------===// // Hexagon Instruction Predicate Definitions. 
//===----------------------------------------------------------------------===// -def HasV5T : Predicate<"HST->hasV5TOps()">; -def NoV5T : Predicate<"!HST->hasV5TOps()">; -def UseMEMOP : Predicate<"HST->useMemOps()">; -def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">; +def HasV5T : Predicate<"HST->hasV5TOps()">; +def NoV5T : Predicate<"!HST->hasV5TOps()">; +def HasV55T : Predicate<"HST->hasV55TOps()">, + AssemblerPredicate<"ArchV55">; +def HasV60T : Predicate<"HST->hasV60TOps()">, + AssemblerPredicate<"ArchV60">; +def UseMEMOP : Predicate<"HST->useMemOps()">; +def IEEERndNearV5T : Predicate<"HST->modeIEEERndNear()">; +def UseHVXDbl : Predicate<"HST->useHVXDblOps()">, + AssemblerPredicate<"ExtensionHVXDbl">; +def UseHVXSgl : Predicate<"HST->useHVXSglOps()">; + +def UseHVX : Predicate<"HST->useHVXSglOps() ||HST->useHVXDblOps()">, + AssemblerPredicate<"ExtensionHVX">; //===----------------------------------------------------------------------===// // Classes used for relation maps. @@ -53,6 +71,7 @@ class NewValueRel: PredNewRel; // NewValueRel - Filter class used to relate load/store instructions having // different addressing modes with each other. class AddrModeRel: NewValueRel; +class IntrinsicsRel; //===----------------------------------------------------------------------===// // Generate mapping table to relate non-predicate instructions with their @@ -62,7 +81,7 @@ class AddrModeRel: NewValueRel; def getPredOpcode : InstrMapping { let FilterClass = "PredRel"; // Instructions with the same BaseOpcode and isNVStore values form a row. - let RowFields = ["BaseOpcode", "isNVStore", "PNewValue"]; + let RowFields = ["BaseOpcode", "isNVStore", "PNewValue", "isNT"]; // Instructions with the same predicate sense form a column. let ColFields = ["PredSense"]; // The key column is the unpredicated instructions. 
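// Note on the InstrMapping definitions in this file: TableGen lowers each one
// into a flat lookup table that relates opcode variants by row (RowFields) and
// column (ColFields). A minimal standalone sketch of that row/column idea, with
// illustrative names and opcode numbers only (this is not the generated LLVM API):
//
//   #include <map>
//   #include <string>
//
//   struct PredRow {
//     int UnpredOpc;    // KeyCol: the unpredicated form
//     int PredTrueOpc;  // column PredSense == "true"
//     int PredFalseOpc; // column PredSense == "false"
//   };
//
//   // RowFields such as BaseOpcode/isNVStore/PNewValue/isNT collapse every
//   // variant of one instruction into a single row.
//   static const std::map<std::string, PredRow> PredTable = {
//       {"storerb_io", {/*unpred*/ 100, /*if(p)*/ 101, /*if(!p)*/ 102}},
//   };
//
//   static int getPredOpcodeSketch(const std::string &Base, bool Sense) {
//     auto It = PredTable.find(Base);
//     if (It == PredTable.end())
//       return -1; // no predicated counterpart recorded
//     return Sense ? It->second.PredTrueOpc : It->second.PredFalseOpc;
//   }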
@@ -77,7 +96,7 @@ def getPredOpcode : InstrMapping { // def getFalsePredOpcode : InstrMapping { let FilterClass = "PredRel"; - let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken"]; + let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken", "isNT"]; let ColFields = ["PredSense"]; let KeyCol = ["true"]; let ValueCols = [["false"]]; @@ -89,7 +108,7 @@ def getFalsePredOpcode : InstrMapping { // def getTruePredOpcode : InstrMapping { let FilterClass = "PredRel"; - let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken"]; + let RowFields = ["BaseOpcode", "PNewValue", "isNVStore", "isBrTaken", "isNT"]; let ColFields = ["PredSense"]; let KeyCol = ["false"]; let ValueCols = [["true"]]; @@ -125,7 +144,7 @@ def getPredOldOpcode : InstrMapping { // def getNewValueOpcode : InstrMapping { let FilterClass = "NewValueRel"; - let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode"]; + let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode", "isNT"]; let ColFields = ["NValueST"]; let KeyCol = ["false"]; let ValueCols = [["true"]]; @@ -137,16 +156,16 @@ def getNewValueOpcode : InstrMapping { // def getNonNVStore : InstrMapping { let FilterClass = "NewValueRel"; - let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode"]; + let RowFields = ["BaseOpcode", "PredSense", "PNewValue", "addrMode", "isNT"]; let ColFields = ["NValueST"]; let KeyCol = ["true"]; let ValueCols = [["false"]]; } -def getBasedWithImmOffset : InstrMapping { +def getBaseWithImmOffset : InstrMapping { let FilterClass = "AddrModeRel"; let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore", - "isMEMri", "isFloat"]; + "isFloat"]; let ColFields = ["addrMode"]; let KeyCol = ["Absolute"]; let ValueCols = [["BaseImmOffset"]]; @@ -168,6 +187,37 @@ def getRegForm : InstrMapping { let ValueCols = [["reg"]]; } +def getRegShlForm : InstrMapping { + let FilterClass = "ImmRegShl"; + let RowFields = ["CextOpcode", "PredSense", "PNewValue", "isNVStore"]; + let ColFields = ["InputType"]; + let KeyCol = ["imm"]; + let ValueCols = [["reg"]]; +} + +def notTakenBranchPrediction : InstrMapping { + let FilterClass = "PredRel"; + let RowFields = ["BaseOpcode", "PNewValue", "PredSense", "isBranch", "isPredicated"]; + let ColFields = ["isBrTaken"]; + let KeyCol = ["true"]; + let ValueCols = [["false"]]; +} + +def takenBranchPrediction : InstrMapping { + let FilterClass = "PredRel"; + let RowFields = ["BaseOpcode", "PNewValue", "PredSense", "isBranch", "isPredicated"]; + let ColFields = ["isBrTaken"]; + let KeyCol = ["false"]; + let ValueCols = [["true"]]; +} + +def getRealHWInstr : InstrMapping { + let FilterClass = "IntrinsicsRel"; + let RowFields = ["BaseOpcode"]; + let ColFields = ["InstrType"]; + let KeyCol = ["Pseudo"]; + let ValueCols = [["Pseudo"], ["Real"]]; +} //===----------------------------------------------------------------------===// // Register File, Calling Conv, Instruction Descriptions //===----------------------------------------------------------------------===// @@ -192,12 +242,22 @@ def : Proc<"hexagonv4", HexagonModelV4, [ArchV4]>; def : Proc<"hexagonv5", HexagonModelV4, [ArchV4, ArchV5]>; +def : Proc<"hexagonv55", HexagonModelV55, + [ArchV4, ArchV5, ArchV55]>; +def : Proc<"hexagonv60", HexagonModelV60, + [ArchV4, ArchV5, ArchV55, ArchV60, ExtensionHVX]>; //===----------------------------------------------------------------------===// // Declare the target which we are implementing 
//===----------------------------------------------------------------------===// +def HexagonAsmParserVariant : AsmParserVariant { + int Variant = 0; + string TokenizingCharacters = "#()=:.<>!+*"; +} + def Hexagon : Target { // Pull in Instruction Info: let InstructionSet = HexagonInstrInfo; + let AssemblyParserVariants = [HexagonAsmParserVariant]; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp index 05728d2..e213089 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp @@ -40,11 +40,13 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCSection.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ELF.h" #include "llvm/Support/Format.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TargetRegistry.h" @@ -56,12 +58,27 @@ using namespace llvm; +namespace llvm { + void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP); +} + #define DEBUG_TYPE "asm-printer" static cl::opt<bool> AlignCalls( "hexagon-align-calls", cl::Hidden, cl::init(true), cl::desc("Insert falign after call instruction for Hexagon target")); +// Given a scalar register return its pair. +inline static unsigned getHexagonRegisterPair(unsigned Reg, + const MCRegisterInfo *RI) { + assert(Hexagon::IntRegsRegClass.contains(Reg)); + MCSuperRegIterator SR(Reg, RI, false); + unsigned Pair = *SR; + assert(Hexagon::DoubleRegsRegClass.contains(Pair)); + return Pair; +} + HexagonAsmPrinter::HexagonAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) : AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr) {} @@ -102,9 +119,8 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, // bool HexagonAsmPrinter:: isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const { - if (MBB->hasAddressTaken()) { + if (MBB->hasAddressTaken()) return false; - } return AsmPrinter::isBlockOnlyReachableByFallthrough(MBB); } @@ -117,7 +133,8 @@ bool HexagonAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &OS) { // Does this asm operand have a single letter operand modifier? if (ExtraCode && ExtraCode[0]) { - if (ExtraCode[1] != 0) return true; // Unknown modifier. + if (ExtraCode[1] != 0) + return true; // Unknown modifier. switch (ExtraCode[0]) { default: @@ -173,45 +190,407 @@ bool HexagonAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, return false; } +MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI, + MCStreamer &OutStreamer, + const MCOperand &Imm, int AlignSize) { + MCSymbol *Sym; + int64_t Value; + if (Imm.getExpr()->evaluateAsAbsolute(Value)) { + StringRef sectionPrefix; + std::string ImmString; + StringRef Name; + if (AlignSize == 8) { + Name = ".CONST_0000000000000000"; + sectionPrefix = ".gnu.linkonce.l8"; + ImmString = utohexstr(Value); + } else { + Name = ".CONST_00000000"; + sectionPrefix = ".gnu.linkonce.l4"; + ImmString = utohexstr(static_cast<uint32_t>(Value)); + } + + std::string symbolName = // Yes, leading zeros are kept. 
+ Name.drop_back(ImmString.size()).str() + ImmString; + std::string sectionName = sectionPrefix.str() + symbolName; + + MCSectionELF *Section = OutStreamer.getContext().getELFSection( + sectionName, ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); + OutStreamer.SwitchSection(Section); + + Sym = AP.OutContext.getOrCreateSymbol(Twine(symbolName)); + if (Sym->isUndefined()) { + OutStreamer.EmitLabel(Sym); + OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global); + OutStreamer.EmitIntValue(Value, AlignSize); + OutStreamer.EmitCodeAlignment(AlignSize); + } + } else { + assert(Imm.isExpr() && "Expected expression and found none"); + const MachineOperand &MO = MI.getOperand(1); + assert(MO.isGlobal() || MO.isCPI() || MO.isJTI()); + MCSymbol *MOSymbol = nullptr; + if (MO.isGlobal()) + MOSymbol = AP.getSymbol(MO.getGlobal()); + else if (MO.isCPI()) + MOSymbol = AP.GetCPISymbol(MO.getIndex()); + else if (MO.isJTI()) + MOSymbol = AP.GetJTISymbol(MO.getIndex()); + else + llvm_unreachable("Unknown operand type!"); + + StringRef SymbolName = MOSymbol->getName(); + std::string LitaName = ".CONST_" + SymbolName.str(); + + MCSectionELF *Section = OutStreamer.getContext().getELFSection( + ".lita", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); + + OutStreamer.SwitchSection(Section); + Sym = AP.OutContext.getOrCreateSymbol(Twine(LitaName)); + if (Sym->isUndefined()) { + OutStreamer.EmitLabel(Sym); + OutStreamer.EmitSymbolAttribute(Sym, MCSA_Local); + OutStreamer.EmitValue(Imm.getExpr(), AlignSize); + OutStreamer.EmitCodeAlignment(AlignSize); + } + } + return Sym; +} + +void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst, + const MachineInstr &MI) { + MCInst &MappedInst = static_cast <MCInst &>(Inst); + const MCRegisterInfo *RI = OutStreamer->getContext().getRegisterInfo(); + + switch (Inst.getOpcode()) { + default: return; + + // "$dst = CONST64(#$src1)", + case Hexagon::CONST64_Float_Real: + case Hexagon::CONST64_Int_Real: + if (!OutStreamer->hasRawTextSupport()) { + const MCOperand &Imm = MappedInst.getOperand(1); + MCSectionSubPair Current = OutStreamer->getCurrentSection(); + + MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 8); + + OutStreamer->SwitchSection(Current.first, Current.second); + MCInst TmpInst; + MCOperand &Reg = MappedInst.getOperand(0); + TmpInst.setOpcode(Hexagon::L2_loadrdgp); + TmpInst.addOperand(Reg); + TmpInst.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(Sym, OutContext))); + MappedInst = TmpInst; + + } + break; + case Hexagon::CONST32: + case Hexagon::CONST32_Float_Real: + case Hexagon::CONST32_Int_Real: + case Hexagon::FCONST32_nsdata: + if (!OutStreamer->hasRawTextSupport()) { + MCOperand &Imm = MappedInst.getOperand(1); + MCSectionSubPair Current = OutStreamer->getCurrentSection(); + MCSymbol *Sym = smallData(*this, MI, *OutStreamer, Imm, 4); + OutStreamer->SwitchSection(Current.first, Current.second); + MCInst TmpInst; + MCOperand &Reg = MappedInst.getOperand(0); + TmpInst.setOpcode(Hexagon::L2_loadrigp); + TmpInst.addOperand(Reg); + TmpInst.addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(Sym, OutContext))); + MappedInst = TmpInst; + } + break; + + // C2_pxfer_map maps to C2_or instruction. Though, it's possible to use + // C2_or during instruction selection itself but it results + // into suboptimal code. 
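// Several of the cases that follow (M2_vrcmpys_acc_s1, M2_vrcmpys_s1,
// M2_vrcmpys_s1rp, A4_boundscheck, A2_addsp) share one pattern: the low bit of
// the scalar register's hardware encoding selects the raw :hi or :lo opcode,
// and the operand is then widened to its enclosing register pair via
// getHexagonRegisterPair(). A standalone sketch of that selection, using
// placeholder opcode values rather than the real Hexagon enums:
static unsigned selectHiLoVariantSketch(unsigned RegEncoding, unsigned LoOpc,
                                        unsigned HiOpc) {
  // Odd encodings (r1, r3, ...) are the high half of their pair (r1:0, r3:2),
  // so they map to the ..._h form; even encodings map to ..._l.
  return (RegEncoding & 1) ? HiOpc : LoOpc;
}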
+ case Hexagon::C2_pxfer_map: { + MCOperand &Ps = Inst.getOperand(1); + MappedInst.setOpcode(Hexagon::C2_or); + MappedInst.addOperand(Ps); + return; + } + + // Vector reduce complex multiply by scalar, Rt & 1 map to :hi else :lo + // The insn is mapped from the 4 operand to the 3 operand raw form taking + // 3 register pairs. + case Hexagon::M2_vrcmpys_acc_s1: { + MCOperand &Rt = Inst.getOperand(3); + assert (Rt.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rt.getReg()); + if (Reg & 1) + MappedInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_h); + else + MappedInst.setOpcode(Hexagon::M2_vrcmpys_acc_s1_l); + Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI)); + return; + } + case Hexagon::M2_vrcmpys_s1: { + MCOperand &Rt = Inst.getOperand(2); + assert (Rt.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rt.getReg()); + if (Reg & 1) + MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1_h); + else + MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1_l); + Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI)); + return; + } + + case Hexagon::M2_vrcmpys_s1rp: { + MCOperand &Rt = Inst.getOperand(2); + assert (Rt.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rt.getReg()); + if (Reg & 1) + MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1rp_h); + else + MappedInst.setOpcode(Hexagon::M2_vrcmpys_s1rp_l); + Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI)); + return; + } + + case Hexagon::A4_boundscheck: { + MCOperand &Rs = Inst.getOperand(1); + assert (Rs.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rs.getReg()); + if (Reg & 1) // Odd mapped to raw:hi, regpair is rodd:odd-1, like r3:2 + MappedInst.setOpcode(Hexagon::A4_boundscheck_hi); + else // raw:lo + MappedInst.setOpcode(Hexagon::A4_boundscheck_lo); + Rs.setReg(getHexagonRegisterPair(Rs.getReg(), RI)); + return; + } + case Hexagon::S5_asrhub_rnd_sat_goodsyntax: { + MCOperand &MO = MappedInst.getOperand(2); + int64_t Imm; + MCExpr const *Expr = MO.getExpr(); + bool Success = Expr->evaluateAsAbsolute(Imm); + assert (Success && "Expected immediate and none was found");(void)Success; + MCInst TmpInst; + if (Imm == 0) { + TmpInst.setOpcode(Hexagon::S2_vsathub); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + MappedInst = TmpInst; + return; + } + TmpInst.setOpcode(Hexagon::S5_asrhub_rnd_sat); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + const MCExpr *One = MCConstantExpr::create(1, OutContext); + const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext); + TmpInst.addOperand(MCOperand::createExpr(Sub)); + MappedInst = TmpInst; + return; + } + case Hexagon::S5_vasrhrnd_goodsyntax: + case Hexagon::S2_asr_i_p_rnd_goodsyntax: { + MCOperand &MO2 = MappedInst.getOperand(2); + MCExpr const *Expr = MO2.getExpr(); + int64_t Imm; + bool Success = Expr->evaluateAsAbsolute(Imm); + assert (Success && "Expected immediate and none was found");(void)Success; + MCInst TmpInst; + if (Imm == 0) { + TmpInst.setOpcode(Hexagon::A2_combinew); + TmpInst.addOperand(MappedInst.getOperand(0)); + MCOperand &MO1 = MappedInst.getOperand(1); + unsigned High = RI->getSubReg(MO1.getReg(), Hexagon::subreg_hireg); + unsigned Low = RI->getSubReg(MO1.getReg(), Hexagon::subreg_loreg); + // Add a new operand for the second register in the pair. 
+ TmpInst.addOperand(MCOperand::createReg(High)); + TmpInst.addOperand(MCOperand::createReg(Low)); + MappedInst = TmpInst; + return; + } + + if (Inst.getOpcode() == Hexagon::S2_asr_i_p_rnd_goodsyntax) + TmpInst.setOpcode(Hexagon::S2_asr_i_p_rnd); + else + TmpInst.setOpcode(Hexagon::S5_vasrhrnd); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + const MCExpr *One = MCConstantExpr::create(1, OutContext); + const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext); + TmpInst.addOperand(MCOperand::createExpr(Sub)); + MappedInst = TmpInst; + return; + } + // if ("#u5==0") Assembler mapped to: "Rd=Rs"; else Rd=asr(Rs,#u5-1):rnd + case Hexagon::S2_asr_i_r_rnd_goodsyntax: { + MCOperand &MO = Inst.getOperand(2); + MCExpr const *Expr = MO.getExpr(); + int64_t Imm; + bool Success = Expr->evaluateAsAbsolute(Imm); + assert (Success && "Expected immediate and none was found");(void)Success; + MCInst TmpInst; + if (Imm == 0) { + TmpInst.setOpcode(Hexagon::A2_tfr); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + MappedInst = TmpInst; + return; + } + TmpInst.setOpcode(Hexagon::S2_asr_i_r_rnd); + TmpInst.addOperand(MappedInst.getOperand(0)); + TmpInst.addOperand(MappedInst.getOperand(1)); + const MCExpr *One = MCConstantExpr::create(1, OutContext); + const MCExpr *Sub = MCBinaryExpr::createSub(Expr, One, OutContext); + TmpInst.addOperand(MCOperand::createExpr(Sub)); + MappedInst = TmpInst; + return; + } + case Hexagon::TFRI_f: + MappedInst.setOpcode(Hexagon::A2_tfrsi); + return; + case Hexagon::TFRI_cPt_f: + MappedInst.setOpcode(Hexagon::C2_cmoveit); + return; + case Hexagon::TFRI_cNotPt_f: + MappedInst.setOpcode(Hexagon::C2_cmoveif); + return; + case Hexagon::MUX_ri_f: + MappedInst.setOpcode(Hexagon::C2_muxri); + return; + case Hexagon::MUX_ir_f: + MappedInst.setOpcode(Hexagon::C2_muxir); + return; + + // Translate a "$Rdd = #imm" to "$Rdd = combine(#[-1,0], #imm)" + case Hexagon::A2_tfrpi: { + MCInst TmpInst; + MCOperand &Rdd = MappedInst.getOperand(0); + MCOperand &MO = MappedInst.getOperand(1); + + TmpInst.setOpcode(Hexagon::A2_combineii); + TmpInst.addOperand(Rdd); + int64_t Imm; + bool Success = MO.getExpr()->evaluateAsAbsolute(Imm); + if (Success && Imm < 0) { + const MCExpr *MOne = MCConstantExpr::create(-1, OutContext); + TmpInst.addOperand(MCOperand::createExpr(MOne)); + } else { + const MCExpr *Zero = MCConstantExpr::create(0, OutContext); + TmpInst.addOperand(MCOperand::createExpr(Zero)); + } + TmpInst.addOperand(MO); + MappedInst = TmpInst; + return; + } + // Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)" + case Hexagon::A2_tfrp: { + MCOperand &MO = MappedInst.getOperand(1); + unsigned High = RI->getSubReg(MO.getReg(), Hexagon::subreg_hireg); + unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::subreg_loreg); + MO.setReg(High); + // Add a new operand for the second register in the pair. + MappedInst.addOperand(MCOperand::createReg(Low)); + MappedInst.setOpcode(Hexagon::A2_combinew); + return; + } + + case Hexagon::A2_tfrpt: + case Hexagon::A2_tfrpf: { + MCOperand &MO = MappedInst.getOperand(2); + unsigned High = RI->getSubReg(MO.getReg(), Hexagon::subreg_hireg); + unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::subreg_loreg); + MO.setReg(High); + // Add a new operand for the second register in the pair. + MappedInst.addOperand(MCOperand::createReg(Low)); + MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt) + ? 
Hexagon::C2_ccombinewt + : Hexagon::C2_ccombinewf); + return; + } + case Hexagon::A2_tfrptnew: + case Hexagon::A2_tfrpfnew: { + MCOperand &MO = MappedInst.getOperand(2); + unsigned High = RI->getSubReg(MO.getReg(), Hexagon::subreg_hireg); + unsigned Low = RI->getSubReg(MO.getReg(), Hexagon::subreg_loreg); + MO.setReg(High); + // Add a new operand for the second register in the pair. + MappedInst.addOperand(MCOperand::createReg(Low)); + MappedInst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew) + ? Hexagon::C2_ccombinewnewt + : Hexagon::C2_ccombinewnewf); + return; + } + + case Hexagon::M2_mpysmi: { + MCOperand &Imm = MappedInst.getOperand(2); + MCExpr const *Expr = Imm.getExpr(); + int64_t Value; + bool Success = Expr->evaluateAsAbsolute(Value); + assert(Success);(void)Success; + if (Value < 0 && Value > -256) { + MappedInst.setOpcode(Hexagon::M2_mpysin); + Imm.setExpr(MCUnaryExpr::createMinus(Expr, OutContext)); + } + else + MappedInst.setOpcode(Hexagon::M2_mpysip); + return; + } + + case Hexagon::A2_addsp: { + MCOperand &Rt = Inst.getOperand(1); + assert (Rt.isReg() && "Expected register and none was found"); + unsigned Reg = RI->getEncodingValue(Rt.getReg()); + if (Reg & 1) + MappedInst.setOpcode(Hexagon::A2_addsph); + else + MappedInst.setOpcode(Hexagon::A2_addspl); + Rt.setReg(getHexagonRegisterPair(Rt.getReg(), RI)); + return; + } + case Hexagon::HEXAGON_V6_vd0_pseudo: + case Hexagon::HEXAGON_V6_vd0_pseudo_128B: { + MCInst TmpInst; + assert (Inst.getOperand(0).isReg() && + "Expected register and none was found"); + + TmpInst.setOpcode(Hexagon::V6_vxor); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(0)); + MappedInst = TmpInst; + return; + } + + } +} + /// printMachineInstruction -- Print out a single Hexagon MI in Darwin syntax to /// the current output stream. /// void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) { - MCInst MCB; - MCB.setOpcode(Hexagon::BUNDLE); - MCB.addOperand(MCOperand::createImm(0)); + MCInst MCB = HexagonMCInstrInfo::createBundle(); + const MCInstrInfo &MCII = *Subtarget->getInstrInfo(); if (MI->isBundle()) { const MachineBasicBlock* MBB = MI->getParent(); - MachineBasicBlock::const_instr_iterator MII = MI; + MachineBasicBlock::const_instr_iterator MII = MI->getIterator(); unsigned IgnoreCount = 0; - for (++MII; MII != MBB->end() && MII->isInsideBundle(); ++MII) { + for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII) if (MII->getOpcode() == TargetOpcode::DBG_VALUE || MII->getOpcode() == TargetOpcode::IMPLICIT_DEF) ++IgnoreCount; - else { - HexagonLowerToMC(MII, MCB, *this); - } - } + else + HexagonLowerToMC(MCII, &*MII, MCB, *this); } - else { - HexagonLowerToMC(MI, MCB, *this); - HexagonMCInstrInfo::padEndloop(MCB); - } - // Examine the packet and try to find instructions that can be converted - // to compounds. - HexagonMCInstrInfo::tryCompound(*Subtarget->getInstrInfo(), - OutStreamer->getContext(), MCB); - // Examine the packet and convert pairs of instructions to duplex - // instructions when possible. 
- SmallVector<DuplexCandidate, 8> possibleDuplexes; - possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties( - *Subtarget->getInstrInfo(), MCB); - HexagonMCShuffle(*Subtarget->getInstrInfo(), *Subtarget, - OutStreamer->getContext(), MCB, possibleDuplexes); - EmitToStreamer(*OutStreamer, MCB); + else + HexagonLowerToMC(MCII, MI, MCB, *this); + + bool Ok = HexagonMCInstrInfo::canonicalizePacket( + MCII, *Subtarget, OutStreamer->getContext(), MCB, nullptr); + assert(Ok); + (void)Ok; + if(HexagonMCInstrInfo::bundleSize(MCB) == 0) + return; + OutStreamer->EmitInstruction(MCB, getSubtargetInfo()); } extern "C" void LLVMInitializeHexagonAsmPrinter() { diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h index 792fc8b..a78d97e 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h @@ -42,6 +42,10 @@ namespace llvm { void EmitInstruction(const MachineInstr *MI) override; + void HexagonProcessInstruction(MCInst &Inst, + const MachineInstr &MBB); + + void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant, const char *ExtraCode, diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp new file mode 100644 index 0000000..77907b0 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -0,0 +1,2778 @@ +//===--- HexagonBitSimplify.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "hexbit" + +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "HexagonTargetMachine.h" +#include "HexagonBitTracker.h" + +using namespace llvm; + +namespace llvm { + void initializeHexagonBitSimplifyPass(PassRegistry& Registry); + FunctionPass *createHexagonBitSimplify(); +} + +namespace { + // Set of virtual registers, based on BitVector. 
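// The RegisterSet below stores virtual registers as dense bit indices, so it
// converts between register numbers and indices (v2x/x2v). A standalone sketch
// of that conversion; the real code calls TargetRegisterInfo::virtReg2Index and
// index2VirtReg, and the specific flag bit used here is an assumption for
// illustration only:
#include <cstdint>

static const uint32_t VirtRegFlagSketch = 1u << 31; // assumed "virtual" marker bit

static uint32_t v2xSketch(uint32_t VReg) { return VReg & ~VirtRegFlagSketch; }
static uint32_t x2vSketch(uint32_t Index) { return Index | VirtRegFlagSketch; }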
+ struct RegisterSet : private BitVector { + RegisterSet() : BitVector() {} + explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {} + RegisterSet(const RegisterSet &RS) : BitVector(RS) {} + + using BitVector::clear; + using BitVector::count; + + unsigned find_first() const { + int First = BitVector::find_first(); + if (First < 0) + return 0; + return x2v(First); + } + + unsigned find_next(unsigned Prev) const { + int Next = BitVector::find_next(v2x(Prev)); + if (Next < 0) + return 0; + return x2v(Next); + } + + RegisterSet &insert(unsigned R) { + unsigned Idx = v2x(R); + ensure(Idx); + return static_cast<RegisterSet&>(BitVector::set(Idx)); + } + RegisterSet &remove(unsigned R) { + unsigned Idx = v2x(R); + if (Idx >= size()) + return *this; + return static_cast<RegisterSet&>(BitVector::reset(Idx)); + } + + RegisterSet &insert(const RegisterSet &Rs) { + return static_cast<RegisterSet&>(BitVector::operator|=(Rs)); + } + RegisterSet &remove(const RegisterSet &Rs) { + return static_cast<RegisterSet&>(BitVector::reset(Rs)); + } + + reference operator[](unsigned R) { + unsigned Idx = v2x(R); + ensure(Idx); + return BitVector::operator[](Idx); + } + bool operator[](unsigned R) const { + unsigned Idx = v2x(R); + assert(Idx < size()); + return BitVector::operator[](Idx); + } + bool has(unsigned R) const { + unsigned Idx = v2x(R); + if (Idx >= size()) + return false; + return BitVector::test(Idx); + } + + bool empty() const { + return !BitVector::any(); + } + bool includes(const RegisterSet &Rs) const { + // A.BitVector::test(B) <=> A-B != {} + return !Rs.BitVector::test(*this); + } + bool intersects(const RegisterSet &Rs) const { + return BitVector::anyCommon(Rs); + } + + private: + void ensure(unsigned Idx) { + if (size() <= Idx) + resize(std::max(Idx+1, 32U)); + } + static inline unsigned v2x(unsigned v) { + return TargetRegisterInfo::virtReg2Index(v); + } + static inline unsigned x2v(unsigned x) { + return TargetRegisterInfo::index2VirtReg(x); + } + }; + + + struct PrintRegSet { + PrintRegSet(const RegisterSet &S, const TargetRegisterInfo *RI) + : RS(S), TRI(RI) {} + friend raw_ostream &operator<< (raw_ostream &OS, + const PrintRegSet &P); + private: + const RegisterSet &RS; + const TargetRegisterInfo *TRI; + }; + + raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) + LLVM_ATTRIBUTE_UNUSED; + raw_ostream &operator<< (raw_ostream &OS, const PrintRegSet &P) { + OS << '{'; + for (unsigned R = P.RS.find_first(); R; R = P.RS.find_next(R)) + OS << ' ' << PrintReg(R, P.TRI); + OS << " }"; + return OS; + } +} + + +namespace { + class Transformation; + + class HexagonBitSimplify : public MachineFunctionPass { + public: + static char ID; + HexagonBitSimplify() : MachineFunctionPass(ID), MDT(0) { + initializeHexagonBitSimplifyPass(*PassRegistry::getPassRegistry()); + } + virtual const char *getPassName() const { + return "Hexagon bit simplification"; + } + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + virtual bool runOnMachineFunction(MachineFunction &MF); + + static void getInstrDefs(const MachineInstr &MI, RegisterSet &Defs); + static void getInstrUses(const MachineInstr &MI, RegisterSet &Uses); + static bool isEqual(const BitTracker::RegisterCell &RC1, uint16_t B1, + const BitTracker::RegisterCell &RC2, uint16_t B2, uint16_t W); + static bool isConst(const BitTracker::RegisterCell &RC, uint16_t B, + uint16_t W); + static bool 
isZero(const BitTracker::RegisterCell &RC, uint16_t B, + uint16_t W); + static bool getConst(const BitTracker::RegisterCell &RC, uint16_t B, + uint16_t W, uint64_t &U); + static bool replaceReg(unsigned OldR, unsigned NewR, + MachineRegisterInfo &MRI); + static bool getSubregMask(const BitTracker::RegisterRef &RR, + unsigned &Begin, unsigned &Width, MachineRegisterInfo &MRI); + static bool replaceRegWithSub(unsigned OldR, unsigned NewR, + unsigned NewSR, MachineRegisterInfo &MRI); + static bool replaceSubWithSub(unsigned OldR, unsigned OldSR, + unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI); + static bool parseRegSequence(const MachineInstr &I, + BitTracker::RegisterRef &SL, BitTracker::RegisterRef &SH); + + static bool getUsedBitsInStore(unsigned Opc, BitVector &Bits, + uint16_t Begin); + static bool getUsedBits(unsigned Opc, unsigned OpN, BitVector &Bits, + uint16_t Begin, const HexagonInstrInfo &HII); + + static const TargetRegisterClass *getFinalVRegClass( + const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI); + static bool isTransparentCopy(const BitTracker::RegisterRef &RD, + const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI); + + private: + MachineDominatorTree *MDT; + + bool visitBlock(MachineBasicBlock &B, Transformation &T, RegisterSet &AVs); + }; + + char HexagonBitSimplify::ID = 0; + typedef HexagonBitSimplify HBS; + + + // The purpose of this class is to provide a common facility to traverse + // the function top-down or bottom-up via the dominator tree, and keep + // track of the available registers. + class Transformation { + public: + bool TopDown; + Transformation(bool TD) : TopDown(TD) {} + virtual bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) = 0; + virtual ~Transformation() {} + }; +} + +INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexbit", + "Hexagon bit simplification", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(HexagonBitSimplify, "hexbit", + "Hexagon bit simplification", false, false) + + +bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T, + RegisterSet &AVs) { + MachineDomTreeNode *N = MDT->getNode(&B); + typedef GraphTraits<MachineDomTreeNode*> GTN; + bool Changed = false; + + if (T.TopDown) + Changed = T.processBlock(B, AVs); + + RegisterSet Defs; + for (auto &I : B) + getInstrDefs(I, Defs); + RegisterSet NewAVs = AVs; + NewAVs.insert(Defs); + + for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) { + MachineBasicBlock *SB = (*I)->getBlock(); + Changed |= visitBlock(*SB, T, NewAVs); + } + if (!T.TopDown) + Changed |= T.processBlock(B, AVs); + + return Changed; +} + +// +// Utility functions: +// +void HexagonBitSimplify::getInstrDefs(const MachineInstr &MI, + RegisterSet &Defs) { + for (auto &Op : MI.operands()) { + if (!Op.isReg() || !Op.isDef()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + Defs.insert(R); + } +} + +void HexagonBitSimplify::getInstrUses(const MachineInstr &MI, + RegisterSet &Uses) { + for (auto &Op : MI.operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + Uses.insert(R); + } +} + +// Check if all the bits in range [B, E) in both cells are equal. 
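// The helpers in this group (isEqual, isConst, isZero, getConst) all walk a
// BitTracker::RegisterCell, which models every bit of a register as 0, 1, or a
// symbolic reference. A minimal standalone model of the getConst() idea defined
// further below, using a plain tri-state vector in place of the real BitTracker
// types (illustrative only):
#include <cstdint>
#include <vector>

enum class TriBit { Zero, One, Unknown };

// Reads bits [B, B+W) most-significant first, as getConst() does; the caller
// must ensure the cell holds at least B+W bits.
static bool getConstSketch(const std::vector<TriBit> &Cell, unsigned B,
                           unsigned W, uint64_t &U) {
  uint64_t T = 0;
  for (unsigned i = B + W; i > B; --i) {
    T <<= 1;
    if (Cell[i - 1] == TriBit::One)
      T |= 1;
    else if (Cell[i - 1] != TriBit::Zero)
      return false; // some bit is not a compile-time constant
  }
  U = T;
  return true;
}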
+bool HexagonBitSimplify::isEqual(const BitTracker::RegisterCell &RC1, + uint16_t B1, const BitTracker::RegisterCell &RC2, uint16_t B2, + uint16_t W) { + for (uint16_t i = 0; i < W; ++i) { + // If RC1[i] is "bottom", it cannot be proven equal to RC2[i]. + if (RC1[B1+i].Type == BitTracker::BitValue::Ref && RC1[B1+i].RefI.Reg == 0) + return false; + // Same for RC2[i]. + if (RC2[B2+i].Type == BitTracker::BitValue::Ref && RC2[B2+i].RefI.Reg == 0) + return false; + if (RC1[B1+i] != RC2[B2+i]) + return false; + } + return true; +} + + +bool HexagonBitSimplify::isConst(const BitTracker::RegisterCell &RC, + uint16_t B, uint16_t W) { + assert(B < RC.width() && B+W <= RC.width()); + for (uint16_t i = B; i < B+W; ++i) + if (!RC[i].num()) + return false; + return true; +} + + +bool HexagonBitSimplify::isZero(const BitTracker::RegisterCell &RC, + uint16_t B, uint16_t W) { + assert(B < RC.width() && B+W <= RC.width()); + for (uint16_t i = B; i < B+W; ++i) + if (!RC[i].is(0)) + return false; + return true; +} + + +bool HexagonBitSimplify::getConst(const BitTracker::RegisterCell &RC, + uint16_t B, uint16_t W, uint64_t &U) { + assert(B < RC.width() && B+W <= RC.width()); + int64_t T = 0; + for (uint16_t i = B+W; i > B; --i) { + const BitTracker::BitValue &BV = RC[i-1]; + T <<= 1; + if (BV.is(1)) + T |= 1; + else if (!BV.is(0)) + return false; + } + U = T; + return true; +} + + +bool HexagonBitSimplify::replaceReg(unsigned OldR, unsigned NewR, + MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(OldR) || + !TargetRegisterInfo::isVirtualRegister(NewR)) + return false; + auto Begin = MRI.use_begin(OldR), End = MRI.use_end(); + decltype(End) NextI; + for (auto I = Begin; I != End; I = NextI) { + NextI = std::next(I); + I->setReg(NewR); + } + return Begin != End; +} + + +bool HexagonBitSimplify::replaceRegWithSub(unsigned OldR, unsigned NewR, + unsigned NewSR, MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(OldR) || + !TargetRegisterInfo::isVirtualRegister(NewR)) + return false; + auto Begin = MRI.use_begin(OldR), End = MRI.use_end(); + decltype(End) NextI; + for (auto I = Begin; I != End; I = NextI) { + NextI = std::next(I); + I->setReg(NewR); + I->setSubReg(NewSR); + } + return Begin != End; +} + + +bool HexagonBitSimplify::replaceSubWithSub(unsigned OldR, unsigned OldSR, + unsigned NewR, unsigned NewSR, MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(OldR) || + !TargetRegisterInfo::isVirtualRegister(NewR)) + return false; + auto Begin = MRI.use_begin(OldR), End = MRI.use_end(); + decltype(End) NextI; + for (auto I = Begin; I != End; I = NextI) { + NextI = std::next(I); + if (I->getSubReg() != OldSR) + continue; + I->setReg(NewR); + I->setSubReg(NewSR); + } + return Begin != End; +} + + +// For a register ref (pair Reg:Sub), set Begin to the position of the LSB +// of Sub in Reg, and set Width to the size of Sub in bits. Return true, +// if this succeeded, otherwise return false. +bool HexagonBitSimplify::getSubregMask(const BitTracker::RegisterRef &RR, + unsigned &Begin, unsigned &Width, MachineRegisterInfo &MRI) { + const TargetRegisterClass *RC = MRI.getRegClass(RR.Reg); + if (RC == &Hexagon::IntRegsRegClass) { + assert(RR.Sub == 0); + Begin = 0; + Width = 32; + return true; + } + if (RC == &Hexagon::DoubleRegsRegClass) { + if (RR.Sub == 0) { + Begin = 0; + Width = 64; + return true; + } + assert(RR.Sub == Hexagon::subreg_loreg || RR.Sub == Hexagon::subreg_hireg); + Width = 32; + Begin = (RR.Sub == Hexagon::subreg_loreg ? 
0 : 32); + return true; + } + return false; +} + + +// For a REG_SEQUENCE, set SL to the low subregister and SH to the high +// subregister. +bool HexagonBitSimplify::parseRegSequence(const MachineInstr &I, + BitTracker::RegisterRef &SL, BitTracker::RegisterRef &SH) { + assert(I.getOpcode() == TargetOpcode::REG_SEQUENCE); + unsigned Sub1 = I.getOperand(2).getImm(), Sub2 = I.getOperand(4).getImm(); + assert(Sub1 != Sub2); + if (Sub1 == Hexagon::subreg_loreg && Sub2 == Hexagon::subreg_hireg) { + SL = I.getOperand(1); + SH = I.getOperand(3); + return true; + } + if (Sub1 == Hexagon::subreg_hireg && Sub2 == Hexagon::subreg_loreg) { + SH = I.getOperand(1); + SL = I.getOperand(3); + return true; + } + return false; +} + + +// All stores (except 64-bit stores) take a 32-bit register as the source +// of the value to be stored. If the instruction stores into a location +// that is shorter than 32 bits, some bits of the source register are not +// used. For each store instruction, calculate the set of used bits in +// the source register, and set appropriate bits in Bits. Return true if +// the bits are calculated, false otherwise. +bool HexagonBitSimplify::getUsedBitsInStore(unsigned Opc, BitVector &Bits, + uint16_t Begin) { + using namespace Hexagon; + + switch (Opc) { + // Store byte + case S2_storerb_io: // memb(Rs32+#s11:0)=Rt32 + case S2_storerbnew_io: // memb(Rs32+#s11:0)=Nt8.new + case S2_pstorerbt_io: // if (Pv4) memb(Rs32+#u6:0)=Rt32 + case S2_pstorerbf_io: // if (!Pv4) memb(Rs32+#u6:0)=Rt32 + case S4_pstorerbtnew_io: // if (Pv4.new) memb(Rs32+#u6:0)=Rt32 + case S4_pstorerbfnew_io: // if (!Pv4.new) memb(Rs32+#u6:0)=Rt32 + case S2_pstorerbnewt_io: // if (Pv4) memb(Rs32+#u6:0)=Nt8.new + case S2_pstorerbnewf_io: // if (!Pv4) memb(Rs32+#u6:0)=Nt8.new + case S4_pstorerbnewtnew_io: // if (Pv4.new) memb(Rs32+#u6:0)=Nt8.new + case S4_pstorerbnewfnew_io: // if (!Pv4.new) memb(Rs32+#u6:0)=Nt8.new + case S2_storerb_pi: // memb(Rx32++#s4:0)=Rt32 + case S2_storerbnew_pi: // memb(Rx32++#s4:0)=Nt8.new + case S2_pstorerbt_pi: // if (Pv4) memb(Rx32++#s4:0)=Rt32 + case S2_pstorerbf_pi: // if (!Pv4) memb(Rx32++#s4:0)=Rt32 + case S2_pstorerbtnew_pi: // if (Pv4.new) memb(Rx32++#s4:0)=Rt32 + case S2_pstorerbfnew_pi: // if (!Pv4.new) memb(Rx32++#s4:0)=Rt32 + case S2_pstorerbnewt_pi: // if (Pv4) memb(Rx32++#s4:0)=Nt8.new + case S2_pstorerbnewf_pi: // if (!Pv4) memb(Rx32++#s4:0)=Nt8.new + case S2_pstorerbnewtnew_pi: // if (Pv4.new) memb(Rx32++#s4:0)=Nt8.new + case S2_pstorerbnewfnew_pi: // if (!Pv4.new) memb(Rx32++#s4:0)=Nt8.new + case S4_storerb_ap: // memb(Re32=#U6)=Rt32 + case S4_storerbnew_ap: // memb(Re32=#U6)=Nt8.new + case S2_storerb_pr: // memb(Rx32++Mu2)=Rt32 + case S2_storerbnew_pr: // memb(Rx32++Mu2)=Nt8.new + case S4_storerb_ur: // memb(Ru32<<#u2+#U6)=Rt32 + case S4_storerbnew_ur: // memb(Ru32<<#u2+#U6)=Nt8.new + case S2_storerb_pbr: // memb(Rx32++Mu2:brev)=Rt32 + case S2_storerbnew_pbr: // memb(Rx32++Mu2:brev)=Nt8.new + case S2_storerb_pci: // memb(Rx32++#s4:0:circ(Mu2))=Rt32 + case S2_storerbnew_pci: // memb(Rx32++#s4:0:circ(Mu2))=Nt8.new + case S2_storerb_pcr: // memb(Rx32++I:circ(Mu2))=Rt32 + case S2_storerbnew_pcr: // memb(Rx32++I:circ(Mu2))=Nt8.new + case S4_storerb_rr: // memb(Rs32+Ru32<<#u2)=Rt32 + case S4_storerbnew_rr: // memb(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerbt_rr: // if (Pv4) memb(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerbf_rr: // if (!Pv4) memb(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerbtnew_rr: // if (Pv4.new) memb(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerbfnew_rr: // if (!Pv4.new) 
memb(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerbnewt_rr: // if (Pv4) memb(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerbnewf_rr: // if (!Pv4) memb(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerbnewtnew_rr: // if (Pv4.new) memb(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerbnewfnew_rr: // if (!Pv4.new) memb(Rs32+Ru32<<#u2)=Nt8.new + case S2_storerbgp: // memb(gp+#u16:0)=Rt32 + case S2_storerbnewgp: // memb(gp+#u16:0)=Nt8.new + case S4_pstorerbt_abs: // if (Pv4) memb(#u6)=Rt32 + case S4_pstorerbf_abs: // if (!Pv4) memb(#u6)=Rt32 + case S4_pstorerbtnew_abs: // if (Pv4.new) memb(#u6)=Rt32 + case S4_pstorerbfnew_abs: // if (!Pv4.new) memb(#u6)=Rt32 + case S4_pstorerbnewt_abs: // if (Pv4) memb(#u6)=Nt8.new + case S4_pstorerbnewf_abs: // if (!Pv4) memb(#u6)=Nt8.new + case S4_pstorerbnewtnew_abs: // if (Pv4.new) memb(#u6)=Nt8.new + case S4_pstorerbnewfnew_abs: // if (!Pv4.new) memb(#u6)=Nt8.new + Bits.set(Begin, Begin+8); + return true; + + // Store low half + case S2_storerh_io: // memh(Rs32+#s11:1)=Rt32 + case S2_storerhnew_io: // memh(Rs32+#s11:1)=Nt8.new + case S2_pstorerht_io: // if (Pv4) memh(Rs32+#u6:1)=Rt32 + case S2_pstorerhf_io: // if (!Pv4) memh(Rs32+#u6:1)=Rt32 + case S4_pstorerhtnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Rt32 + case S4_pstorerhfnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Rt32 + case S2_pstorerhnewt_io: // if (Pv4) memh(Rs32+#u6:1)=Nt8.new + case S2_pstorerhnewf_io: // if (!Pv4) memh(Rs32+#u6:1)=Nt8.new + case S4_pstorerhnewtnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Nt8.new + case S4_pstorerhnewfnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Nt8.new + case S2_storerh_pi: // memh(Rx32++#s4:1)=Rt32 + case S2_storerhnew_pi: // memh(Rx32++#s4:1)=Nt8.new + case S2_pstorerht_pi: // if (Pv4) memh(Rx32++#s4:1)=Rt32 + case S2_pstorerhf_pi: // if (!Pv4) memh(Rx32++#s4:1)=Rt32 + case S2_pstorerhtnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Rt32 + case S2_pstorerhfnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Rt32 + case S2_pstorerhnewt_pi: // if (Pv4) memh(Rx32++#s4:1)=Nt8.new + case S2_pstorerhnewf_pi: // if (!Pv4) memh(Rx32++#s4:1)=Nt8.new + case S2_pstorerhnewtnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Nt8.new + case S2_pstorerhnewfnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Nt8.new + case S4_storerh_ap: // memh(Re32=#U6)=Rt32 + case S4_storerhnew_ap: // memh(Re32=#U6)=Nt8.new + case S2_storerh_pr: // memh(Rx32++Mu2)=Rt32 + case S2_storerhnew_pr: // memh(Rx32++Mu2)=Nt8.new + case S4_storerh_ur: // memh(Ru32<<#u2+#U6)=Rt32 + case S4_storerhnew_ur: // memh(Ru32<<#u2+#U6)=Nt8.new + case S2_storerh_pbr: // memh(Rx32++Mu2:brev)=Rt32 + case S2_storerhnew_pbr: // memh(Rx32++Mu2:brev)=Nt8.new + case S2_storerh_pci: // memh(Rx32++#s4:1:circ(Mu2))=Rt32 + case S2_storerhnew_pci: // memh(Rx32++#s4:1:circ(Mu2))=Nt8.new + case S2_storerh_pcr: // memh(Rx32++I:circ(Mu2))=Rt32 + case S2_storerhnew_pcr: // memh(Rx32++I:circ(Mu2))=Nt8.new + case S4_storerh_rr: // memh(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerht_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerhf_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerhtnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Rt32 + case S4_pstorerhfnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Rt32 + case S4_storerhnew_rr: // memh(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerhnewt_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerhnewf_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerhnewtnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Nt8.new + case S4_pstorerhnewfnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Nt8.new + case S2_storerhgp: // memh(gp+#u16:1)=Rt32 
+ case S2_storerhnewgp: // memh(gp+#u16:1)=Nt8.new + case S4_pstorerht_abs: // if (Pv4) memh(#u6)=Rt32 + case S4_pstorerhf_abs: // if (!Pv4) memh(#u6)=Rt32 + case S4_pstorerhtnew_abs: // if (Pv4.new) memh(#u6)=Rt32 + case S4_pstorerhfnew_abs: // if (!Pv4.new) memh(#u6)=Rt32 + case S4_pstorerhnewt_abs: // if (Pv4) memh(#u6)=Nt8.new + case S4_pstorerhnewf_abs: // if (!Pv4) memh(#u6)=Nt8.new + case S4_pstorerhnewtnew_abs: // if (Pv4.new) memh(#u6)=Nt8.new + case S4_pstorerhnewfnew_abs: // if (!Pv4.new) memh(#u6)=Nt8.new + Bits.set(Begin, Begin+16); + return true; + + // Store high half + case S2_storerf_io: // memh(Rs32+#s11:1)=Rt.H32 + case S2_pstorerft_io: // if (Pv4) memh(Rs32+#u6:1)=Rt.H32 + case S2_pstorerff_io: // if (!Pv4) memh(Rs32+#u6:1)=Rt.H32 + case S4_pstorerftnew_io: // if (Pv4.new) memh(Rs32+#u6:1)=Rt.H32 + case S4_pstorerffnew_io: // if (!Pv4.new) memh(Rs32+#u6:1)=Rt.H32 + case S2_storerf_pi: // memh(Rx32++#s4:1)=Rt.H32 + case S2_pstorerft_pi: // if (Pv4) memh(Rx32++#s4:1)=Rt.H32 + case S2_pstorerff_pi: // if (!Pv4) memh(Rx32++#s4:1)=Rt.H32 + case S2_pstorerftnew_pi: // if (Pv4.new) memh(Rx32++#s4:1)=Rt.H32 + case S2_pstorerffnew_pi: // if (!Pv4.new) memh(Rx32++#s4:1)=Rt.H32 + case S4_storerf_ap: // memh(Re32=#U6)=Rt.H32 + case S2_storerf_pr: // memh(Rx32++Mu2)=Rt.H32 + case S4_storerf_ur: // memh(Ru32<<#u2+#U6)=Rt.H32 + case S2_storerf_pbr: // memh(Rx32++Mu2:brev)=Rt.H32 + case S2_storerf_pci: // memh(Rx32++#s4:1:circ(Mu2))=Rt.H32 + case S2_storerf_pcr: // memh(Rx32++I:circ(Mu2))=Rt.H32 + case S4_storerf_rr: // memh(Rs32+Ru32<<#u2)=Rt.H32 + case S4_pstorerft_rr: // if (Pv4) memh(Rs32+Ru32<<#u2)=Rt.H32 + case S4_pstorerff_rr: // if (!Pv4) memh(Rs32+Ru32<<#u2)=Rt.H32 + case S4_pstorerftnew_rr: // if (Pv4.new) memh(Rs32+Ru32<<#u2)=Rt.H32 + case S4_pstorerffnew_rr: // if (!Pv4.new) memh(Rs32+Ru32<<#u2)=Rt.H32 + case S2_storerfgp: // memh(gp+#u16:1)=Rt.H32 + case S4_pstorerft_abs: // if (Pv4) memh(#u6)=Rt.H32 + case S4_pstorerff_abs: // if (!Pv4) memh(#u6)=Rt.H32 + case S4_pstorerftnew_abs: // if (Pv4.new) memh(#u6)=Rt.H32 + case S4_pstorerffnew_abs: // if (!Pv4.new) memh(#u6)=Rt.H32 + Bits.set(Begin+16, Begin+32); + return true; + } + + return false; +} + + +// For an instruction with opcode Opc, calculate the set of bits that it +// uses in a register in operand OpN. This only calculates the set of used +// bits for cases where it does not depend on any operands (as is the case +// in shifts, for example). For concrete instructions from a program, the +// operand may be a subregister of a larger register, while Bits would +// correspond to the larger register in its entirety. Because of that, +// the parameter Begin can be used to indicate which bit of Bits should be +// considered the LSB of of the operand. +bool HexagonBitSimplify::getUsedBits(unsigned Opc, unsigned OpN, + BitVector &Bits, uint16_t Begin, const HexagonInstrInfo &HII) { + using namespace Hexagon; + + const MCInstrDesc &D = HII.get(Opc); + if (D.mayStore()) { + if (OpN == D.getNumOperands()-1) + return getUsedBitsInStore(Opc, Bits, Begin); + return false; + } + + switch (Opc) { + // One register source. Used bits: R1[0-7]. + case A2_sxtb: + case A2_zxtb: + case A4_cmpbeqi: + case A4_cmpbgti: + case A4_cmpbgtui: + if (OpN == 1) { + Bits.set(Begin, Begin+8); + return true; + } + break; + + // One register source. Used bits: R1[0-15]. 
+ case A2_aslh: + case A2_sxth: + case A2_zxth: + case A4_cmpheqi: + case A4_cmphgti: + case A4_cmphgtui: + if (OpN == 1) { + Bits.set(Begin, Begin+16); + return true; + } + break; + + // One register source. Used bits: R1[16-31]. + case A2_asrh: + if (OpN == 1) { + Bits.set(Begin+16, Begin+32); + return true; + } + break; + + // Two register sources. Used bits: R1[0-7], R2[0-7]. + case A4_cmpbeq: + case A4_cmpbgt: + case A4_cmpbgtu: + if (OpN == 1) { + Bits.set(Begin, Begin+8); + return true; + } + break; + + // Two register sources. Used bits: R1[0-15], R2[0-15]. + case A4_cmpheq: + case A4_cmphgt: + case A4_cmphgtu: + case A2_addh_h16_ll: + case A2_addh_h16_sat_ll: + case A2_addh_l16_ll: + case A2_addh_l16_sat_ll: + case A2_combine_ll: + case A2_subh_h16_ll: + case A2_subh_h16_sat_ll: + case A2_subh_l16_ll: + case A2_subh_l16_sat_ll: + case M2_mpy_acc_ll_s0: + case M2_mpy_acc_ll_s1: + case M2_mpy_acc_sat_ll_s0: + case M2_mpy_acc_sat_ll_s1: + case M2_mpy_ll_s0: + case M2_mpy_ll_s1: + case M2_mpy_nac_ll_s0: + case M2_mpy_nac_ll_s1: + case M2_mpy_nac_sat_ll_s0: + case M2_mpy_nac_sat_ll_s1: + case M2_mpy_rnd_ll_s0: + case M2_mpy_rnd_ll_s1: + case M2_mpy_sat_ll_s0: + case M2_mpy_sat_ll_s1: + case M2_mpy_sat_rnd_ll_s0: + case M2_mpy_sat_rnd_ll_s1: + case M2_mpyd_acc_ll_s0: + case M2_mpyd_acc_ll_s1: + case M2_mpyd_ll_s0: + case M2_mpyd_ll_s1: + case M2_mpyd_nac_ll_s0: + case M2_mpyd_nac_ll_s1: + case M2_mpyd_rnd_ll_s0: + case M2_mpyd_rnd_ll_s1: + case M2_mpyu_acc_ll_s0: + case M2_mpyu_acc_ll_s1: + case M2_mpyu_ll_s0: + case M2_mpyu_ll_s1: + case M2_mpyu_nac_ll_s0: + case M2_mpyu_nac_ll_s1: + case M2_mpyud_acc_ll_s0: + case M2_mpyud_acc_ll_s1: + case M2_mpyud_ll_s0: + case M2_mpyud_ll_s1: + case M2_mpyud_nac_ll_s0: + case M2_mpyud_nac_ll_s1: + if (OpN == 1 || OpN == 2) { + Bits.set(Begin, Begin+16); + return true; + } + break; + + // Two register sources. Used bits: R1[0-15], R2[16-31]. + case A2_addh_h16_lh: + case A2_addh_h16_sat_lh: + case A2_combine_lh: + case A2_subh_h16_lh: + case A2_subh_h16_sat_lh: + case M2_mpy_acc_lh_s0: + case M2_mpy_acc_lh_s1: + case M2_mpy_acc_sat_lh_s0: + case M2_mpy_acc_sat_lh_s1: + case M2_mpy_lh_s0: + case M2_mpy_lh_s1: + case M2_mpy_nac_lh_s0: + case M2_mpy_nac_lh_s1: + case M2_mpy_nac_sat_lh_s0: + case M2_mpy_nac_sat_lh_s1: + case M2_mpy_rnd_lh_s0: + case M2_mpy_rnd_lh_s1: + case M2_mpy_sat_lh_s0: + case M2_mpy_sat_lh_s1: + case M2_mpy_sat_rnd_lh_s0: + case M2_mpy_sat_rnd_lh_s1: + case M2_mpyd_acc_lh_s0: + case M2_mpyd_acc_lh_s1: + case M2_mpyd_lh_s0: + case M2_mpyd_lh_s1: + case M2_mpyd_nac_lh_s0: + case M2_mpyd_nac_lh_s1: + case M2_mpyd_rnd_lh_s0: + case M2_mpyd_rnd_lh_s1: + case M2_mpyu_acc_lh_s0: + case M2_mpyu_acc_lh_s1: + case M2_mpyu_lh_s0: + case M2_mpyu_lh_s1: + case M2_mpyu_nac_lh_s0: + case M2_mpyu_nac_lh_s1: + case M2_mpyud_acc_lh_s0: + case M2_mpyud_acc_lh_s1: + case M2_mpyud_lh_s0: + case M2_mpyud_lh_s1: + case M2_mpyud_nac_lh_s0: + case M2_mpyud_nac_lh_s1: + // These four are actually LH. + case A2_addh_l16_hl: + case A2_addh_l16_sat_hl: + case A2_subh_l16_hl: + case A2_subh_l16_sat_hl: + if (OpN == 1) { + Bits.set(Begin, Begin+16); + return true; + } + if (OpN == 2) { + Bits.set(Begin+16, Begin+32); + return true; + } + break; + + // Two register sources, used bits: R1[16-31], R2[0-15]. 
+ case A2_addh_h16_hl: + case A2_addh_h16_sat_hl: + case A2_combine_hl: + case A2_subh_h16_hl: + case A2_subh_h16_sat_hl: + case M2_mpy_acc_hl_s0: + case M2_mpy_acc_hl_s1: + case M2_mpy_acc_sat_hl_s0: + case M2_mpy_acc_sat_hl_s1: + case M2_mpy_hl_s0: + case M2_mpy_hl_s1: + case M2_mpy_nac_hl_s0: + case M2_mpy_nac_hl_s1: + case M2_mpy_nac_sat_hl_s0: + case M2_mpy_nac_sat_hl_s1: + case M2_mpy_rnd_hl_s0: + case M2_mpy_rnd_hl_s1: + case M2_mpy_sat_hl_s0: + case M2_mpy_sat_hl_s1: + case M2_mpy_sat_rnd_hl_s0: + case M2_mpy_sat_rnd_hl_s1: + case M2_mpyd_acc_hl_s0: + case M2_mpyd_acc_hl_s1: + case M2_mpyd_hl_s0: + case M2_mpyd_hl_s1: + case M2_mpyd_nac_hl_s0: + case M2_mpyd_nac_hl_s1: + case M2_mpyd_rnd_hl_s0: + case M2_mpyd_rnd_hl_s1: + case M2_mpyu_acc_hl_s0: + case M2_mpyu_acc_hl_s1: + case M2_mpyu_hl_s0: + case M2_mpyu_hl_s1: + case M2_mpyu_nac_hl_s0: + case M2_mpyu_nac_hl_s1: + case M2_mpyud_acc_hl_s0: + case M2_mpyud_acc_hl_s1: + case M2_mpyud_hl_s0: + case M2_mpyud_hl_s1: + case M2_mpyud_nac_hl_s0: + case M2_mpyud_nac_hl_s1: + if (OpN == 1) { + Bits.set(Begin+16, Begin+32); + return true; + } + if (OpN == 2) { + Bits.set(Begin, Begin+16); + return true; + } + break; + + // Two register sources, used bits: R1[16-31], R2[16-31]. + case A2_addh_h16_hh: + case A2_addh_h16_sat_hh: + case A2_combine_hh: + case A2_subh_h16_hh: + case A2_subh_h16_sat_hh: + case M2_mpy_acc_hh_s0: + case M2_mpy_acc_hh_s1: + case M2_mpy_acc_sat_hh_s0: + case M2_mpy_acc_sat_hh_s1: + case M2_mpy_hh_s0: + case M2_mpy_hh_s1: + case M2_mpy_nac_hh_s0: + case M2_mpy_nac_hh_s1: + case M2_mpy_nac_sat_hh_s0: + case M2_mpy_nac_sat_hh_s1: + case M2_mpy_rnd_hh_s0: + case M2_mpy_rnd_hh_s1: + case M2_mpy_sat_hh_s0: + case M2_mpy_sat_hh_s1: + case M2_mpy_sat_rnd_hh_s0: + case M2_mpy_sat_rnd_hh_s1: + case M2_mpyd_acc_hh_s0: + case M2_mpyd_acc_hh_s1: + case M2_mpyd_hh_s0: + case M2_mpyd_hh_s1: + case M2_mpyd_nac_hh_s0: + case M2_mpyd_nac_hh_s1: + case M2_mpyd_rnd_hh_s0: + case M2_mpyd_rnd_hh_s1: + case M2_mpyu_acc_hh_s0: + case M2_mpyu_acc_hh_s1: + case M2_mpyu_hh_s0: + case M2_mpyu_hh_s1: + case M2_mpyu_nac_hh_s0: + case M2_mpyu_nac_hh_s1: + case M2_mpyud_acc_hh_s0: + case M2_mpyud_acc_hh_s1: + case M2_mpyud_hh_s0: + case M2_mpyud_hh_s1: + case M2_mpyud_nac_hh_s0: + case M2_mpyud_nac_hh_s1: + if (OpN == 1 || OpN == 2) { + Bits.set(Begin+16, Begin+32); + return true; + } + break; + } + + return false; +} + + +// Calculate the register class that matches Reg:Sub. For example, if +// vreg1 is a double register, then vreg1:subreg_hireg would match "int" +// register class. +const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass( + const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(RR.Reg)) + return nullptr; + auto *RC = MRI.getRegClass(RR.Reg); + if (RR.Sub == 0) + return RC; + + auto VerifySR = [] (unsigned Sub) -> void { + assert(Sub == Hexagon::subreg_hireg || Sub == Hexagon::subreg_loreg); + }; + + switch (RC->getID()) { + case Hexagon::DoubleRegsRegClassID: + VerifySR(RR.Sub); + return &Hexagon::IntRegsRegClass; + } + return nullptr; +} + + +// Check if RD could be replaced with RS at any possible use of RD. +// For example a predicate register cannot be replaced with a integer +// register, but a 64-bit register with a subregister can be replaced +// with a 32-bit register. 
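// A standalone sketch of the rule described above: a copy is "transparent" only
// when both sides resolve to the same effective register class once
// sub-registers are accounted for. The enum values are illustrative, not the
// Hexagon register classes themselves, and classes with no valid final class
// are ignored for brevity:
enum class RCSketch { Int32, Double64, Pred };

static RCSketch finalClassSketch(RCSketch C, bool HasSubReg) {
  // A lo/hi sub-register of a 64-bit pair behaves as a 32-bit register.
  if (C == RCSketch::Double64 && HasSubReg)
    return RCSketch::Int32;
  return C;
}

static bool isTransparentCopySketch(RCSketch Dst, bool DstSub, RCSketch Src,
                                    bool SrcSub) {
  return finalClassSketch(Dst, DstSub) == finalClassSketch(Src, SrcSub);
}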
+bool HexagonBitSimplify::isTransparentCopy(const BitTracker::RegisterRef &RD, + const BitTracker::RegisterRef &RS, MachineRegisterInfo &MRI) { + if (!TargetRegisterInfo::isVirtualRegister(RD.Reg) || + !TargetRegisterInfo::isVirtualRegister(RS.Reg)) + return false; + // Return false if one (or both) classes are nullptr. + auto *DRC = getFinalVRegClass(RD, MRI); + if (!DRC) + return false; + + return DRC == getFinalVRegClass(RS, MRI); +} + + +// +// Dead code elimination +// +namespace { + class DeadCodeElimination { + public: + DeadCodeElimination(MachineFunction &mf, MachineDominatorTree &mdt) + : MF(mf), HII(*MF.getSubtarget<HexagonSubtarget>().getInstrInfo()), + MDT(mdt), MRI(mf.getRegInfo()) {} + + bool run() { + return runOnNode(MDT.getRootNode()); + } + + private: + bool isDead(unsigned R) const; + bool runOnNode(MachineDomTreeNode *N); + + MachineFunction &MF; + const HexagonInstrInfo &HII; + MachineDominatorTree &MDT; + MachineRegisterInfo &MRI; + }; +} + + +bool DeadCodeElimination::isDead(unsigned R) const { + for (auto I = MRI.use_begin(R), E = MRI.use_end(); I != E; ++I) { + MachineInstr *UseI = I->getParent(); + if (UseI->isDebugValue()) + continue; + if (UseI->isPHI()) { + assert(!UseI->getOperand(0).getSubReg()); + unsigned DR = UseI->getOperand(0).getReg(); + if (DR == R) + continue; + } + return false; + } + return true; +} + + +bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) { + bool Changed = false; + typedef GraphTraits<MachineDomTreeNode*> GTN; + for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) + Changed |= runOnNode(*I); + + MachineBasicBlock *B = N->getBlock(); + std::vector<MachineInstr*> Instrs; + for (auto I = B->rbegin(), E = B->rend(); I != E; ++I) + Instrs.push_back(&*I); + + for (auto MI : Instrs) { + unsigned Opc = MI->getOpcode(); + // Do not touch lifetime markers. This is why the target-independent DCE + // cannot be used. + if (Opc == TargetOpcode::LIFETIME_START || + Opc == TargetOpcode::LIFETIME_END) + continue; + bool Store = false; + if (MI->isInlineAsm()) + continue; + // Delete PHIs if possible. + if (!MI->isPHI() && !MI->isSafeToMove(nullptr, Store)) + continue; + + bool AllDead = true; + SmallVector<unsigned,2> Regs; + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isDef()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R) || !isDead(R)) { + AllDead = false; + break; + } + Regs.push_back(R); + } + if (!AllDead) + continue; + + B->erase(MI); + for (unsigned i = 0, n = Regs.size(); i != n; ++i) + MRI.markUsesInDebugValueAsUndef(Regs[i]); + Changed = true; + } + + return Changed; +} + + +// +// Eliminate redundant instructions +// +// This transformation will identify instructions where the output register +// is the same as one of its input registers. This only works on instructions +// that define a single register (unlike post-increment loads, for example). +// The equality check is actually more detailed: the code calculates which +// bits of the output are used, and only compares these bits with the input +// registers. +// If the output matches an input, the instruction is replaced with COPY. +// The copies will be removed by another transformation. 
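// A much-simplified, standalone illustration of the check this transformation
// performs: if every bit of the output that is ever consumed is provably equal
// to the corresponding bit of one input, the defining instruction can be
// rewritten as a COPY. The real pass compares symbolic BitTracker cells; plain
// 32-bit masks stand in for them here:
#include <cstdint>

static bool outputRedundantWithInputSketch(uint32_t DefBits, uint32_t SrcBits,
                                           uint32_t UsedMask) {
  // Example: for "Rd = and(Rs, #0xffff)" whose users only read Rd's low
  // halfword, UsedMask is 0x0000ffff and Rd agrees with Rs on all used bits.
  return ((DefBits ^ SrcBits) & UsedMask) == 0;
}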
+namespace { + class RedundantInstrElimination : public Transformation { + public: + RedundantInstrElimination(BitTracker &bt, const HexagonInstrInfo &hii, + MachineRegisterInfo &mri) + : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + private: + bool isLossyShiftLeft(const MachineInstr &MI, unsigned OpN, + unsigned &LostB, unsigned &LostE); + bool isLossyShiftRight(const MachineInstr &MI, unsigned OpN, + unsigned &LostB, unsigned &LostE); + bool computeUsedBits(unsigned Reg, BitVector &Bits); + bool computeUsedBits(const MachineInstr &MI, unsigned OpN, BitVector &Bits, + uint16_t Begin); + bool usedBitsEqual(BitTracker::RegisterRef RD, BitTracker::RegisterRef RS); + + const HexagonInstrInfo &HII; + MachineRegisterInfo &MRI; + BitTracker &BT; + }; +} + + +// Check if the instruction is a lossy shift left, where the input being +// shifted is the operand OpN of MI. If true, [LostB, LostE) is the range +// of bit indices that are lost. +bool RedundantInstrElimination::isLossyShiftLeft(const MachineInstr &MI, + unsigned OpN, unsigned &LostB, unsigned &LostE) { + using namespace Hexagon; + unsigned Opc = MI.getOpcode(); + unsigned ImN, RegN, Width; + switch (Opc) { + case S2_asl_i_p: + ImN = 2; + RegN = 1; + Width = 64; + break; + case S2_asl_i_p_acc: + case S2_asl_i_p_and: + case S2_asl_i_p_nac: + case S2_asl_i_p_or: + case S2_asl_i_p_xacc: + ImN = 3; + RegN = 2; + Width = 64; + break; + case S2_asl_i_r: + ImN = 2; + RegN = 1; + Width = 32; + break; + case S2_addasl_rrri: + case S4_andi_asl_ri: + case S4_ori_asl_ri: + case S4_addi_asl_ri: + case S4_subi_asl_ri: + case S2_asl_i_r_acc: + case S2_asl_i_r_and: + case S2_asl_i_r_nac: + case S2_asl_i_r_or: + case S2_asl_i_r_sat: + case S2_asl_i_r_xacc: + ImN = 3; + RegN = 2; + Width = 32; + break; + default: + return false; + } + + if (RegN != OpN) + return false; + + assert(MI.getOperand(ImN).isImm()); + unsigned S = MI.getOperand(ImN).getImm(); + if (S == 0) + return false; + LostB = Width-S; + LostE = Width; + return true; +} + + +// Check if the instruction is a lossy shift right, where the input being +// shifted is the operand OpN of MI. If true, [LostB, LostE) is the range +// of bit indices that are lost. +bool RedundantInstrElimination::isLossyShiftRight(const MachineInstr &MI, + unsigned OpN, unsigned &LostB, unsigned &LostE) { + using namespace Hexagon; + unsigned Opc = MI.getOpcode(); + unsigned ImN, RegN; + switch (Opc) { + case S2_asr_i_p: + case S2_lsr_i_p: + ImN = 2; + RegN = 1; + break; + case S2_asr_i_p_acc: + case S2_asr_i_p_and: + case S2_asr_i_p_nac: + case S2_asr_i_p_or: + case S2_lsr_i_p_acc: + case S2_lsr_i_p_and: + case S2_lsr_i_p_nac: + case S2_lsr_i_p_or: + case S2_lsr_i_p_xacc: + ImN = 3; + RegN = 2; + break; + case S2_asr_i_r: + case S2_lsr_i_r: + ImN = 2; + RegN = 1; + break; + case S4_andi_lsr_ri: + case S4_ori_lsr_ri: + case S4_addi_lsr_ri: + case S4_subi_lsr_ri: + case S2_asr_i_r_acc: + case S2_asr_i_r_and: + case S2_asr_i_r_nac: + case S2_asr_i_r_or: + case S2_lsr_i_r_acc: + case S2_lsr_i_r_and: + case S2_lsr_i_r_nac: + case S2_lsr_i_r_or: + case S2_lsr_i_r_xacc: + ImN = 3; + RegN = 2; + break; + + default: + return false; + } + + if (RegN != OpN) + return false; + + assert(MI.getOperand(ImN).isImm()); + unsigned S = MI.getOperand(ImN).getImm(); + LostB = 0; + LostE = S; + return true; +} + + +// Calculate the bit vector that corresponds to the used bits of register Reg. 
+// The vector Bits has the same size, as the size of Reg in bits. If the cal- +// culation fails (i.e. the used bits are unknown), it returns false. Other- +// wise, it returns true and sets the corresponding bits in Bits. +bool RedundantInstrElimination::computeUsedBits(unsigned Reg, BitVector &Bits) { + BitVector Used(Bits.size()); + RegisterSet Visited; + std::vector<unsigned> Pending; + Pending.push_back(Reg); + + for (unsigned i = 0; i < Pending.size(); ++i) { + unsigned R = Pending[i]; + if (Visited.has(R)) + continue; + Visited.insert(R); + for (auto I = MRI.use_begin(R), E = MRI.use_end(); I != E; ++I) { + BitTracker::RegisterRef UR = *I; + unsigned B, W; + if (!HBS::getSubregMask(UR, B, W, MRI)) + return false; + MachineInstr &UseI = *I->getParent(); + if (UseI.isPHI() || UseI.isCopy()) { + unsigned DefR = UseI.getOperand(0).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(DefR)) + return false; + Pending.push_back(DefR); + } else { + if (!computeUsedBits(UseI, I.getOperandNo(), Used, B)) + return false; + } + } + } + Bits |= Used; + return true; +} + + +// Calculate the bits used by instruction MI in a register in operand OpN. +// Return true/false if the calculation succeeds/fails. If is succeeds, set +// used bits in Bits. This function does not reset any bits in Bits, so +// subsequent calls over different instructions will result in the union +// of the used bits in all these instructions. +// The register in question may be used with a sub-register, whereas Bits +// holds the bits for the entire register. To keep track of that, the +// argument Begin indicates where in Bits is the lowest-significant bit +// of the register used in operand OpN. For example, in instruction: +// vreg1 = S2_lsr_i_r vreg2:subreg_hireg, 10 +// the operand 1 is a 32-bit register, which happens to be a subregister +// of the 64-bit register vreg2, and that subregister starts at position 32. +// In this case Begin=32, since Bits[32] would be the lowest-significant bit +// of vreg2:subreg_hireg. +bool RedundantInstrElimination::computeUsedBits(const MachineInstr &MI, + unsigned OpN, BitVector &Bits, uint16_t Begin) { + unsigned Opc = MI.getOpcode(); + BitVector T(Bits.size()); + bool GotBits = HBS::getUsedBits(Opc, OpN, T, Begin, HII); + // Even if we don't have bits yet, we could still provide some information + // if the instruction is a lossy shift: the lost bits will be marked as + // not used. + unsigned LB, LE; + if (isLossyShiftLeft(MI, OpN, LB, LE) || isLossyShiftRight(MI, OpN, LB, LE)) { + assert(MI.getOperand(OpN).isReg()); + BitTracker::RegisterRef RR = MI.getOperand(OpN); + const TargetRegisterClass *RC = HBS::getFinalVRegClass(RR, MRI); + uint16_t Width = RC->getSize()*8; + + if (!GotBits) + T.set(Begin, Begin+Width); + assert(LB <= LE && LB < Width && LE <= Width); + T.reset(Begin+LB, Begin+LE); + GotBits = true; + } + if (GotBits) + Bits |= T; + return GotBits; +} + + +// Calculates the used bits in RD ("defined register"), and checks if these +// bits in RS ("used register") and RD are identical. 
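A standalone sketch of the two points above, the Begin offset for subregisters and the not-used bits of a lossy right shift (illustration only; the helper name is made up):

#include <bitset>

// A 32-bit shift right by S reads bits [S,32) of its register operand; the
// low S bits are shifted out and therefore not used. Begin places the 32-bit
// operand at the right offset inside the 64-bit parent's bit vector.
static std::bitset<64> usedBitsOfRightShiftOperand(unsigned S, unsigned Begin) {
  std::bitset<64> Used;
  for (unsigned i = S; i < 32; ++i)
    Used.set(Begin + i);
  return Used;
}

// For "vreg1 = S2_lsr_i_r vreg2:subreg_hireg, 10" the operand starts at
// Begin = 32, so usedBitsOfRightShiftOperand(10, 32) marks bits [42,64)
// of vreg2 as used and leaves bits [32,42) unused.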
+bool RedundantInstrElimination::usedBitsEqual(BitTracker::RegisterRef RD, + BitTracker::RegisterRef RS) { + const BitTracker::RegisterCell &DC = BT.lookup(RD.Reg); + const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); + + unsigned DB, DW; + if (!HBS::getSubregMask(RD, DB, DW, MRI)) + return false; + unsigned SB, SW; + if (!HBS::getSubregMask(RS, SB, SW, MRI)) + return false; + if (SW != DW) + return false; + + BitVector Used(DC.width()); + if (!computeUsedBits(RD.Reg, Used)) + return false; + + for (unsigned i = 0; i != DW; ++i) + if (Used[i+DB] && DC[DB+i] != SC[SB+i]) + return false; + return true; +} + + +bool RedundantInstrElimination::processBlock(MachineBasicBlock &B, + const RegisterSet&) { + bool Changed = false; + + for (auto I = B.begin(), E = B.end(), NextI = I; I != E; ++I) { + NextI = std::next(I); + MachineInstr *MI = &*I; + + if (MI->getOpcode() == TargetOpcode::COPY) + continue; + if (MI->hasUnmodeledSideEffects() || MI->isInlineAsm()) + continue; + unsigned NumD = MI->getDesc().getNumDefs(); + if (NumD != 1) + continue; + + BitTracker::RegisterRef RD = MI->getOperand(0); + if (!BT.has(RD.Reg)) + continue; + const BitTracker::RegisterCell &DC = BT.lookup(RD.Reg); + + // Find a source operand that is equal to the result. + for (auto &Op : MI->uses()) { + if (!Op.isReg()) + continue; + BitTracker::RegisterRef RS = Op; + if (!BT.has(RS.Reg)) + continue; + if (!HBS::isTransparentCopy(RD, RS, MRI)) + continue; + + unsigned BN, BW; + if (!HBS::getSubregMask(RS, BN, BW, MRI)) + continue; + + const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); + if (!usedBitsEqual(RD, RS) && !HBS::isEqual(DC, 0, SC, BN, BW)) + continue; + + // If found, replace the instruction with a COPY. + DebugLoc DL = MI->getDebugLoc(); + const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI); + unsigned NewR = MRI.createVirtualRegister(FRC); + BuildMI(B, I, DL, HII.get(TargetOpcode::COPY), NewR) + .addReg(RS.Reg, 0, RS.Sub); + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), SC); + Changed = true; + break; + } + } + + return Changed; +} + + +// +// Const generation +// +// Recognize instructions that produce constant values known at compile-time. +// Replace them with register definitions that load these constants directly. 
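As a worked example of the selection performed by genTfrConst below for a 64-bit constant (sketch only; the immediate-range checks are simplified and the strings merely name the opcodes used by that function):

#include <cstdint>
#include <string>

static std::string pickTfrConstOpcode64(int64_t C) {
  auto FitsS8 = [](int64_t V) { return V >= -128 && V <= 127; };
  if (FitsS8(C))
    return "A2_tfrpi";                        // one transfer-immediate
  int64_t Lo = C & 0xFFFFFFFF;                // unsigned low half
  int64_t Hi = (C >> 32) & 0xFFFFFFFF;        // unsigned high half
  if (FitsS8(Lo) || FitsS8(Hi))
    return FitsS8(Lo) ? "A2_combineii" : "A4_combineii";
  return "CONST64_Int_Real";                  // 64-bit constant load
}

// Example: C = 0x0000000500000003 has Hi = 5 and Lo = 3, so it is produced
// by a combine of two small immediates rather than a 64-bit constant load.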
+namespace { + class ConstGeneration : public Transformation { + public: + ConstGeneration(BitTracker &bt, const HexagonInstrInfo &hii, + MachineRegisterInfo &mri) + : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + private: + bool isTfrConst(const MachineInstr *MI) const; + bool isConst(unsigned R, int64_t &V) const; + unsigned genTfrConst(const TargetRegisterClass *RC, int64_t C, + MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL); + + const HexagonInstrInfo &HII; + MachineRegisterInfo &MRI; + BitTracker &BT; + }; +} + +bool ConstGeneration::isConst(unsigned R, int64_t &C) const { + if (!BT.has(R)) + return false; + const BitTracker::RegisterCell &RC = BT.lookup(R); + int64_t T = 0; + for (unsigned i = RC.width(); i > 0; --i) { + const BitTracker::BitValue &V = RC[i-1]; + T <<= 1; + if (V.is(1)) + T |= 1; + else if (!V.is(0)) + return false; + } + C = T; + return true; +} + + +bool ConstGeneration::isTfrConst(const MachineInstr *MI) const { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::A2_combineii: + case Hexagon::A4_combineii: + case Hexagon::A2_tfrsi: + case Hexagon::A2_tfrpi: + case Hexagon::TFR_PdTrue: + case Hexagon::TFR_PdFalse: + case Hexagon::CONST32_Int_Real: + case Hexagon::CONST64_Int_Real: + return true; + } + return false; +} + + +// Generate a transfer-immediate instruction that is appropriate for the +// register class and the actual value being transferred. +unsigned ConstGeneration::genTfrConst(const TargetRegisterClass *RC, int64_t C, + MachineBasicBlock &B, MachineBasicBlock::iterator At, DebugLoc &DL) { + unsigned Reg = MRI.createVirtualRegister(RC); + if (RC == &Hexagon::IntRegsRegClass) { + BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrsi), Reg) + .addImm(int32_t(C)); + return Reg; + } + + if (RC == &Hexagon::DoubleRegsRegClass) { + if (isInt<8>(C)) { + BuildMI(B, At, DL, HII.get(Hexagon::A2_tfrpi), Reg) + .addImm(C); + return Reg; + } + + unsigned Lo = Lo_32(C), Hi = Hi_32(C); + if (isInt<8>(Lo) || isInt<8>(Hi)) { + unsigned Opc = isInt<8>(Lo) ? Hexagon::A2_combineii + : Hexagon::A4_combineii; + BuildMI(B, At, DL, HII.get(Opc), Reg) + .addImm(int32_t(Hi)) + .addImm(int32_t(Lo)); + return Reg; + } + + BuildMI(B, At, DL, HII.get(Hexagon::CONST64_Int_Real), Reg) + .addImm(C); + return Reg; + } + + if (RC == &Hexagon::PredRegsRegClass) { + unsigned Opc; + if (C == 0) + Opc = Hexagon::TFR_PdFalse; + else if ((C & 0xFF) == 0xFF) + Opc = Hexagon::TFR_PdTrue; + else + return 0; + BuildMI(B, At, DL, HII.get(Opc), Reg); + return Reg; + } + + return 0; +} + + +bool ConstGeneration::processBlock(MachineBasicBlock &B, const RegisterSet&) { + bool Changed = false; + RegisterSet Defs; + + for (auto I = B.begin(), E = B.end(); I != E; ++I) { + if (isTfrConst(I)) + continue; + Defs.clear(); + HBS::getInstrDefs(*I, Defs); + if (Defs.count() != 1) + continue; + unsigned DR = Defs.find_first(); + if (!TargetRegisterInfo::isVirtualRegister(DR)) + continue; + int64_t C; + if (isConst(DR, C)) { + DebugLoc DL = I->getDebugLoc(); + auto At = I->isPHI() ? B.getFirstNonPHI() : I; + unsigned ImmReg = genTfrConst(MRI.getRegClass(DR), C, B, At, DL); + if (ImmReg) { + HBS::replaceReg(DR, ImmReg, MRI); + BT.put(ImmReg, BT.lookup(DR)); + Changed = true; + } + } + } + return Changed; +} + + +// +// Copy generation +// +// Identify pairs of available registers which hold identical values. 
+// In such cases, only one of them needs to be calculated, the other one +// will be defined as a copy of the first. +// +// Copy propagation +// +// Eliminate register copies RD = RS, by replacing the uses of RD with +// with uses of RS. +namespace { + class CopyGeneration : public Transformation { + public: + CopyGeneration(BitTracker &bt, const HexagonInstrInfo &hii, + MachineRegisterInfo &mri) + : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + private: + bool findMatch(const BitTracker::RegisterRef &Inp, + BitTracker::RegisterRef &Out, const RegisterSet &AVs); + + const HexagonInstrInfo &HII; + MachineRegisterInfo &MRI; + BitTracker &BT; + }; + + class CopyPropagation : public Transformation { + public: + CopyPropagation(const HexagonRegisterInfo &hri, MachineRegisterInfo &mri) + : Transformation(false), MRI(mri) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + static bool isCopyReg(unsigned Opc); + private: + bool propagateRegCopy(MachineInstr &MI); + + MachineRegisterInfo &MRI; + }; + +} + + +/// Check if there is a register in AVs that is identical to Inp. If so, +/// set Out to the found register. The output may be a pair Reg:Sub. +bool CopyGeneration::findMatch(const BitTracker::RegisterRef &Inp, + BitTracker::RegisterRef &Out, const RegisterSet &AVs) { + if (!BT.has(Inp.Reg)) + return false; + const BitTracker::RegisterCell &InpRC = BT.lookup(Inp.Reg); + unsigned B, W; + if (!HBS::getSubregMask(Inp, B, W, MRI)) + return false; + + for (unsigned R = AVs.find_first(); R; R = AVs.find_next(R)) { + if (!BT.has(R) || !HBS::isTransparentCopy(R, Inp, MRI)) + continue; + const BitTracker::RegisterCell &RC = BT.lookup(R); + unsigned RW = RC.width(); + if (W == RW) { + if (MRI.getRegClass(Inp.Reg) != MRI.getRegClass(R)) + continue; + if (!HBS::isEqual(InpRC, B, RC, 0, W)) + continue; + Out.Reg = R; + Out.Sub = 0; + return true; + } + // Check if there is a super-register, whose part (with a subregister) + // is equal to the input. + // Only do double registers for now. + if (W*2 != RW) + continue; + if (MRI.getRegClass(R) != &Hexagon::DoubleRegsRegClass) + continue; + + if (HBS::isEqual(InpRC, B, RC, 0, W)) + Out.Sub = Hexagon::subreg_loreg; + else if (HBS::isEqual(InpRC, B, RC, W, W)) + Out.Sub = Hexagon::subreg_hireg; + else + continue; + Out.Reg = R; + return true; + } + return false; +} + + +bool CopyGeneration::processBlock(MachineBasicBlock &B, + const RegisterSet &AVs) { + RegisterSet AVB(AVs); + bool Changed = false; + RegisterSet Defs; + + for (auto I = B.begin(), E = B.end(), NextI = I; I != E; + ++I, AVB.insert(Defs)) { + NextI = std::next(I); + Defs.clear(); + HBS::getInstrDefs(*I, Defs); + + unsigned Opc = I->getOpcode(); + if (CopyPropagation::isCopyReg(Opc)) + continue; + + for (unsigned R = Defs.find_first(); R; R = Defs.find_next(R)) { + BitTracker::RegisterRef MR; + if (!findMatch(R, MR, AVB)) + continue; + DebugLoc DL = I->getDebugLoc(); + auto *FRC = HBS::getFinalVRegClass(MR, MRI); + unsigned NewR = MRI.createVirtualRegister(FRC); + auto At = I->isPHI() ? 
B.getFirstNonPHI() : I; + BuildMI(B, At, DL, HII.get(TargetOpcode::COPY), NewR) + .addReg(MR.Reg, 0, MR.Sub); + BT.put(BitTracker::RegisterRef(NewR), BT.get(MR)); + } + } + + return Changed; +} + + +bool CopyPropagation::isCopyReg(unsigned Opc) { + switch (Opc) { + case TargetOpcode::COPY: + case TargetOpcode::REG_SEQUENCE: + case Hexagon::A2_tfr: + case Hexagon::A2_tfrp: + case Hexagon::A2_combinew: + case Hexagon::A4_combineir: + case Hexagon::A4_combineri: + return true; + default: + break; + } + return false; +} + + +bool CopyPropagation::propagateRegCopy(MachineInstr &MI) { + bool Changed = false; + unsigned Opc = MI.getOpcode(); + BitTracker::RegisterRef RD = MI.getOperand(0); + assert(MI.getOperand(0).getSubReg() == 0); + + switch (Opc) { + case TargetOpcode::COPY: + case Hexagon::A2_tfr: + case Hexagon::A2_tfrp: { + BitTracker::RegisterRef RS = MI.getOperand(1); + if (!HBS::isTransparentCopy(RD, RS, MRI)) + break; + if (RS.Sub != 0) + Changed = HBS::replaceRegWithSub(RD.Reg, RS.Reg, RS.Sub, MRI); + else + Changed = HBS::replaceReg(RD.Reg, RS.Reg, MRI); + break; + } + case TargetOpcode::REG_SEQUENCE: { + BitTracker::RegisterRef SL, SH; + if (HBS::parseRegSequence(MI, SL, SH)) { + Changed = HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_loreg, + SL.Reg, SL.Sub, MRI); + Changed |= HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_hireg, + SH.Reg, SH.Sub, MRI); + } + break; + } + case Hexagon::A2_combinew: { + BitTracker::RegisterRef RH = MI.getOperand(1), RL = MI.getOperand(2); + Changed = HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_loreg, + RL.Reg, RL.Sub, MRI); + Changed |= HBS::replaceSubWithSub(RD.Reg, Hexagon::subreg_hireg, + RH.Reg, RH.Sub, MRI); + break; + } + case Hexagon::A4_combineir: + case Hexagon::A4_combineri: { + unsigned SrcX = (Opc == Hexagon::A4_combineir) ? 2 : 1; + unsigned Sub = (Opc == Hexagon::A4_combineir) ? Hexagon::subreg_loreg + : Hexagon::subreg_hireg; + BitTracker::RegisterRef RS = MI.getOperand(SrcX); + Changed = HBS::replaceSubWithSub(RD.Reg, Sub, RS.Reg, RS.Sub, MRI); + break; + } + } + return Changed; +} + + +bool CopyPropagation::processBlock(MachineBasicBlock &B, const RegisterSet&) { + std::vector<MachineInstr*> Instrs; + for (auto I = B.rbegin(), E = B.rend(); I != E; ++I) + Instrs.push_back(&*I); + + bool Changed = false; + for (auto I : Instrs) { + unsigned Opc = I->getOpcode(); + if (!CopyPropagation::isCopyReg(Opc)) + continue; + Changed |= propagateRegCopy(*I); + } + + return Changed; +} + + +// +// Bit simplification +// +// Recognize patterns that can be simplified and replace them with the +// simpler forms. +// This is by no means complete +namespace { + class BitSimplification : public Transformation { + public: + BitSimplification(BitTracker &bt, const HexagonInstrInfo &hii, + MachineRegisterInfo &mri) + : Transformation(true), HII(hii), MRI(mri), BT(bt) {} + bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override; + private: + struct RegHalf : public BitTracker::RegisterRef { + bool Low; // Low/High halfword. 
+ }; + + bool matchHalf(unsigned SelfR, const BitTracker::RegisterCell &RC, + unsigned B, RegHalf &RH); + + bool matchPackhl(unsigned SelfR, const BitTracker::RegisterCell &RC, + BitTracker::RegisterRef &Rs, BitTracker::RegisterRef &Rt); + unsigned getCombineOpcode(bool HLow, bool LLow); + + bool genStoreUpperHalf(MachineInstr *MI); + bool genStoreImmediate(MachineInstr *MI); + bool genPackhl(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + bool genExtractHalf(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + bool genCombineHalf(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + bool genExtractLow(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + bool simplifyTstbit(MachineInstr *MI, BitTracker::RegisterRef RD, + const BitTracker::RegisterCell &RC); + + const HexagonInstrInfo &HII; + MachineRegisterInfo &MRI; + BitTracker &BT; + }; +} + + +// Check if the bits [B..B+16) in register cell RC form a valid halfword, +// i.e. [0..16), [16..32), etc. of some register. If so, return true and +// set the information about the found register in RH. +bool BitSimplification::matchHalf(unsigned SelfR, + const BitTracker::RegisterCell &RC, unsigned B, RegHalf &RH) { + // XXX This could be searching in the set of available registers, in case + // the match is not exact. + + // Match 16-bit chunks, where the RC[B..B+15] references exactly one + // register and all the bits B..B+15 match between RC and the register. + // This is meant to match "v1[0-15]", where v1 = { [0]:0 [1-15]:v1... }, + // and RC = { [0]:0 [1-15]:v1[1-15]... }. + bool Low = false; + unsigned I = B; + while (I < B+16 && RC[I].num()) + I++; + if (I == B+16) + return false; + + unsigned Reg = RC[I].RefI.Reg; + unsigned P = RC[I].RefI.Pos; // The RefI.Pos will be advanced by I-B. + if (P < I-B) + return false; + unsigned Pos = P - (I-B); + + if (Reg == 0 || Reg == SelfR) // Don't match "self". + return false; + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return false; + if (!BT.has(Reg)) + return false; + + const BitTracker::RegisterCell &SC = BT.lookup(Reg); + if (Pos+16 > SC.width()) + return false; + + for (unsigned i = 0; i < 16; ++i) { + const BitTracker::BitValue &RV = RC[i+B]; + if (RV.Type == BitTracker::BitValue::Ref) { + if (RV.RefI.Reg != Reg) + return false; + if (RV.RefI.Pos != i+Pos) + return false; + continue; + } + if (RC[i+B] != SC[i+Pos]) + return false; + } + + unsigned Sub = 0; + switch (Pos) { + case 0: + Sub = Hexagon::subreg_loreg; + Low = true; + break; + case 16: + Sub = Hexagon::subreg_loreg; + Low = false; + break; + case 32: + Sub = Hexagon::subreg_hireg; + Low = true; + break; + case 48: + Sub = Hexagon::subreg_hireg; + Low = false; + break; + default: + return false; + } + + RH.Reg = Reg; + RH.Sub = Sub; + RH.Low = Low; + // If the subregister is not valid with the register, set it to 0. + if (!HBS::getFinalVRegClass(RH, MRI)) + RH.Sub = 0; + + return true; +} + + +// Check if RC matches the pattern of a S2_packhl. If so, return true and +// set the inputs Rs and Rt. 
+bool BitSimplification::matchPackhl(unsigned SelfR, + const BitTracker::RegisterCell &RC, BitTracker::RegisterRef &Rs, + BitTracker::RegisterRef &Rt) { + RegHalf L1, H1, L2, H2; + + if (!matchHalf(SelfR, RC, 0, L2) || !matchHalf(SelfR, RC, 16, L1)) + return false; + if (!matchHalf(SelfR, RC, 32, H2) || !matchHalf(SelfR, RC, 48, H1)) + return false; + + // Rs = H1.L1, Rt = H2.L2 + if (H1.Reg != L1.Reg || H1.Sub != L1.Sub || H1.Low || !L1.Low) + return false; + if (H2.Reg != L2.Reg || H2.Sub != L2.Sub || H2.Low || !L2.Low) + return false; + + Rs = H1; + Rt = H2; + return true; +} + + +unsigned BitSimplification::getCombineOpcode(bool HLow, bool LLow) { + return HLow ? LLow ? Hexagon::A2_combine_ll + : Hexagon::A2_combine_lh + : LLow ? Hexagon::A2_combine_hl + : Hexagon::A2_combine_hh; +} + + +// If MI stores the upper halfword of a register (potentially obtained via +// shifts or extracts), replace it with a storerf instruction. This could +// cause the "extraction" code to become dead. +bool BitSimplification::genStoreUpperHalf(MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + if (Opc != Hexagon::S2_storerh_io) + return false; + + MachineOperand &ValOp = MI->getOperand(2); + BitTracker::RegisterRef RS = ValOp; + if (!BT.has(RS.Reg)) + return false; + const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg); + RegHalf H; + if (!matchHalf(0, RC, 0, H)) + return false; + if (H.Low) + return false; + MI->setDesc(HII.get(Hexagon::S2_storerf_io)); + ValOp.setReg(H.Reg); + ValOp.setSubReg(H.Sub); + return true; +} + + +// If MI stores a value known at compile-time, and the value is within a range +// that avoids using constant-extenders, replace it with a store-immediate. +bool BitSimplification::genStoreImmediate(MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + unsigned Align = 0; + switch (Opc) { + case Hexagon::S2_storeri_io: + Align++; + case Hexagon::S2_storerh_io: + Align++; + case Hexagon::S2_storerb_io: + break; + default: + return false; + } + + // Avoid stores to frame-indices (due to an unknown offset). + if (!MI->getOperand(0).isReg()) + return false; + MachineOperand &OffOp = MI->getOperand(1); + if (!OffOp.isImm()) + return false; + + int64_t Off = OffOp.getImm(); + // Offset is u6:a. Sadly, there is no isShiftedUInt(n,x). + if (!isUIntN(6+Align, Off) || (Off & ((1<<Align)-1))) + return false; + // Source register: + BitTracker::RegisterRef RS = MI->getOperand(2); + if (!BT.has(RS.Reg)) + return false; + const BitTracker::RegisterCell &RC = BT.lookup(RS.Reg); + uint64_t U; + if (!HBS::getConst(RC, 0, RC.width(), U)) + return false; + + // Only consider 8-bit values to avoid constant-extenders. + int V; + switch (Opc) { + case Hexagon::S2_storerb_io: + V = int8_t(U); + break; + case Hexagon::S2_storerh_io: + V = int16_t(U); + break; + case Hexagon::S2_storeri_io: + V = int32_t(U); + break; + } + if (!isInt<8>(V)) + return false; + + MI->RemoveOperand(2); + switch (Opc) { + case Hexagon::S2_storerb_io: + MI->setDesc(HII.get(Hexagon::S4_storeirb_io)); + break; + case Hexagon::S2_storerh_io: + MI->setDesc(HII.get(Hexagon::S4_storeirh_io)); + break; + case Hexagon::S2_storeri_io: + MI->setDesc(HII.get(Hexagon::S4_storeiri_io)); + break; + } + MI->addOperand(MachineOperand::CreateImm(V)); + return true; +} + + +// If MI is equivalent o S2_packhl, generate the S2_packhl. MI could be the +// last instruction in a sequence that results in something equivalent to +// the pack-halfwords. The intent is to cause the entire sequence to become +// dead. 
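For reference, the value computed by S2_packhl as implied by the bit layout that matchPackhl above checks (editorial sketch; confirm against the Hexagon manual before relying on it):

#include <cstdint>

static uint64_t packhl(uint32_t Rs, uint32_t Rt) {
  uint64_t RsL = Rs & 0xFFFF, RsH = Rs >> 16;
  uint64_t RtL = Rt & 0xFFFF, RtH = Rt >> 16;
  // Bits [48,64) = Rs.H, [32,48) = Rt.H, [16,32) = Rs.L, [0,16) = Rt.L.
  return (RsH << 48) | (RtH << 32) | (RsL << 16) | RtL;
}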
+bool BitSimplification::genPackhl(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + unsigned Opc = MI->getOpcode(); + if (Opc == Hexagon::S2_packhl) + return false; + BitTracker::RegisterRef Rs, Rt; + if (!matchPackhl(RD.Reg, RC, Rs, Rt)) + return false; + + MachineBasicBlock &B = *MI->getParent(); + unsigned NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(B, MI, DL, HII.get(Hexagon::S2_packhl), NewR) + .addReg(Rs.Reg, 0, Rs.Sub) + .addReg(Rt.Reg, 0, Rt.Sub); + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), RC); + return true; +} + + +// If MI produces halfword of the input in the low half of the output, +// replace it with zero-extend or extractu. +bool BitSimplification::genExtractHalf(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + RegHalf L; + // Check for halfword in low 16 bits, zeros elsewhere. + if (!matchHalf(RD.Reg, RC, 0, L) || !HBS::isZero(RC, 16, 16)) + return false; + + unsigned Opc = MI->getOpcode(); + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + // Prefer zxth, since zxth can go in any slot, while extractu only in + // slots 2 and 3. + unsigned NewR = 0; + if (L.Low && Opc != Hexagon::A2_zxth) { + NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + BuildMI(B, MI, DL, HII.get(Hexagon::A2_zxth), NewR) + .addReg(L.Reg, 0, L.Sub); + } else if (!L.Low && Opc != Hexagon::S2_extractu) { + NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + BuildMI(B, MI, DL, HII.get(Hexagon::S2_extractu), NewR) + .addReg(L.Reg, 0, L.Sub) + .addImm(16) + .addImm(16); + } + if (NewR == 0) + return false; + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), RC); + return true; +} + + +// If MI is equivalent to a combine(.L/.H, .L/.H) replace with with the +// combine. +bool BitSimplification::genCombineHalf(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + RegHalf L, H; + // Check for combine h/l + if (!matchHalf(RD.Reg, RC, 0, L) || !matchHalf(RD.Reg, RC, 16, H)) + return false; + // Do nothing if this is just a reg copy. + if (L.Reg == H.Reg && L.Sub == H.Sub && !H.Low && L.Low) + return false; + + unsigned Opc = MI->getOpcode(); + unsigned COpc = getCombineOpcode(H.Low, L.Low); + if (COpc == Opc) + return false; + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + BuildMI(B, MI, DL, HII.get(COpc), NewR) + .addReg(H.Reg, 0, H.Sub) + .addReg(L.Reg, 0, L.Sub); + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), RC); + return true; +} + + +// If MI resets high bits of a register and keeps the lower ones, replace it +// with zero-extend byte/half, and-immediate, or extractu, as appropriate. 
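The opcode choice made by genExtractLow below, restated by itself for readability (sketch only; W is the number of live low bits and the strings name the opcodes used in that function):

#include <string>

static std::string pickExtractLowOpcode(unsigned W) {
  if (W == 8)  return "A2_zxtb";     // zero-extend byte
  if (W == 16) return "A2_zxth";     // zero-extend halfword
  if (W < 10)  return "A2_andir";    // and-immediate with (1 << W) - 1
  return "S2_extractu";              // extractu(Rs, #W, #0)
}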
+bool BitSimplification::genExtractLow(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::A2_zxtb: + case Hexagon::A2_zxth: + case Hexagon::S2_extractu: + return false; + } + if (Opc == Hexagon::A2_andir && MI->getOperand(2).isImm()) { + int32_t Imm = MI->getOperand(2).getImm(); + if (isInt<10>(Imm)) + return false; + } + + if (MI->hasUnmodeledSideEffects() || MI->isInlineAsm()) + return false; + unsigned W = RC.width(); + while (W > 0 && RC[W-1].is(0)) + W--; + if (W == 0 || W == RC.width()) + return false; + unsigned NewOpc = (W == 8) ? Hexagon::A2_zxtb + : (W == 16) ? Hexagon::A2_zxth + : (W < 10) ? Hexagon::A2_andir + : Hexagon::S2_extractu; + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + for (auto &Op : MI->uses()) { + if (!Op.isReg()) + continue; + BitTracker::RegisterRef RS = Op; + if (!BT.has(RS.Reg)) + continue; + const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); + unsigned BN, BW; + if (!HBS::getSubregMask(RS, BN, BW, MRI)) + continue; + if (BW < W || !HBS::isEqual(RC, 0, SC, BN, W)) + continue; + + unsigned NewR = MRI.createVirtualRegister(&Hexagon::IntRegsRegClass); + auto MIB = BuildMI(B, MI, DL, HII.get(NewOpc), NewR) + .addReg(RS.Reg, 0, RS.Sub); + if (NewOpc == Hexagon::A2_andir) + MIB.addImm((1 << W) - 1); + else if (NewOpc == Hexagon::S2_extractu) + MIB.addImm(W).addImm(0); + HBS::replaceSubWithSub(RD.Reg, RD.Sub, NewR, 0, MRI); + BT.put(BitTracker::RegisterRef(NewR), RC); + return true; + } + return false; +} + + +// Check for tstbit simplification opportunity, where the bit being checked +// can be tracked back to another register. For example: +// vreg2 = S2_lsr_i_r vreg1, 5 +// vreg3 = S2_tstbit_i vreg2, 0 +// => +// vreg3 = S2_tstbit_i vreg1, 5 +bool BitSimplification::simplifyTstbit(MachineInstr *MI, + BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) { + unsigned Opc = MI->getOpcode(); + if (Opc != Hexagon::S2_tstbit_i) + return false; + + unsigned BN = MI->getOperand(2).getImm(); + BitTracker::RegisterRef RS = MI->getOperand(1); + unsigned F, W; + DebugLoc DL = MI->getDebugLoc(); + if (!BT.has(RS.Reg) || !HBS::getSubregMask(RS, F, W, MRI)) + return false; + MachineBasicBlock &B = *MI->getParent(); + + const BitTracker::RegisterCell &SC = BT.lookup(RS.Reg); + const BitTracker::BitValue &V = SC[F+BN]; + if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg != RS.Reg) { + const TargetRegisterClass *TC = MRI.getRegClass(V.RefI.Reg); + // Need to map V.RefI.Reg to a 32-bit register, i.e. if it is + // a double register, need to use a subregister and adjust bit + // number. + unsigned P = UINT_MAX; + BitTracker::RegisterRef RR(V.RefI.Reg, 0); + if (TC == &Hexagon::DoubleRegsRegClass) { + P = V.RefI.Pos; + RR.Sub = Hexagon::subreg_loreg; + if (P >= 32) { + P -= 32; + RR.Sub = Hexagon::subreg_hireg; + } + } else if (TC == &Hexagon::IntRegsRegClass) { + P = V.RefI.Pos; + } + if (P != UINT_MAX) { + unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); + BuildMI(B, MI, DL, HII.get(Hexagon::S2_tstbit_i), NewR) + .addReg(RR.Reg, 0, RR.Sub) + .addImm(P); + HBS::replaceReg(RD.Reg, NewR, MRI); + BT.put(NewR, RC); + return true; + } + } else if (V.is(0) || V.is(1)) { + unsigned NewR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass); + unsigned NewOpc = V.is(0) ? 
Hexagon::TFR_PdFalse : Hexagon::TFR_PdTrue; + BuildMI(B, MI, DL, HII.get(NewOpc), NewR); + HBS::replaceReg(RD.Reg, NewR, MRI); + return true; + } + + return false; +} + + +bool BitSimplification::processBlock(MachineBasicBlock &B, + const RegisterSet &AVs) { + bool Changed = false; + RegisterSet AVB = AVs; + RegisterSet Defs; + + for (auto I = B.begin(), E = B.end(); I != E; ++I, AVB.insert(Defs)) { + MachineInstr *MI = &*I; + Defs.clear(); + HBS::getInstrDefs(*MI, Defs); + + unsigned Opc = MI->getOpcode(); + if (Opc == TargetOpcode::COPY || Opc == TargetOpcode::REG_SEQUENCE) + continue; + + if (MI->mayStore()) { + bool T = genStoreUpperHalf(MI); + T = T || genStoreImmediate(MI); + Changed |= T; + continue; + } + + if (Defs.count() != 1) + continue; + const MachineOperand &Op0 = MI->getOperand(0); + if (!Op0.isReg() || !Op0.isDef()) + continue; + BitTracker::RegisterRef RD = Op0; + if (!BT.has(RD.Reg)) + continue; + const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI); + const BitTracker::RegisterCell &RC = BT.lookup(RD.Reg); + + if (FRC->getID() == Hexagon::DoubleRegsRegClassID) { + bool T = genPackhl(MI, RD, RC); + Changed |= T; + continue; + } + + if (FRC->getID() == Hexagon::IntRegsRegClassID) { + bool T = genExtractHalf(MI, RD, RC); + T = T || genCombineHalf(MI, RD, RC); + T = T || genExtractLow(MI, RD, RC); + Changed |= T; + continue; + } + + if (FRC->getID() == Hexagon::PredRegsRegClassID) { + bool T = simplifyTstbit(MI, RD, RC); + Changed |= T; + continue; + } + } + return Changed; +} + + +bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) { + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + auto &HRI = *HST.getRegisterInfo(); + auto &HII = *HST.getInstrInfo(); + + MDT = &getAnalysis<MachineDominatorTree>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + bool Changed; + + Changed = DeadCodeElimination(MF, *MDT).run(); + + const HexagonEvaluator HE(HRI, MRI, HII, MF); + BitTracker BT(HE, MF); + DEBUG(BT.trace(true)); + BT.run(); + + MachineBasicBlock &Entry = MF.front(); + + RegisterSet AIG; // Available registers for IG. + ConstGeneration ImmG(BT, HII, MRI); + Changed |= visitBlock(Entry, ImmG, AIG); + + RegisterSet ARE; // Available registers for RIE. + RedundantInstrElimination RIE(BT, HII, MRI); + Changed |= visitBlock(Entry, RIE, ARE); + + RegisterSet ACG; // Available registers for CG. + CopyGeneration CopyG(BT, HII, MRI); + Changed |= visitBlock(Entry, CopyG, ACG); + + RegisterSet ACP; // Available registers for CP. + CopyPropagation CopyP(HRI, MRI); + Changed |= visitBlock(Entry, CopyP, ACP); + + Changed = DeadCodeElimination(MF, *MDT).run() || Changed; + + BT.run(); + RegisterSet ABS; // Available registers for BS. + BitSimplification BitS(BT, HII, MRI); + Changed |= visitBlock(Entry, BitS, ABS); + + Changed = DeadCodeElimination(MF, *MDT).run() || Changed; + + if (Changed) { + for (auto &B : MF) + for (auto &I : B) + I.clearKillInfo(); + DeadCodeElimination(MF, *MDT).run(); + } + return Changed; +} + + +// Recognize loops where the code at the end of the loop matches the code +// before the entry of the loop, and the matching code is such that is can +// be simplified. This pass relies on the bit simplification above and only +// prepares code in a way that can be handled by the bit simplifcation. 
+// +// This is the motivating testcase (and explanation): +// +// { +// loop0(.LBB0_2, r1) // %for.body.preheader +// r5:4 = memd(r0++#8) +// } +// { +// r3 = lsr(r4, #16) +// r7:6 = combine(r5, r5) +// } +// { +// r3 = insert(r5, #16, #16) +// r7:6 = vlsrw(r7:6, #16) +// } +// .LBB0_2: +// { +// memh(r2+#4) = r5 +// memh(r2+#6) = r6 # R6 is really R5.H +// } +// { +// r2 = add(r2, #8) +// memh(r2+#0) = r4 +// memh(r2+#2) = r3 # R3 is really R4.H +// } +// { +// r5:4 = memd(r0++#8) +// } +// { # "Shuffling" code that sets up R3 and R6 +// r3 = lsr(r4, #16) # so that their halves can be stored in the +// r7:6 = combine(r5, r5) # next iteration. This could be folded into +// } # the stores if the code was at the beginning +// { # of the loop iteration. Since the same code +// r3 = insert(r5, #16, #16) # precedes the loop, it can actually be moved +// r7:6 = vlsrw(r7:6, #16) # there. +// }:endloop0 +// +// +// The outcome: +// +// { +// loop0(.LBB0_2, r1) +// r5:4 = memd(r0++#8) +// } +// .LBB0_2: +// { +// memh(r2+#4) = r5 +// memh(r2+#6) = r5.h +// } +// { +// r2 = add(r2, #8) +// memh(r2+#0) = r4 +// memh(r2+#2) = r4.h +// } +// { +// r5:4 = memd(r0++#8) +// }:endloop0 + +namespace llvm { + FunctionPass *createHexagonLoopRescheduling(); + void initializeHexagonLoopReschedulingPass(PassRegistry&); +} + +namespace { + class HexagonLoopRescheduling : public MachineFunctionPass { + public: + static char ID; + HexagonLoopRescheduling() : MachineFunctionPass(ID), + HII(0), HRI(0), MRI(0), BTP(0) { + initializeHexagonLoopReschedulingPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; + MachineRegisterInfo *MRI; + BitTracker *BTP; + + struct LoopCand { + LoopCand(MachineBasicBlock *lb, MachineBasicBlock *pb, + MachineBasicBlock *eb) : LB(lb), PB(pb), EB(eb) {} + MachineBasicBlock *LB, *PB, *EB; + }; + typedef std::vector<MachineInstr*> InstrList; + struct InstrGroup { + BitTracker::RegisterRef Inp, Out; + InstrList Ins; + }; + struct PhiInfo { + PhiInfo(MachineInstr &P, MachineBasicBlock &B); + unsigned DefR; + BitTracker::RegisterRef LR, PR; + MachineBasicBlock *LB, *PB; + }; + + static unsigned getDefReg(const MachineInstr *MI); + bool isConst(unsigned Reg) const; + bool isBitShuffle(const MachineInstr *MI, unsigned DefR) const; + bool isStoreInput(const MachineInstr *MI, unsigned DefR) const; + bool isShuffleOf(unsigned OutR, unsigned InpR) const; + bool isSameShuffle(unsigned OutR1, unsigned InpR1, unsigned OutR2, + unsigned &InpR2) const; + void moveGroup(InstrGroup &G, MachineBasicBlock &LB, MachineBasicBlock &PB, + MachineBasicBlock::iterator At, unsigned OldPhiR, unsigned NewPredR); + bool processLoop(LoopCand &C); + }; +} + +char HexagonLoopRescheduling::ID = 0; + +INITIALIZE_PASS(HexagonLoopRescheduling, "hexagon-loop-resched", + "Hexagon Loop Rescheduling", false, false) + + +HexagonLoopRescheduling::PhiInfo::PhiInfo(MachineInstr &P, + MachineBasicBlock &B) { + DefR = HexagonLoopRescheduling::getDefReg(&P); + LB = &B; + PB = nullptr; + for (unsigned i = 1, n = P.getNumOperands(); i < n; i += 2) { + const MachineOperand &OpB = P.getOperand(i+1); + if (OpB.getMBB() == &B) { + LR = P.getOperand(i); + continue; + } + PB = OpB.getMBB(); + PR = P.getOperand(i); + } +} + + +unsigned HexagonLoopRescheduling::getDefReg(const MachineInstr *MI) { + RegisterSet Defs; + HBS::getInstrDefs(*MI, Defs); + if (Defs.count() != 1) + return 0; + return Defs.find_first(); +} + + 
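A standalone sketch of the operand classification done by the PhiInfo constructor above (illustration only; Block and Value are placeholder types):

#include <utility>
#include <vector>

using Block = int;
using Value = int;

// A PHI in loop block LB carries (value, predecessor) pairs; the pair whose
// predecessor is LB itself is the loop-back value, the other pair comes from
// the preheader.
static std::pair<Value, Value>
splitPhiOperands(const std::vector<std::pair<Value, Block>> &Ops, Block LB) {
  Value LoopVal = Value(), PreheaderVal = Value();
  for (const auto &Op : Ops) {
    if (Op.second == LB)
      LoopVal = Op.first;        // flows around the back edge
    else
      PreheaderVal = Op.first;   // flows in from outside the loop
  }
  return {LoopVal, PreheaderVal};
}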
+bool HexagonLoopRescheduling::isConst(unsigned Reg) const { + if (!BTP->has(Reg)) + return false; + const BitTracker::RegisterCell &RC = BTP->lookup(Reg); + for (unsigned i = 0, w = RC.width(); i < w; ++i) { + const BitTracker::BitValue &V = RC[i]; + if (!V.is(0) && !V.is(1)) + return false; + } + return true; +} + + +bool HexagonLoopRescheduling::isBitShuffle(const MachineInstr *MI, + unsigned DefR) const { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case TargetOpcode::COPY: + case Hexagon::S2_lsr_i_r: + case Hexagon::S2_asr_i_r: + case Hexagon::S2_asl_i_r: + case Hexagon::S2_lsr_i_p: + case Hexagon::S2_asr_i_p: + case Hexagon::S2_asl_i_p: + case Hexagon::S2_insert: + case Hexagon::A2_or: + case Hexagon::A2_orp: + case Hexagon::A2_and: + case Hexagon::A2_andp: + case Hexagon::A2_combinew: + case Hexagon::A4_combineri: + case Hexagon::A4_combineir: + case Hexagon::A2_combineii: + case Hexagon::A4_combineii: + case Hexagon::A2_combine_ll: + case Hexagon::A2_combine_lh: + case Hexagon::A2_combine_hl: + case Hexagon::A2_combine_hh: + return true; + } + return false; +} + + +bool HexagonLoopRescheduling::isStoreInput(const MachineInstr *MI, + unsigned InpR) const { + for (unsigned i = 0, n = MI->getNumOperands(); i < n; ++i) { + const MachineOperand &Op = MI->getOperand(i); + if (!Op.isReg()) + continue; + if (Op.getReg() == InpR) + return i == n-1; + } + return false; +} + + +bool HexagonLoopRescheduling::isShuffleOf(unsigned OutR, unsigned InpR) const { + if (!BTP->has(OutR) || !BTP->has(InpR)) + return false; + const BitTracker::RegisterCell &OutC = BTP->lookup(OutR); + for (unsigned i = 0, w = OutC.width(); i < w; ++i) { + const BitTracker::BitValue &V = OutC[i]; + if (V.Type != BitTracker::BitValue::Ref) + continue; + if (V.RefI.Reg != InpR) + return false; + } + return true; +} + + +bool HexagonLoopRescheduling::isSameShuffle(unsigned OutR1, unsigned InpR1, + unsigned OutR2, unsigned &InpR2) const { + if (!BTP->has(OutR1) || !BTP->has(InpR1) || !BTP->has(OutR2)) + return false; + const BitTracker::RegisterCell &OutC1 = BTP->lookup(OutR1); + const BitTracker::RegisterCell &OutC2 = BTP->lookup(OutR2); + unsigned W = OutC1.width(); + unsigned MatchR = 0; + if (W != OutC2.width()) + return false; + for (unsigned i = 0; i < W; ++i) { + const BitTracker::BitValue &V1 = OutC1[i], &V2 = OutC2[i]; + if (V1.Type != V2.Type || V1.Type == BitTracker::BitValue::One) + return false; + if (V1.Type != BitTracker::BitValue::Ref) + continue; + if (V1.RefI.Pos != V2.RefI.Pos) + return false; + if (V1.RefI.Reg != InpR1) + return false; + if (V2.RefI.Reg == 0 || V2.RefI.Reg == OutR2) + return false; + if (!MatchR) + MatchR = V2.RefI.Reg; + else if (V2.RefI.Reg != MatchR) + return false; + } + InpR2 = MatchR; + return true; +} + + +void HexagonLoopRescheduling::moveGroup(InstrGroup &G, MachineBasicBlock &LB, + MachineBasicBlock &PB, MachineBasicBlock::iterator At, unsigned OldPhiR, + unsigned NewPredR) { + DenseMap<unsigned,unsigned> RegMap; + + const TargetRegisterClass *PhiRC = MRI->getRegClass(NewPredR); + unsigned PhiR = MRI->createVirtualRegister(PhiRC); + BuildMI(LB, At, At->getDebugLoc(), HII->get(TargetOpcode::PHI), PhiR) + .addReg(NewPredR) + .addMBB(&PB) + .addReg(G.Inp.Reg) + .addMBB(&LB); + RegMap.insert(std::make_pair(G.Inp.Reg, PhiR)); + + for (unsigned i = G.Ins.size(); i > 0; --i) { + const MachineInstr *SI = G.Ins[i-1]; + unsigned DR = getDefReg(SI); + const TargetRegisterClass *RC = MRI->getRegClass(DR); + unsigned NewDR = MRI->createVirtualRegister(RC); + DebugLoc DL = 
SI->getDebugLoc(); + + auto MIB = BuildMI(LB, At, DL, HII->get(SI->getOpcode()), NewDR); + for (unsigned j = 0, m = SI->getNumOperands(); j < m; ++j) { + const MachineOperand &Op = SI->getOperand(j); + if (!Op.isReg()) { + MIB.addOperand(Op); + continue; + } + if (!Op.isUse()) + continue; + unsigned UseR = RegMap[Op.getReg()]; + MIB.addReg(UseR, 0, Op.getSubReg()); + } + RegMap.insert(std::make_pair(DR, NewDR)); + } + + HBS::replaceReg(OldPhiR, RegMap[G.Out.Reg], *MRI); +} + + +bool HexagonLoopRescheduling::processLoop(LoopCand &C) { + DEBUG(dbgs() << "Processing loop in BB#" << C.LB->getNumber() << "\n"); + std::vector<PhiInfo> Phis; + for (auto &I : *C.LB) { + if (!I.isPHI()) + break; + unsigned PR = getDefReg(&I); + if (isConst(PR)) + continue; + bool BadUse = false, GoodUse = false; + for (auto UI = MRI->use_begin(PR), UE = MRI->use_end(); UI != UE; ++UI) { + MachineInstr *UseI = UI->getParent(); + if (UseI->getParent() != C.LB) { + BadUse = true; + break; + } + if (isBitShuffle(UseI, PR) || isStoreInput(UseI, PR)) + GoodUse = true; + } + if (BadUse || !GoodUse) + continue; + + Phis.push_back(PhiInfo(I, *C.LB)); + } + + DEBUG({ + dbgs() << "Phis: {"; + for (auto &I : Phis) { + dbgs() << ' ' << PrintReg(I.DefR, HRI) << "=phi(" + << PrintReg(I.PR.Reg, HRI, I.PR.Sub) << ":b" << I.PB->getNumber() + << ',' << PrintReg(I.LR.Reg, HRI, I.LR.Sub) << ":b" + << I.LB->getNumber() << ')'; + } + dbgs() << " }\n"; + }); + + if (Phis.empty()) + return false; + + bool Changed = false; + InstrList ShufIns; + + // Go backwards in the block: for each bit shuffling instruction, check + // if that instruction could potentially be moved to the front of the loop: + // the output of the loop cannot be used in a non-shuffling instruction + // in this loop. + for (auto I = C.LB->rbegin(), E = C.LB->rend(); I != E; ++I) { + if (I->isTerminator()) + continue; + if (I->isPHI()) + break; + + RegisterSet Defs; + HBS::getInstrDefs(*I, Defs); + if (Defs.count() != 1) + continue; + unsigned DefR = Defs.find_first(); + if (!TargetRegisterInfo::isVirtualRegister(DefR)) + continue; + if (!isBitShuffle(&*I, DefR)) + continue; + + bool BadUse = false; + for (auto UI = MRI->use_begin(DefR), UE = MRI->use_end(); UI != UE; ++UI) { + MachineInstr *UseI = UI->getParent(); + if (UseI->getParent() == C.LB) { + if (UseI->isPHI()) { + // If the use is in a phi node in this loop, then it should be + // the value corresponding to the back edge. + unsigned Idx = UI.getOperandNo(); + if (UseI->getOperand(Idx+1).getMBB() != C.LB) + BadUse = true; + } else { + auto F = std::find(ShufIns.begin(), ShufIns.end(), UseI); + if (F == ShufIns.end()) + BadUse = true; + } + } else { + // There is a use outside of the loop, but there is no epilog block + // suitable for a copy-out. + if (C.EB == nullptr) + BadUse = true; + } + if (BadUse) + break; + } + + if (BadUse) + continue; + ShufIns.push_back(&*I); + } + + // Partition the list of shuffling instructions into instruction groups, + // where each group has to be moved as a whole (i.e. a group is a chain of + // dependent instructions). A group produces a single live output register, + // which is meant to be the input of the loop phi node (although this is + // not checked here yet). It also uses a single register as its input, + // which is some value produced in the loop body. After moving the group + // to the beginning of the loop, that input register would need to be + // the loop-carried register (through a phi node) instead of the (currently + // loop-carried) output register. 
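A toy model of the partitioning described above (editorial sketch; ToyInstr, the single def per instruction, and the linear ordering are simplifications of what the real code does with RegisterSet):

#include <set>
#include <vector>

struct ToyInstr { int Def; std::set<int> Uses; };

// Starting from a seed instruction, pull in any later instruction that
// defines a still-pending input, accumulating that instruction's own uses.
// The group is usable only if at most one external input register remains.
static bool formGroup(const std::vector<ToyInstr> &Shuf, unsigned Seed,
                      std::vector<ToyInstr> &Group, int &ExternalInput) {
  Group = {Shuf[Seed]};
  std::set<int> Pending = Shuf[Seed].Uses;
  for (unsigned j = Seed + 1; j < Shuf.size(); ++j) {
    if (!Pending.count(Shuf[j].Def))
      continue;                              // does not feed this chain
    Pending.erase(Shuf[j].Def);              // now produced inside the group
    Pending.insert(Shuf[j].Uses.begin(), Shuf[j].Uses.end());
    Group.push_back(Shuf[j]);
  }
  if (Pending.size() > 1)
    return false;                            // needs more than one live-in
  ExternalInput = Pending.empty() ? -1 : *Pending.begin();
  return true;
}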
+ typedef std::vector<InstrGroup> InstrGroupList; + InstrGroupList Groups; + + for (unsigned i = 0, n = ShufIns.size(); i < n; ++i) { + MachineInstr *SI = ShufIns[i]; + if (SI == nullptr) + continue; + + InstrGroup G; + G.Ins.push_back(SI); + G.Out.Reg = getDefReg(SI); + RegisterSet Inputs; + HBS::getInstrUses(*SI, Inputs); + + for (unsigned j = i+1; j < n; ++j) { + MachineInstr *MI = ShufIns[j]; + if (MI == nullptr) + continue; + RegisterSet Defs; + HBS::getInstrDefs(*MI, Defs); + // If this instruction does not define any pending inputs, skip it. + if (!Defs.intersects(Inputs)) + continue; + // Otherwise, add it to the current group and remove the inputs that + // are defined by MI. + G.Ins.push_back(MI); + Inputs.remove(Defs); + // Then add all registers used by MI. + HBS::getInstrUses(*MI, Inputs); + ShufIns[j] = nullptr; + } + + // Only add a group if it requires at most one register. + if (Inputs.count() > 1) + continue; + auto LoopInpEq = [G] (const PhiInfo &P) -> bool { + return G.Out.Reg == P.LR.Reg; + }; + if (std::find_if(Phis.begin(), Phis.end(), LoopInpEq) == Phis.end()) + continue; + + G.Inp.Reg = Inputs.find_first(); + Groups.push_back(G); + } + + DEBUG({ + for (unsigned i = 0, n = Groups.size(); i < n; ++i) { + InstrGroup &G = Groups[i]; + dbgs() << "Group[" << i << "] inp: " + << PrintReg(G.Inp.Reg, HRI, G.Inp.Sub) + << " out: " << PrintReg(G.Out.Reg, HRI, G.Out.Sub) << "\n"; + for (unsigned j = 0, m = G.Ins.size(); j < m; ++j) + dbgs() << " " << *G.Ins[j]; + } + }); + + for (unsigned i = 0, n = Groups.size(); i < n; ++i) { + InstrGroup &G = Groups[i]; + if (!isShuffleOf(G.Out.Reg, G.Inp.Reg)) + continue; + auto LoopInpEq = [G] (const PhiInfo &P) -> bool { + return G.Out.Reg == P.LR.Reg; + }; + auto F = std::find_if(Phis.begin(), Phis.end(), LoopInpEq); + if (F == Phis.end()) + continue; + unsigned PredR = 0; + if (!isSameShuffle(G.Out.Reg, G.Inp.Reg, F->PR.Reg, PredR)) { + const MachineInstr *DefPredR = MRI->getVRegDef(F->PR.Reg); + unsigned Opc = DefPredR->getOpcode(); + if (Opc != Hexagon::A2_tfrsi && Opc != Hexagon::A2_tfrpi) + continue; + if (!DefPredR->getOperand(1).isImm()) + continue; + if (DefPredR->getOperand(1).getImm() != 0) + continue; + const TargetRegisterClass *RC = MRI->getRegClass(G.Inp.Reg); + if (RC != MRI->getRegClass(F->PR.Reg)) { + PredR = MRI->createVirtualRegister(RC); + unsigned TfrI = (RC == &Hexagon::IntRegsRegClass) ? Hexagon::A2_tfrsi + : Hexagon::A2_tfrpi; + auto T = C.PB->getFirstTerminator(); + DebugLoc DL = (T != C.PB->end()) ? 
T->getDebugLoc() : DebugLoc(); + BuildMI(*C.PB, T, DL, HII->get(TfrI), PredR) + .addImm(0); + } else { + PredR = F->PR.Reg; + } + } + assert(MRI->getRegClass(PredR) == MRI->getRegClass(G.Inp.Reg)); + moveGroup(G, *F->LB, *F->PB, F->LB->getFirstNonPHI(), F->DefR, PredR); + Changed = true; + } + + return Changed; +} + + +bool HexagonLoopRescheduling::runOnMachineFunction(MachineFunction &MF) { + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + HII = HST.getInstrInfo(); + HRI = HST.getRegisterInfo(); + MRI = &MF.getRegInfo(); + const HexagonEvaluator HE(*HRI, *MRI, *HII, MF); + BitTracker BT(HE, MF); + DEBUG(BT.trace(true)); + BT.run(); + BTP = &BT; + + std::vector<LoopCand> Cand; + + for (auto &B : MF) { + if (B.pred_size() != 2 || B.succ_size() != 2) + continue; + MachineBasicBlock *PB = nullptr; + bool IsLoop = false; + for (auto PI = B.pred_begin(), PE = B.pred_end(); PI != PE; ++PI) { + if (*PI != &B) + PB = *PI; + else + IsLoop = true; + } + if (!IsLoop) + continue; + + MachineBasicBlock *EB = nullptr; + for (auto SI = B.succ_begin(), SE = B.succ_end(); SI != SE; ++SI) { + if (*SI == &B) + continue; + // Set EP to the epilog block, if it has only 1 predecessor (i.e. the + // edge from B to EP is non-critical. + if ((*SI)->pred_size() == 1) + EB = *SI; + break; + } + + Cand.push_back(LoopCand(&B, PB, EB)); + } + + bool Changed = false; + for (auto &C : Cand) + Changed |= processLoop(C); + + return Changed; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// + +FunctionPass *llvm::createHexagonLoopRescheduling() { + return new HexagonLoopRescheduling(); +} + +FunctionPass *llvm::createHexagonBitSimplify() { + return new HexagonBitSimplify(); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp index 021e58a..d5848dc 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -84,6 +84,8 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const { uint16_t RW = getRegBitWidth(RegisterRef(Reg, Sub)); switch (ID) { case DoubleRegsRegClassID: + case VecDblRegsRegClassID: + case VecDblRegs128BRegClassID: return (Sub == subreg_loreg) ? BT::BitMask(0, RW-1) : BT::BitMask(RW, 2*RW-1); default: @@ -95,30 +97,29 @@ BT::BitMask HexagonEvaluator::mask(unsigned Reg, unsigned Sub) const { llvm_unreachable("Unexpected register/subregister"); } - namespace { - struct RegisterRefs : public std::vector<BT::RegisterRef> { - typedef std::vector<BT::RegisterRef> Base; - RegisterRefs(const MachineInstr *MI); - const BT::RegisterRef &operator[](unsigned n) const { - // The main purpose of this operator is to assert with bad argument. - assert(n < size()); - return Base::operator[](n); - } - }; +class RegisterRefs { + std::vector<BT::RegisterRef> Vector; - RegisterRefs::RegisterRefs(const MachineInstr *MI) - : Base(MI->getNumOperands()) { - for (unsigned i = 0, n = size(); i < n; ++i) { +public: + RegisterRefs(const MachineInstr *MI) : Vector(MI->getNumOperands()) { + for (unsigned i = 0, n = Vector.size(); i < n; ++i) { const MachineOperand &MO = MI->getOperand(i); if (MO.isReg()) - at(i) = BT::RegisterRef(MO); + Vector[i] = BT::RegisterRef(MO); // For indices that don't correspond to registers, the entry will // remain constructed via the default constructor. 
} } -} + size_t size() const { return Vector.size(); } + const BT::RegisterRef &operator[](unsigned n) const { + // The main purpose of this operator is to assert with bad argument. + assert(n < Vector.size()); + return Vector[n]; + } +}; +} bool HexagonEvaluator::evaluate(const MachineInstr *MI, const CellMapType &Inputs, CellMapType &Outputs) const { @@ -189,7 +190,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr *MI, return true; }; // Get the cell corresponding to the N-th operand. - auto cop = [this,Reg,MI,Inputs] (unsigned N, uint16_t W) + auto cop = [this,&Reg,&MI,&Inputs] (unsigned N, uint16_t W) -> BT::RegisterCell { const MachineOperand &Op = MI->getOperand(N); if (Op.isImm()) diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp index 3753b745..efafdd0 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp @@ -102,7 +102,7 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { // Loop over all of the basic blocks. for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; // Traverse the basic block. MachineBasicBlock::iterator MII = MBB->getFirstTerminator(); @@ -186,13 +186,11 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { if (case1 || case2) { InvertAndChangeJumpTarget(MI, UncondTarget); - MBB->removeSuccessor(JumpAroundTarget); - MBB->addSuccessor(UncondTarget); + MBB->replaceSuccessor(JumpAroundTarget, UncondTarget); // Remove the unconditional branch in LayoutSucc. LayoutSucc->erase(LayoutSucc->begin()); - LayoutSucc->removeSuccessor(UncondTarget); - LayoutSucc->addSuccessor(JumpAroundTarget); + LayoutSucc->replaceSuccessor(UncondTarget, JumpAroundTarget); // This code performs the conversion for case 2, which moves // the block to the fall-thru case (BB3 in the code above). @@ -210,16 +208,15 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) { // The live-in to LayoutSucc is now all values live-in to // JumpAroundTarget. // - std::vector<unsigned> OrigLiveIn(LayoutSucc->livein_begin(), - LayoutSucc->livein_end()); - std::vector<unsigned> NewLiveIn(JumpAroundTarget->livein_begin(), - JumpAroundTarget->livein_end()); - for (unsigned i = 0; i < OrigLiveIn.size(); ++i) { - LayoutSucc->removeLiveIn(OrigLiveIn[i]); - } - for (unsigned i = 0; i < NewLiveIn.size(); ++i) { - LayoutSucc->addLiveIn(NewLiveIn[i]); - } + std::vector<MachineBasicBlock::RegisterMaskPair> OrigLiveIn( + LayoutSucc->livein_begin(), LayoutSucc->livein_end()); + std::vector<MachineBasicBlock::RegisterMaskPair> NewLiveIn( + JumpAroundTarget->livein_begin(), + JumpAroundTarget->livein_end()); + for (const auto &OrigLI : OrigLiveIn) + LayoutSucc->removeLiveIn(OrigLI.PhysReg); + for (const auto &NewLI : NewLiveIn) + LayoutSucc->addLiveIn(NewLI); } } } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp index 9f5fac1..931db66 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp @@ -59,30 +59,23 @@ namespace { // Numbering map for gep nodes. Used to keep track of ordering for // gep nodes. 
- struct NodeNumbering : public std::map<const GepNode*,unsigned> { - }; - - struct NodeOrdering : public NodeNumbering { + struct NodeOrdering { NodeOrdering() : LastNum(0) {} -#ifdef _MSC_VER - void special_insert_for_special_msvc(const GepNode *N) -#else - using NodeNumbering::insert; - void insert(const GepNode* N) -#endif - { - insert(std::make_pair(N, ++LastNum)); - } - bool operator() (const GepNode* N1, const GepNode *N2) const { - const_iterator F1 = find(N1), F2 = find(N2); - assert(F1 != end() && F2 != end()); + + void insert(const GepNode *N) { Map.insert(std::make_pair(N, ++LastNum)); } + void clear() { Map.clear(); } + + bool operator()(const GepNode *N1, const GepNode *N2) const { + auto F1 = Map.find(N1), F2 = Map.find(N2); + assert(F1 != Map.end() && F2 != Map.end()); return F1->second < F2->second; } + private: + std::map<const GepNode *, unsigned> Map; unsigned LastNum; }; - class HexagonCommonGEP : public FunctionPass { public: static char ID; @@ -360,11 +353,7 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI, Us.insert(&UI.getUse()); } Nodes.push_back(N); -#ifdef _MSC_VER - NodeOrder.special_insert_for_special_msvc(N); -#else NodeOrder.insert(N); -#endif // Skip the first index operand, since we only handle 0. This dereferences // the pointer operand. @@ -379,11 +368,7 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI, Nx->PTy = PtrTy; Nx->Idx = Op; Nodes.push_back(Nx); -#ifdef _MSC_VER - NodeOrder.special_insert_for_special_msvc(Nx); -#else NodeOrder.insert(Nx); -#endif PN = Nx; PtrTy = next_type(PtrTy, Op); @@ -404,7 +389,7 @@ void HexagonCommonGEP::processGepInst(GetElementPtrInst *GepI, void HexagonCommonGEP::collect() { // Establish depth-first traversal order of the dominator tree. ValueVect BO; - getBlockTraversalOrder(Fn->begin(), BO); + getBlockTraversalOrder(&Fn->front(), BO); // The creation of gep nodes requires DT-traversal. When processing a GEP // instruction that uses another GEP instruction as the base pointer, the @@ -737,7 +722,7 @@ namespace { Instruction *In = cast<Instruction>(V); if (In->getParent() != B) continue; - BasicBlock::iterator It = In; + BasicBlock::iterator It = In->getIterator(); if (std::distance(FirstUse, BEnd) < std::distance(It, BEnd)) FirstUse = It; } @@ -1135,7 +1120,7 @@ Value *HexagonCommonGEP::fabricateGEP(NodeVect &NA, BasicBlock::iterator At, ArrayRef<Value*> A(IdxList, IdxC); Type *InpTy = Input->getType(); Type *ElTy = cast<PointerType>(InpTy->getScalarType())->getElementType(); - NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", At); + NewInst = GetElementPtrInst::Create(ElTy, Input, A, "cgep", &*At); DEBUG(dbgs() << "new GEP: " << *NewInst << '\n'); Input = NewInst; } while (nax <= Num); @@ -1213,7 +1198,7 @@ void HexagonCommonGEP::materialize(NodeToValueMap &Loc) { Last = Child; } while (true); - BasicBlock::iterator InsertAt = LastB->getTerminator(); + BasicBlock::iterator InsertAt = LastB->getTerminator()->getIterator(); if (LastUsed || LastCN > 0) { ValueVect Urs; getAllUsersForNode(Root, Urs, NCM); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp new file mode 100644 index 0000000..ee0c318 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -0,0 +1,1063 @@ +//===--- HexagonEarlyIfConv.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This implements a Hexagon-specific if-conversion pass that runs on the +// SSA form. +// In SSA it is not straightforward to represent instructions that condi- +// tionally define registers, since a conditionally-defined register may +// only be used under the same condition on which the definition was based. +// To avoid complications of this nature, this patch will only generate +// predicated stores, and speculate other instructions from the "if-conver- +// ted" block. +// The code will recognize CFG patterns where a block with a conditional +// branch "splits" into a "true block" and a "false block". Either of these +// could be omitted (in case of a triangle, for example). +// If after conversion of the side block(s) the CFG allows it, the resul- +// ting blocks may be merged. If the "join" block contained PHI nodes, they +// will be replaced with MUX (or MUX-like) instructions to maintain the +// semantics of the PHI. +// +// Example: +// +// %vreg40<def> = L2_loadrub_io %vreg39<kill>, 1 +// %vreg41<def> = S2_tstbit_i %vreg40<kill>, 0 +// J2_jumpt %vreg41<kill>, <BB#5>, %PC<imp-def,dead> +// J2_jump <BB#4>, %PC<imp-def,dead> +// Successors according to CFG: BB#4(62) BB#5(62) +// +// BB#4: derived from LLVM BB %if.then +// Predecessors according to CFG: BB#3 +// %vreg11<def> = A2_addp %vreg6, %vreg10 +// S2_storerd_io %vreg32, 16, %vreg11 +// Successors according to CFG: BB#5 +// +// BB#5: derived from LLVM BB %if.end +// Predecessors according to CFG: BB#3 BB#4 +// %vreg12<def> = PHI %vreg6, <BB#3>, %vreg11, <BB#4> +// %vreg13<def> = A2_addp %vreg7, %vreg12 +// %vreg42<def> = C2_cmpeqi %vreg9, 10 +// J2_jumpf %vreg42<kill>, <BB#3>, %PC<imp-def,dead> +// J2_jump <BB#6>, %PC<imp-def,dead> +// Successors according to CFG: BB#6(4) BB#3(124) +// +// would become: +// +// %vreg40<def> = L2_loadrub_io %vreg39<kill>, 1 +// %vreg41<def> = S2_tstbit_i %vreg40<kill>, 0 +// spec-> %vreg11<def> = A2_addp %vreg6, %vreg10 +// pred-> S2_pstorerdf_io %vreg41, %vreg32, 16, %vreg11 +// %vreg46<def> = MUX64_rr %vreg41, %vreg6, %vreg11 +// %vreg13<def> = A2_addp %vreg7, %vreg46 +// %vreg42<def> = C2_cmpeqi %vreg9, 10 +// J2_jumpf %vreg42<kill>, <BB#3>, %PC<imp-def,dead> +// J2_jump <BB#6>, %PC<imp-def,dead> +// Successors according to CFG: BB#6 BB#3 + +#define DEBUG_TYPE "hexagon-eif" + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "HexagonTargetMachine.h" + +#include <functional> +#include <set> +#include <vector> + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonEarlyIfConversion(); + void initializeHexagonEarlyIfConversionPass(PassRegistry& Registry); +} + +namespace { + cl::opt<bool> EnableHexagonBP("enable-hexagon-br-prob", cl::Hidden, + cl::init(false), cl::desc("Enable branch probability info")); + cl::opt<unsigned> SizeLimit("eif-limit", cl::init(6), cl::Hidden, + cl::desc("Size limit in Hexagon early if-conversion")); + 
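[Editor's aside] As a rough source-level picture of what the header comment above describes (an illustrative sketch, not the pass's MachineInstr code; all names below are made up), the conversion amounts to speculating the side-effect-free computation, predicating the store, and turning the join-block PHI into a mux:

#include <cstdint>

static inline int64_t mux(bool p, int64_t t, int64_t f) { return p ? t : f; }

// Before: the "true" side conditionally stores and updates v; the join
// block's PHI picks either the new value t or the incoming v.
int64_t before(bool p, int64_t v, int64_t d, int64_t *slot) {
  if (p) {
    int64_t t = v + d;
    *slot = t;
    v = t;
  }
  return v + 1;
}

// After: the add is speculated (safe, no side effects), the store is
// predicated, and the PHI becomes a mux on the predicate.
int64_t after(bool p, int64_t v, int64_t d, int64_t *slot) {
  int64_t t = v + d;        // speculated
  if (p) *slot = t;         // predicated store
  return mux(p, t, v) + 1;  // PHI at the join replaced by a mux
}

Both functions compute the same result; the second has no conditional control flow apart from the predicated store itself.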
+ struct PrintMB { + PrintMB(const MachineBasicBlock *B) : MB(B) {} + const MachineBasicBlock *MB; + }; + raw_ostream &operator<< (raw_ostream &OS, const PrintMB &P) { + if (!P.MB) + return OS << "<none>"; + return OS << '#' << P.MB->getNumber(); + } + + struct FlowPattern { + FlowPattern() : SplitB(0), TrueB(0), FalseB(0), JoinB(0), PredR(0) {} + FlowPattern(MachineBasicBlock *B, unsigned PR, MachineBasicBlock *TB, + MachineBasicBlock *FB, MachineBasicBlock *JB) + : SplitB(B), TrueB(TB), FalseB(FB), JoinB(JB), PredR(PR) {} + + MachineBasicBlock *SplitB; + MachineBasicBlock *TrueB, *FalseB, *JoinB; + unsigned PredR; + }; + struct PrintFP { + PrintFP(const FlowPattern &P, const TargetRegisterInfo &T) + : FP(P), TRI(T) {} + const FlowPattern &FP; + const TargetRegisterInfo &TRI; + friend raw_ostream &operator<< (raw_ostream &OS, const PrintFP &P); + }; + raw_ostream &operator<<(raw_ostream &OS, + const PrintFP &P) LLVM_ATTRIBUTE_UNUSED; + raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P) { + OS << "{ SplitB:" << PrintMB(P.FP.SplitB) + << ", PredR:" << PrintReg(P.FP.PredR, &P.TRI) + << ", TrueB:" << PrintMB(P.FP.TrueB) << ", FalseB:" + << PrintMB(P.FP.FalseB) + << ", JoinB:" << PrintMB(P.FP.JoinB) << " }"; + return OS; + } + + class HexagonEarlyIfConversion : public MachineFunctionPass { + public: + static char ID; + HexagonEarlyIfConversion() : MachineFunctionPass(ID), + TII(0), TRI(0), MFN(0), MRI(0), MDT(0), MLI(0) { + initializeHexagonEarlyIfConversionPass(*PassRegistry::getPassRegistry()); + } + const char *getPassName() const override { + return "Hexagon early if conversion"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineBranchProbabilityInfo>(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + typedef DenseSet<MachineBasicBlock*> BlockSetType; + + bool isPreheader(const MachineBasicBlock *B) const; + bool matchFlowPattern(MachineBasicBlock *B, MachineLoop *L, + FlowPattern &FP); + bool visitBlock(MachineBasicBlock *B, MachineLoop *L); + bool visitLoop(MachineLoop *L); + + bool hasEHLabel(const MachineBasicBlock *B) const; + bool hasUncondBranch(const MachineBasicBlock *B) const; + bool isValidCandidate(const MachineBasicBlock *B) const; + bool usesUndefVReg(const MachineInstr *MI) const; + bool isValid(const FlowPattern &FP) const; + unsigned countPredicateDefs(const MachineBasicBlock *B) const; + unsigned computePhiCost(MachineBasicBlock *B) const; + bool isProfitable(const FlowPattern &FP) const; + bool isPredicableStore(const MachineInstr *MI) const; + bool isSafeToSpeculate(const MachineInstr *MI) const; + + unsigned getCondStoreOpcode(unsigned Opc, bool IfTrue) const; + void predicateInstr(MachineBasicBlock *ToB, MachineBasicBlock::iterator At, + MachineInstr *MI, unsigned PredR, bool IfTrue); + void predicateBlockNB(MachineBasicBlock *ToB, + MachineBasicBlock::iterator At, MachineBasicBlock *FromB, + unsigned PredR, bool IfTrue); + + void updatePhiNodes(MachineBasicBlock *WhereB, const FlowPattern &FP); + void convert(const FlowPattern &FP); + + void removeBlock(MachineBasicBlock *B); + void eliminatePhis(MachineBasicBlock *B); + void replacePhiEdges(MachineBasicBlock *OldB, MachineBasicBlock *NewB); + void mergeBlocks(MachineBasicBlock *PredB, MachineBasicBlock *SuccB); + void simplifyFlowGraph(const FlowPattern &FP); + + const 
TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineFunction *MFN; + MachineRegisterInfo *MRI; + MachineDominatorTree *MDT; + MachineLoopInfo *MLI; + BlockSetType Deleted; + const MachineBranchProbabilityInfo *MBPI; + }; + + char HexagonEarlyIfConversion::ID = 0; +} + +INITIALIZE_PASS(HexagonEarlyIfConversion, "hexagon-eif", + "Hexagon early if conversion", false, false) + +bool HexagonEarlyIfConversion::isPreheader(const MachineBasicBlock *B) const { + if (B->succ_size() != 1) + return false; + MachineBasicBlock *SB = *B->succ_begin(); + MachineLoop *L = MLI->getLoopFor(SB); + return L && SB == L->getHeader(); +} + + +bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B, + MachineLoop *L, FlowPattern &FP) { + DEBUG(dbgs() << "Checking flow pattern at BB#" << B->getNumber() << "\n"); + + // Interested only in conditional branches, no .new, no new-value, etc. + // Check the terminators directly, it's easier than handling all responses + // from AnalyzeBranch. + MachineBasicBlock *TB = 0, *FB = 0; + MachineBasicBlock::const_iterator T1I = B->getFirstTerminator(); + if (T1I == B->end()) + return false; + unsigned Opc = T1I->getOpcode(); + if (Opc != Hexagon::J2_jumpt && Opc != Hexagon::J2_jumpf) + return false; + unsigned PredR = T1I->getOperand(0).getReg(); + + // Get the layout successor, or 0 if B does not have one. + MachineFunction::iterator NextBI = std::next(MachineFunction::iterator(B)); + MachineBasicBlock *NextB = (NextBI != MFN->end()) ? &*NextBI : 0; + + MachineBasicBlock *T1B = T1I->getOperand(1).getMBB(); + MachineBasicBlock::const_iterator T2I = std::next(T1I); + // The second terminator should be an unconditional branch. + assert(T2I == B->end() || T2I->getOpcode() == Hexagon::J2_jump); + MachineBasicBlock *T2B = (T2I == B->end()) ? NextB + : T2I->getOperand(0).getMBB(); + if (T1B == T2B) { + // XXX merge if T1B == NextB, or convert branch to unconditional. + // mark as diamond with both sides equal? + return false; + } + // Loop could be null for both. + if (MLI->getLoopFor(T1B) != L || MLI->getLoopFor(T2B) != L) + return false; + + // Record the true/false blocks in such a way that "true" means "if (PredR)", + // and "false" means "if (!PredR)". + if (Opc == Hexagon::J2_jumpt) + TB = T1B, FB = T2B; + else + TB = T2B, FB = T1B; + + if (!MDT->properlyDominates(B, TB) || !MDT->properlyDominates(B, FB)) + return false; + + // Detect triangle first. In case of a triangle, one of the blocks TB/FB + // can fall through into the other, in other words, it will be executed + // in both cases. We only want to predicate the block that is executed + // conditionally. + unsigned TNP = TB->pred_size(), FNP = FB->pred_size(); + unsigned TNS = TB->succ_size(), FNS = FB->succ_size(); + + // A block is predicable if it has one predecessor (it must be B), and + // it has a single successor. In fact, the block has to end either with + // an unconditional branch (which can be predicated), or with a fall- + // through. + bool TOk = (TNP == 1) && (TNS == 1); + bool FOk = (FNP == 1) && (FNS == 1); + + // If neither is predicable, there is nothing interesting. + if (!TOk && !FOk) + return false; + + MachineBasicBlock *TSB = (TNS > 0) ? *TB->succ_begin() : 0; + MachineBasicBlock *FSB = (FNS > 0) ? *FB->succ_begin() : 0; + MachineBasicBlock *JB = 0; + + if (TOk) { + if (FOk) { + if (TSB == FSB) + JB = TSB; + // Diamond: "if (P) then TB; else FB;". 
+ } else { + // TOk && !FOk + if (TSB == FB) { + JB = FB; + FB = 0; + } + } + } else { + // !TOk && FOk (at least one must be true by now). + if (FSB == TB) { + JB = TB; + TB = 0; + } + } + // Don't try to predicate loop preheaders. + if ((TB && isPreheader(TB)) || (FB && isPreheader(FB))) { + DEBUG(dbgs() << "One of blocks " << PrintMB(TB) << ", " << PrintMB(FB) + << " is a loop preheader. Skipping.\n"); + return false; + } + + FP = FlowPattern(B, PredR, TB, FB, JB); + DEBUG(dbgs() << "Detected " << PrintFP(FP, *TRI) << "\n"); + return true; +} + + +// KLUDGE: HexagonInstrInfo::AnalyzeBranch won't work on a block that +// contains EH_LABEL. +bool HexagonEarlyIfConversion::hasEHLabel(const MachineBasicBlock *B) const { + for (auto &I : *B) + if (I.isEHLabel()) + return true; + return false; +} + + +// KLUDGE: HexagonInstrInfo::AnalyzeBranch may be unable to recognize +// that a block can never fall-through. +bool HexagonEarlyIfConversion::hasUncondBranch(const MachineBasicBlock *B) + const { + MachineBasicBlock::const_iterator I = B->getFirstTerminator(), E = B->end(); + while (I != E) { + if (I->isBarrier()) + return true; + ++I; + } + return false; +} + + +bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B) + const { + if (!B) + return true; + if (B->isEHPad() || B->hasAddressTaken()) + return false; + if (B->succ_size() == 0) + return false; + + for (auto &MI : *B) { + if (MI.isDebugValue()) + continue; + if (MI.isConditionalBranch()) + return false; + unsigned Opc = MI.getOpcode(); + bool IsJMP = (Opc == Hexagon::J2_jump); + if (!isPredicableStore(&MI) && !IsJMP && !isSafeToSpeculate(&MI)) + return false; + // Look for predicate registers defined by this instruction. It's ok + // to speculate such an instruction, but the predicate register cannot + // be used outside of this block (or else it won't be possible to + // update the use of it after predication). PHI uses will be updated + // to use a result of a MUX, and a MUX cannot be created for predicate + // registers. + for (ConstMIOperands MO(&MI); MO.isValid(); ++MO) { + if (!MO->isReg() || !MO->isDef()) + continue; + unsigned R = MO->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + if (MRI->getRegClass(R) != &Hexagon::PredRegsRegClass) + continue; + for (auto U = MRI->use_begin(R); U != MRI->use_end(); ++U) + if (U->getParent()->isPHI()) + return false; + } + } + return true; +} + + +bool HexagonEarlyIfConversion::usesUndefVReg(const MachineInstr *MI) const { + for (ConstMIOperands MO(MI); MO.isValid(); ++MO) { + if (!MO->isReg() || !MO->isUse()) + continue; + unsigned R = MO->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + const MachineInstr *DefI = MRI->getVRegDef(R); + // "Undefined" virtual registers are actually defined via IMPLICIT_DEF. + assert(DefI && "Expecting a reaching def in MRI"); + if (DefI->isImplicitDef()) + return true; + } + return false; +} + + +bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const { + if (hasEHLabel(FP.SplitB)) // KLUDGE: see function definition + return false; + if (FP.TrueB && !isValidCandidate(FP.TrueB)) + return false; + if (FP.FalseB && !isValidCandidate(FP.FalseB)) + return false; + // Check the PHIs in the join block. If any of them use a register + // that is defined as IMPLICIT_DEF, do not convert this. This can + // legitimately happen if one side of the split never executes, but + // the compiler is unable to prove it. 
That side may then seem to + // provide an "undef" value to the join block, however it will never + // execute at run-time. If we convert this case, the "undef" will + // be used in a MUX instruction, and that may seem like actually + // using an undefined value to other optimizations. This could lead + // to trouble further down the optimization stream, cause assertions + // to fail, etc. + if (FP.JoinB) { + const MachineBasicBlock &B = *FP.JoinB; + for (auto &MI : B) { + if (!MI.isPHI()) + break; + if (usesUndefVReg(&MI)) + return false; + unsigned DefR = MI.getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI->getRegClass(DefR); + if (RC == &Hexagon::PredRegsRegClass) + return false; + } + } + return true; +} + + +unsigned HexagonEarlyIfConversion::computePhiCost(MachineBasicBlock *B) const { + assert(B->pred_size() <= 2); + if (B->pred_size() < 2) + return 0; + + unsigned Cost = 0; + MachineBasicBlock::const_iterator I, E = B->getFirstNonPHI(); + for (I = B->begin(); I != E; ++I) { + const MachineOperand &RO1 = I->getOperand(1); + const MachineOperand &RO3 = I->getOperand(3); + assert(RO1.isReg() && RO3.isReg()); + // Must have a MUX if the phi uses a subregister. + if (RO1.getSubReg() != 0 || RO3.getSubReg() != 0) { + Cost++; + continue; + } + MachineInstr *Def1 = MRI->getVRegDef(RO1.getReg()); + MachineInstr *Def3 = MRI->getVRegDef(RO3.getReg()); + if (!TII->isPredicable(Def1) || !TII->isPredicable(Def3)) + Cost++; + } + return Cost; +} + + +unsigned HexagonEarlyIfConversion::countPredicateDefs( + const MachineBasicBlock *B) const { + unsigned PredDefs = 0; + for (auto &MI : *B) { + for (ConstMIOperands MO(&MI); MO.isValid(); ++MO) { + if (!MO->isReg() || !MO->isDef()) + continue; + unsigned R = MO->getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + if (MRI->getRegClass(R) == &Hexagon::PredRegsRegClass) + PredDefs++; + } + } + return PredDefs; +} + + +bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const { + if (FP.TrueB && FP.FalseB) { + + // Do not IfCovert if the branch is one sided. + if (MBPI) { + BranchProbability Prob(9, 10); + if (MBPI->getEdgeProbability(FP.SplitB, FP.TrueB) > Prob) + return false; + if (MBPI->getEdgeProbability(FP.SplitB, FP.FalseB) > Prob) + return false; + } + + // If both sides are predicable, convert them if they join, and the + // join block has no other predecessors. + MachineBasicBlock *TSB = *FP.TrueB->succ_begin(); + MachineBasicBlock *FSB = *FP.FalseB->succ_begin(); + if (TSB != FSB) + return false; + if (TSB->pred_size() != 2) + return false; + } + + // Calculate the total size of the predicated blocks. + // Assume instruction counts without branches to be the approximation of + // the code size. If the predicated blocks are smaller than a packet size, + // approximate the spare room in the packet that could be filled with the + // predicated/speculated instructions. 
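[Editor's aside] The size heuristic described in the comment above can be read as the following compact model (an illustrative sketch only: the packet width here is an assumption, and the real pass additionally caps the number of predicate-register definitions):

struct SizeHeuristic {
  unsigned PacketSize = 4;  // assumed packet width, for illustration
  unsigned SizeLimit = 6;   // mirrors the -eif-limit default above

  bool profitable(unsigned TrueInsns, unsigned FalseInsns,
                  unsigned PhiMuxes) const {
    unsigned Spare = 0;
    if (TrueInsns < PacketSize)
      Spare += PacketSize - TrueInsns;   // room left in a packet
    if (FalseInsns < PacketSize)
      Spare += PacketSize - FalseInsns;
    unsigned TotalIn = TrueInsns + FalseInsns;
    if (TotalIn >= SizeLimit + Spare)
      return false;                      // predicated blocks too large
    return TotalIn + PhiMuxes < SizeLimit + Spare;  // muxes count too
  }
};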
+ unsigned TS = 0, FS = 0, Spare = 0; + if (FP.TrueB) { + TS = std::distance(FP.TrueB->begin(), FP.TrueB->getFirstTerminator()); + if (TS < HEXAGON_PACKET_SIZE) + Spare += HEXAGON_PACKET_SIZE-TS; + } + if (FP.FalseB) { + FS = std::distance(FP.FalseB->begin(), FP.FalseB->getFirstTerminator()); + if (FS < HEXAGON_PACKET_SIZE) + Spare += HEXAGON_PACKET_SIZE-TS; + } + unsigned TotalIn = TS+FS; + DEBUG(dbgs() << "Total number of instructions to be predicated/speculated: " + << TotalIn << ", spare room: " << Spare << "\n"); + if (TotalIn >= SizeLimit+Spare) + return false; + + // Count the number of PHI nodes that will need to be updated (converted + // to MUX). Those can be later converted to predicated instructions, so + // they aren't always adding extra cost. + // KLUDGE: Also, count the number of predicate register definitions in + // each block. The scheduler may increase the pressure of these and cause + // expensive spills (e.g. bitmnp01). + unsigned TotalPh = 0; + unsigned PredDefs = countPredicateDefs(FP.SplitB); + if (FP.JoinB) { + TotalPh = computePhiCost(FP.JoinB); + PredDefs += countPredicateDefs(FP.JoinB); + } else { + if (FP.TrueB && FP.TrueB->succ_size() > 0) { + MachineBasicBlock *SB = *FP.TrueB->succ_begin(); + TotalPh += computePhiCost(SB); + PredDefs += countPredicateDefs(SB); + } + if (FP.FalseB && FP.FalseB->succ_size() > 0) { + MachineBasicBlock *SB = *FP.FalseB->succ_begin(); + TotalPh += computePhiCost(SB); + PredDefs += countPredicateDefs(SB); + } + } + DEBUG(dbgs() << "Total number of extra muxes from converted phis: " + << TotalPh << "\n"); + if (TotalIn+TotalPh >= SizeLimit+Spare) + return false; + + DEBUG(dbgs() << "Total number of predicate registers: " << PredDefs << "\n"); + if (PredDefs > 4) + return false; + + return true; +} + + +bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B, + MachineLoop *L) { + bool Changed = false; + + // Visit all dominated blocks from the same loop first, then process B. + MachineDomTreeNode *N = MDT->getNode(B); + typedef GraphTraits<MachineDomTreeNode*> GTN; + // We will change CFG/DT during this traversal, so take precautions to + // avoid problems related to invalidated iterators. In fact, processing + // a child C of B cannot cause another child to be removed, but it can + // cause a new child to be added (which was a child of C before C itself + // was removed. This new child C, however, would have been processed + // prior to processing B, so there is no need to process it again. + // Simply keep a list of children of B, and traverse that list. + typedef SmallVector<MachineDomTreeNode*,4> DTNodeVectType; + DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N)); + for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) { + MachineBasicBlock *SB = (*I)->getBlock(); + if (!Deleted.count(SB)) + Changed |= visitBlock(SB, L); + } + // When walking down the dominator tree, we want to traverse through + // blocks from nested (other) loops, because they can dominate blocks + // that are in L. Skip the non-L blocks only after the tree traversal. 
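[Editor's aside] The snapshot-then-iterate idiom described in the comment above, reduced to plain C++ with stand-in types (not the MachineDominatorTree API):

#include <set>
#include <vector>

struct DomNode {
  std::vector<DomNode*> Children;
};

void visitDominated(DomNode *N, const std::set<DomNode*> &Deleted,
                    void (*Visit)(DomNode*)) {
  // Copy the child list up front: converting a child may re-parent blocks
  // under N, but those were already handled before their old parent was
  // removed, so they do not need to be visited again.
  std::vector<DomNode*> Snapshot(N->Children.begin(), N->Children.end());
  for (DomNode *C : Snapshot)
    if (!Deleted.count(C))
      Visit(C);
}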
+ if (MLI->getLoopFor(B) != L) + return Changed; + + FlowPattern FP; + if (!matchFlowPattern(B, L, FP)) + return Changed; + + if (!isValid(FP)) { + DEBUG(dbgs() << "Conversion is not valid\n"); + return Changed; + } + if (!isProfitable(FP)) { + DEBUG(dbgs() << "Conversion is not profitable\n"); + return Changed; + } + + convert(FP); + simplifyFlowGraph(FP); + return true; +} + + +bool HexagonEarlyIfConversion::visitLoop(MachineLoop *L) { + MachineBasicBlock *HB = L ? L->getHeader() : 0; + DEBUG((L ? dbgs() << "Visiting loop H:" << PrintMB(HB) + : dbgs() << "Visiting function") << "\n"); + bool Changed = false; + if (L) { + for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) + Changed |= visitLoop(*I); + } + + MachineBasicBlock *EntryB = GraphTraits<MachineFunction*>::getEntryNode(MFN); + Changed |= visitBlock(L ? HB : EntryB, L); + return Changed; +} + + +bool HexagonEarlyIfConversion::isPredicableStore(const MachineInstr *MI) + const { + // Exclude post-increment stores. Those return a value, so we cannot + // predicate them. + unsigned Opc = MI->getOpcode(); + using namespace Hexagon; + switch (Opc) { + // Store byte: + case S2_storerb_io: case S4_storerb_rr: + case S2_storerbabs: case S4_storeirb_io: case S2_storerbgp: + // Store halfword: + case S2_storerh_io: case S4_storerh_rr: + case S2_storerhabs: case S4_storeirh_io: case S2_storerhgp: + // Store upper halfword: + case S2_storerf_io: case S4_storerf_rr: + case S2_storerfabs: case S2_storerfgp: + // Store word: + case S2_storeri_io: case S4_storeri_rr: + case S2_storeriabs: case S4_storeiri_io: case S2_storerigp: + // Store doubleword: + case S2_storerd_io: case S4_storerd_rr: + case S2_storerdabs: case S2_storerdgp: + return true; + } + return false; +} + + +bool HexagonEarlyIfConversion::isSafeToSpeculate(const MachineInstr *MI) + const { + if (MI->mayLoad() || MI->mayStore()) + return false; + if (MI->isCall() || MI->isBarrier() || MI->isBranch()) + return false; + if (MI->hasUnmodeledSideEffects()) + return false; + + return true; +} + + +unsigned HexagonEarlyIfConversion::getCondStoreOpcode(unsigned Opc, + bool IfTrue) const { + // Exclude post-increment stores. + using namespace Hexagon; + switch (Opc) { + case S2_storerb_io: + return IfTrue ? S2_pstorerbt_io : S2_pstorerbf_io; + case S4_storerb_rr: + return IfTrue ? S4_pstorerbt_rr : S4_pstorerbf_rr; + case S2_storerbabs: + case S2_storerbgp: + return IfTrue ? S4_pstorerbt_abs : S4_pstorerbf_abs; + case S4_storeirb_io: + return IfTrue ? S4_storeirbt_io : S4_storeirbf_io; + case S2_storerh_io: + return IfTrue ? S2_pstorerht_io : S2_pstorerhf_io; + case S4_storerh_rr: + return IfTrue ? S4_pstorerht_rr : S4_pstorerhf_rr; + case S2_storerhabs: + case S2_storerhgp: + return IfTrue ? S4_pstorerht_abs : S4_pstorerhf_abs; + case S2_storerf_io: + return IfTrue ? S2_pstorerft_io : S2_pstorerff_io; + case S4_storerf_rr: + return IfTrue ? S4_pstorerft_rr : S4_pstorerff_rr; + case S2_storerfabs: + case S2_storerfgp: + return IfTrue ? S4_pstorerft_abs : S4_pstorerff_abs; + case S4_storeirh_io: + return IfTrue ? S4_storeirht_io : S4_storeirhf_io; + case S2_storeri_io: + return IfTrue ? S2_pstorerit_io : S2_pstorerif_io; + case S4_storeri_rr: + return IfTrue ? S4_pstorerit_rr : S4_pstorerif_rr; + case S2_storeriabs: + case S2_storerigp: + return IfTrue ? S4_pstorerit_abs : S4_pstorerif_abs; + case S4_storeiri_io: + return IfTrue ? S4_storeirit_io : S4_storeirif_io; + case S2_storerd_io: + return IfTrue ? 
S2_pstorerdt_io : S2_pstorerdf_io; + case S4_storerd_rr: + return IfTrue ? S4_pstorerdt_rr : S4_pstorerdf_rr; + case S2_storerdabs: + case S2_storerdgp: + return IfTrue ? S4_pstorerdt_abs : S4_pstorerdf_abs; + } + llvm_unreachable("Unexpected opcode"); + return 0; +} + + +void HexagonEarlyIfConversion::predicateInstr(MachineBasicBlock *ToB, + MachineBasicBlock::iterator At, MachineInstr *MI, + unsigned PredR, bool IfTrue) { + DebugLoc DL; + if (At != ToB->end()) + DL = At->getDebugLoc(); + else if (!ToB->empty()) + DL = ToB->back().getDebugLoc(); + + unsigned Opc = MI->getOpcode(); + + if (isPredicableStore(MI)) { + unsigned COpc = getCondStoreOpcode(Opc, IfTrue); + assert(COpc); + MachineInstrBuilder MIB = BuildMI(*ToB, At, DL, TII->get(COpc)) + .addReg(PredR); + for (MIOperands MO(MI); MO.isValid(); ++MO) + MIB.addOperand(*MO); + + // Set memory references. + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + MIB.setMemRefs(MMOBegin, MMOEnd); + + MI->eraseFromParent(); + return; + } + + if (Opc == Hexagon::J2_jump) { + MachineBasicBlock *TB = MI->getOperand(0).getMBB(); + const MCInstrDesc &D = TII->get(IfTrue ? Hexagon::J2_jumpt + : Hexagon::J2_jumpf); + BuildMI(*ToB, At, DL, D) + .addReg(PredR) + .addMBB(TB); + MI->eraseFromParent(); + return; + } + + // Print the offending instruction unconditionally as we are about to + // abort. + dbgs() << *MI; + llvm_unreachable("Unexpected instruction"); +} + + +// Predicate/speculate non-branch instructions from FromB into block ToB. +// Leave the branches alone, they will be handled later. Btw, at this point +// FromB should have at most one branch, and it should be unconditional. +void HexagonEarlyIfConversion::predicateBlockNB(MachineBasicBlock *ToB, + MachineBasicBlock::iterator At, MachineBasicBlock *FromB, + unsigned PredR, bool IfTrue) { + DEBUG(dbgs() << "Predicating block " << PrintMB(FromB) << "\n"); + MachineBasicBlock::iterator End = FromB->getFirstTerminator(); + MachineBasicBlock::iterator I, NextI; + + for (I = FromB->begin(); I != End; I = NextI) { + assert(!I->isPHI()); + NextI = std::next(I); + if (isSafeToSpeculate(&*I)) + ToB->splice(At, FromB, I); + else + predicateInstr(ToB, At, &*I, PredR, IfTrue); + } +} + + +void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB, + const FlowPattern &FP) { + // Visit all PHI nodes in the WhereB block and generate MUX instructions + // in the split block. Update the PHI nodes with the values of the MUX. + auto NonPHI = WhereB->getFirstNonPHI(); + for (auto I = WhereB->begin(); I != NonPHI; ++I) { + MachineInstr *PN = &*I; + // Registers and subregisters corresponding to TrueB, FalseB and SplitB. + unsigned TR = 0, TSR = 0, FR = 0, FSR = 0, SR = 0, SSR = 0; + for (int i = PN->getNumOperands()-2; i > 0; i -= 2) { + const MachineOperand &RO = PN->getOperand(i), &BO = PN->getOperand(i+1); + if (BO.getMBB() == FP.SplitB) + SR = RO.getReg(), SSR = RO.getSubReg(); + else if (BO.getMBB() == FP.TrueB) + TR = RO.getReg(), TSR = RO.getSubReg(); + else if (BO.getMBB() == FP.FalseB) + FR = RO.getReg(), FSR = RO.getSubReg(); + else + continue; + PN->RemoveOperand(i+1); + PN->RemoveOperand(i); + } + if (TR == 0) + TR = SR, TSR = SSR; + else if (FR == 0) + FR = SR, FSR = SSR; + assert(TR && FR); + + using namespace Hexagon; + unsigned DR = PN->getOperand(0).getReg(); + const TargetRegisterClass *RC = MRI->getRegClass(DR); + const MCInstrDesc &D = RC == &IntRegsRegClass ? 
TII->get(C2_mux) + : TII->get(MUX64_rr); + + MachineBasicBlock::iterator MuxAt = FP.SplitB->getFirstTerminator(); + DebugLoc DL; + if (MuxAt != FP.SplitB->end()) + DL = MuxAt->getDebugLoc(); + unsigned MuxR = MRI->createVirtualRegister(RC); + BuildMI(*FP.SplitB, MuxAt, DL, D, MuxR) + .addReg(FP.PredR) + .addReg(TR, 0, TSR) + .addReg(FR, 0, FSR); + + PN->addOperand(MachineOperand::CreateReg(MuxR, false)); + PN->addOperand(MachineOperand::CreateMBB(FP.SplitB)); + } +} + + +void HexagonEarlyIfConversion::convert(const FlowPattern &FP) { + MachineBasicBlock *TSB = 0, *FSB = 0; + MachineBasicBlock::iterator OldTI = FP.SplitB->getFirstTerminator(); + assert(OldTI != FP.SplitB->end()); + DebugLoc DL = OldTI->getDebugLoc(); + + if (FP.TrueB) { + TSB = *FP.TrueB->succ_begin(); + predicateBlockNB(FP.SplitB, OldTI, FP.TrueB, FP.PredR, true); + } + if (FP.FalseB) { + FSB = *FP.FalseB->succ_begin(); + MachineBasicBlock::iterator At = FP.SplitB->getFirstTerminator(); + predicateBlockNB(FP.SplitB, At, FP.FalseB, FP.PredR, false); + } + + // Regenerate new terminators in the split block and update the successors. + // First, remember any information that may be needed later and remove the + // existing terminators/successors from the split block. + MachineBasicBlock *SSB = 0; + FP.SplitB->erase(OldTI, FP.SplitB->end()); + while (FP.SplitB->succ_size() > 0) { + MachineBasicBlock *T = *FP.SplitB->succ_begin(); + // It's possible that the split block had a successor that is not a pre- + // dicated block. This could only happen if there was only one block to + // be predicated. Example: + // split_b: + // if (p) jump true_b + // jump unrelated2_b + // unrelated1_b: + // ... + // unrelated2_b: ; can have other predecessors, so it's not "false_b" + // jump other_b + // true_b: ; only reachable from split_b, can be predicated + // ... + // + // Find this successor (SSB) if it exists. + if (T != FP.TrueB && T != FP.FalseB) { + assert(!SSB); + SSB = T; + } + FP.SplitB->removeSuccessor(FP.SplitB->succ_begin()); + } + + // Insert new branches and update the successors of the split block. This + // may create unconditional branches to the layout successor, etc., but + // that will be cleaned up later. For now, make sure that correct code is + // generated. + if (FP.JoinB) { + assert(!SSB || SSB == FP.JoinB); + BuildMI(*FP.SplitB, FP.SplitB->end(), DL, TII->get(Hexagon::J2_jump)) + .addMBB(FP.JoinB); + FP.SplitB->addSuccessor(FP.JoinB); + } else { + bool HasBranch = false; + if (TSB) { + BuildMI(*FP.SplitB, FP.SplitB->end(), DL, TII->get(Hexagon::J2_jumpt)) + .addReg(FP.PredR) + .addMBB(TSB); + FP.SplitB->addSuccessor(TSB); + HasBranch = true; + } + if (FSB) { + const MCInstrDesc &D = HasBranch ? TII->get(Hexagon::J2_jump) + : TII->get(Hexagon::J2_jumpf); + MachineInstrBuilder MIB = BuildMI(*FP.SplitB, FP.SplitB->end(), DL, D); + if (!HasBranch) + MIB.addReg(FP.PredR); + MIB.addMBB(FSB); + FP.SplitB->addSuccessor(FSB); + } + if (SSB) { + // This cannot happen if both TSB and FSB are set. [TF]SB are the + // successor blocks of the TrueB and FalseB (or null of the TrueB + // or FalseB block is null). SSB is the potential successor block + // of the SplitB that is neither TrueB nor FalseB. + BuildMI(*FP.SplitB, FP.SplitB->end(), DL, TII->get(Hexagon::J2_jump)) + .addMBB(SSB); + FP.SplitB->addSuccessor(SSB); + } + } + + // What is left to do is to update the PHI nodes that could have entries + // referring to predicated blocks. 
+ if (FP.JoinB) { + updatePhiNodes(FP.JoinB, FP); + } else { + if (TSB) + updatePhiNodes(TSB, FP); + if (FSB) + updatePhiNodes(FSB, FP); + // Nothing to update in SSB, since SSB's predecessors haven't changed. + } +} + + +void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) { + DEBUG(dbgs() << "Removing block " << PrintMB(B) << "\n"); + + // Transfer the immediate dominator information from B to its descendants. + MachineDomTreeNode *N = MDT->getNode(B); + MachineDomTreeNode *IDN = N->getIDom(); + if (IDN) { + MachineBasicBlock *IDB = IDN->getBlock(); + typedef GraphTraits<MachineDomTreeNode*> GTN; + typedef SmallVector<MachineDomTreeNode*,4> DTNodeVectType; + DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N)); + for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) { + MachineBasicBlock *SB = (*I)->getBlock(); + MDT->changeImmediateDominator(SB, IDB); + } + } + + while (B->succ_size() > 0) + B->removeSuccessor(B->succ_begin()); + + for (auto I = B->pred_begin(), E = B->pred_end(); I != E; ++I) + (*I)->removeSuccessor(B, true); + + Deleted.insert(B); + MDT->eraseNode(B); + MFN->erase(B->getIterator()); +} + + +void HexagonEarlyIfConversion::eliminatePhis(MachineBasicBlock *B) { + DEBUG(dbgs() << "Removing phi nodes from block " << PrintMB(B) << "\n"); + MachineBasicBlock::iterator I, NextI, NonPHI = B->getFirstNonPHI(); + for (I = B->begin(); I != NonPHI; I = NextI) { + NextI = std::next(I); + MachineInstr *PN = &*I; + assert(PN->getNumOperands() == 3 && "Invalid phi node"); + MachineOperand &UO = PN->getOperand(1); + unsigned UseR = UO.getReg(), UseSR = UO.getSubReg(); + unsigned DefR = PN->getOperand(0).getReg(); + unsigned NewR = UseR; + if (UseSR) { + // MRI.replaceVregUsesWith does not allow to update the subregister, + // so instead of doing the use-iteration here, create a copy into a + // "non-subregistered" register. 
+ DebugLoc DL = PN->getDebugLoc(); + const TargetRegisterClass *RC = MRI->getRegClass(DefR); + NewR = MRI->createVirtualRegister(RC); + NonPHI = BuildMI(*B, NonPHI, DL, TII->get(TargetOpcode::COPY), NewR) + .addReg(UseR, 0, UseSR); + } + MRI->replaceRegWith(DefR, NewR); + B->erase(I); + } +} + + +void HexagonEarlyIfConversion::replacePhiEdges(MachineBasicBlock *OldB, + MachineBasicBlock *NewB) { + for (auto I = OldB->succ_begin(), E = OldB->succ_end(); I != E; ++I) { + MachineBasicBlock *SB = *I; + MachineBasicBlock::iterator P, N = SB->getFirstNonPHI(); + for (P = SB->begin(); P != N; ++P) { + MachineInstr *PN = &*P; + for (MIOperands MO(PN); MO.isValid(); ++MO) + if (MO->isMBB() && MO->getMBB() == OldB) + MO->setMBB(NewB); + } + } +} + + +void HexagonEarlyIfConversion::mergeBlocks(MachineBasicBlock *PredB, + MachineBasicBlock *SuccB) { + DEBUG(dbgs() << "Merging blocks " << PrintMB(PredB) << " and " + << PrintMB(SuccB) << "\n"); + bool TermOk = hasUncondBranch(SuccB); + eliminatePhis(SuccB); + TII->RemoveBranch(*PredB); + PredB->removeSuccessor(SuccB); + PredB->splice(PredB->end(), SuccB, SuccB->begin(), SuccB->end()); + MachineBasicBlock::succ_iterator I, E = SuccB->succ_end(); + for (I = SuccB->succ_begin(); I != E; ++I) + PredB->addSuccessor(*I); + PredB->normalizeSuccProbs(); + replacePhiEdges(SuccB, PredB); + removeBlock(SuccB); + if (!TermOk) + PredB->updateTerminator(); +} + + +void HexagonEarlyIfConversion::simplifyFlowGraph(const FlowPattern &FP) { + if (FP.TrueB) + removeBlock(FP.TrueB); + if (FP.FalseB) + removeBlock(FP.FalseB); + + FP.SplitB->updateTerminator(); + if (FP.SplitB->succ_size() != 1) + return; + + MachineBasicBlock *SB = *FP.SplitB->succ_begin(); + if (SB->pred_size() != 1) + return; + + // By now, the split block has only one successor (SB), and SB has only + // one predecessor. We can try to merge them. We will need to update ter- + // minators in FP.Split+SB, and that requires working AnalyzeBranch, which + // fails on Hexagon for blocks that have EH_LABELs. However, if SB ends + // with an unconditional branch, we won't need to touch the terminators. + if (!hasEHLabel(SB) || hasUncondBranch(SB)) + mergeBlocks(FP.SplitB, SB); +} + + +bool HexagonEarlyIfConversion::runOnMachineFunction(MachineFunction &MF) { + auto &ST = MF.getSubtarget(); + TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + MFN = &MF; + MRI = &MF.getRegInfo(); + MDT = &getAnalysis<MachineDominatorTree>(); + MLI = &getAnalysis<MachineLoopInfo>(); + MBPI = EnableHexagonBP ? &getAnalysis<MachineBranchProbabilityInfo>() : + nullptr; + + Deleted.clear(); + bool Changed = false; + + for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I) + Changed |= visitLoop(*I); + Changed |= visitLoop(0); + + return Changed; +} + +//===----------------------------------------------------------------------===// +// Public Constructor Functions +//===----------------------------------------------------------------------===// +FunctionPass *llvm::createHexagonEarlyIfConversion() { + return new HexagonEarlyIfConversion(); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp index e4c8d8f..6e2dbc0 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp @@ -74,7 +74,7 @@ bool HexagonExpandPredSpillCode::runOnMachineFunction(MachineFunction &Fn) { // Loop over all of the basic blocks. 
for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; // Traverse the basic block. for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); ++MII) { diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 21a8996..7a52a1c 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -147,6 +147,48 @@ static cl::opt<unsigned> ShrinkLimit("shrink-frame-limit", cl::init(UINT_MAX), cl::Hidden, cl::ZeroOrMore, cl::desc("Max count of stack frame " "shrink-wraps")); +static cl::opt<bool> UseAllocframe("use-allocframe", cl::init(true), + cl::Hidden, cl::desc("Use allocframe more conservatively")); + + +namespace llvm { + void initializeHexagonCallFrameInformationPass(PassRegistry&); + FunctionPass *createHexagonCallFrameInformation(); +} + +namespace { + class HexagonCallFrameInformation : public MachineFunctionPass { + public: + static char ID; + HexagonCallFrameInformation() : MachineFunctionPass(ID) { + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeHexagonCallFrameInformationPass(PR); + } + bool runOnMachineFunction(MachineFunction &MF) override; + }; + + char HexagonCallFrameInformation::ID = 0; +} + +bool HexagonCallFrameInformation::runOnMachineFunction(MachineFunction &MF) { + auto &HFI = *MF.getSubtarget<HexagonSubtarget>().getFrameLowering(); + bool NeedCFI = MF.getMMI().hasDebugInfo() || + MF.getFunction()->needsUnwindTableEntry(); + + if (!NeedCFI) + return false; + HFI.insertCFIInstructions(MF); + return true; +} + +INITIALIZE_PASS(HexagonCallFrameInformation, "hexagon-cfi", + "Hexagon call frame information", false, false) + +FunctionPass *llvm::createHexagonCallFrameInformation() { + return new HexagonCallFrameInformation(); +} + + namespace { /// Map a register pair Reg to the subregister that has the greater "number", /// i.e. D3 (aka R7:6) will be mapped to R7, etc. @@ -370,11 +412,11 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF, insertEpilogueInBlock(*EpilogB); } else { for (auto &B : MF) - if (!B.empty() && B.back().isReturn()) + if (B.isReturnBlock()) insertCSRRestoresInBlock(B, CSI, HRI); for (auto &B : MF) - if (!B.empty() && B.back().isReturn()) + if (B.isReturnBlock()) insertEpilogueInBlock(B); } } @@ -383,10 +425,7 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF, void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); - MachineModuleInfo &MMI = MF.getMMI(); - MachineBasicBlock::iterator MBBI = MBB.begin(); - auto &HTM = static_cast<const HexagonTargetMachine&>(MF.getTarget()); - auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget()); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); auto &HII = *HST.getInstrInfo(); auto &HRI = *HST.getRegisterInfo(); DebugLoc dl; @@ -405,10 +444,6 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const { bool AlignStack = (MaxAlign > getStackAlignment()); - // Check if frame moves are needed for EH. - bool needsFrameMoves = MMI.hasDebugInfo() || - MF.getFunction()->needsUnwindTableEntry(); - // Get the number of bytes to allocate from the FrameInfo. 
unsigned NumBytes = MFI->getStackSize(); unsigned SP = HRI.getStackRegister(); @@ -424,14 +459,7 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const { MI->eraseFromParent(); } - // - // Only insert ALLOCFRAME if we need to or at -O0 for the debugger. Think - // that this shouldn't be required, but doing so now because gcc does and - // gdb can't break at the start of the function without it. Will remove if - // this turns out to be a gdb bug. - // - bool NoOpt = (HTM.getOptLevel() == CodeGenOpt::None); - if (!NoOpt && !FuncInfo->hasClobberLR() && !hasFP(MF)) + if (!hasFP(MF)) return; // Check for overflow. @@ -469,92 +497,11 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB) const { .addReg(SP) .addImm(-int64_t(MaxAlign)); } - - if (needsFrameMoves) { - std::vector<MCCFIInstruction> Instructions = MMI.getFrameInstructions(); - MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); - - // Advance CFA. DW_CFA_def_cfa - unsigned DwFPReg = HRI.getDwarfRegNum(HRI.getFrameRegister(), true); - unsigned DwRAReg = HRI.getDwarfRegNum(HRI.getRARegister(), true); - - // CFA = FP + 8 - unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfa( - FrameLabel, DwFPReg, -8)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // R31 (return addr) = CFA - #4 - CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - FrameLabel, DwRAReg, -4)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - // R30 (frame ptr) = CFA - #8) - CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - FrameLabel, DwFPReg, -8)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - - unsigned int regsToMove[] = { - Hexagon::R1, Hexagon::R0, Hexagon::R3, Hexagon::R2, - Hexagon::R17, Hexagon::R16, Hexagon::R19, Hexagon::R18, - Hexagon::R21, Hexagon::R20, Hexagon::R23, Hexagon::R22, - Hexagon::R25, Hexagon::R24, Hexagon::R27, Hexagon::R26, - Hexagon::D0, Hexagon::D1, Hexagon::D8, Hexagon::D9, Hexagon::D10, - Hexagon::D11, Hexagon::D12, Hexagon::D13, Hexagon::NoRegister - }; - - const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); - - for (unsigned i = 0; regsToMove[i] != Hexagon::NoRegister; ++i) { - for (unsigned I = 0, E = CSI.size(); I < E; ++I) { - if (CSI[I].getReg() == regsToMove[i]) { - // Subtract 8 to make room for R30 and R31, which are added above. - int64_t Offset = getFrameIndexOffset(MF, CSI[I].getFrameIdx()) - 8; - - if (regsToMove[i] < Hexagon::D0 || regsToMove[i] > Hexagon::D15) { - unsigned DwarfReg = HRI.getDwarfRegNum(regsToMove[i], true); - unsigned CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(FrameLabel, - DwarfReg, Offset)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - } else { - // Split the double regs into subregs, and generate appropriate - // cfi_offsets. - // The only reason, we are split double regs is, llvm-mc does not - // understand paired registers for cfi_offset. 
- // Eg .cfi_offset r1:0, -64 - unsigned HiReg = getMax32BitSubRegister(regsToMove[i], HRI); - unsigned LoReg = getMax32BitSubRegister(regsToMove[i], HRI, false); - unsigned HiDwarfReg = HRI.getDwarfRegNum(HiReg, true); - unsigned LoDwarfReg = HRI.getDwarfRegNum(LoReg, true); - unsigned HiCFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(FrameLabel, - HiDwarfReg, Offset+4)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(HiCFIIndex); - unsigned LoCFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(FrameLabel, - LoDwarfReg, Offset)); - BuildMI(MBB, MBBI, dl, HII.get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(LoCFIIndex); - } - break; - } - } // for CSI.size() - } // for regsToMove - } // needsFrameMoves } void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const { MachineFunction &MF = *MBB.getParent(); - // - // Only insert deallocframe if we need to. Also at -O0. See comment - // in insertPrologueInBlock above. - // - if (!hasFP(MF) && MF.getTarget().getOptLevel() != CodeGenOpt::None) + if (!hasFP(MF)) return; auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget()); @@ -630,12 +577,172 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const { } +namespace { + bool IsAllocFrame(MachineBasicBlock::const_iterator It) { + if (!It->isBundle()) + return It->getOpcode() == Hexagon::S2_allocframe; + auto End = It->getParent()->instr_end(); + MachineBasicBlock::const_instr_iterator I = It.getInstrIterator(); + while (++I != End && I->isBundled()) + if (I->getOpcode() == Hexagon::S2_allocframe) + return true; + return false; + } + + MachineBasicBlock::iterator FindAllocFrame(MachineBasicBlock &B) { + for (auto &I : B) + if (IsAllocFrame(I)) + return I; + return B.end(); + } +} + + +void HexagonFrameLowering::insertCFIInstructions(MachineFunction &MF) const { + for (auto &B : MF) { + auto AF = FindAllocFrame(B); + if (AF == B.end()) + continue; + insertCFIInstructionsAt(B, ++AF); + } +} + + +void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator At) const { + MachineFunction &MF = *MBB.getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + MachineModuleInfo &MMI = MF.getMMI(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + auto &HII = *HST.getInstrInfo(); + auto &HRI = *HST.getRegisterInfo(); + + // If CFI instructions have debug information attached, something goes + // wrong with the final assembly generation: the prolog_end is placed + // in a wrong location. + DebugLoc DL; + const MCInstrDesc &CFID = HII.get(TargetOpcode::CFI_INSTRUCTION); + + MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); + + if (hasFP(MF)) { + unsigned DwFPReg = HRI.getDwarfRegNum(HRI.getFrameRegister(), true); + unsigned DwRAReg = HRI.getDwarfRegNum(HRI.getRARegister(), true); + + // Define CFA via an offset from the value of FP. + // + // -8 -4 0 (SP) + // --+----+----+--------------------- + // | FP | LR | increasing addresses --> + // --+----+----+--------------------- + // | +-- Old SP (before allocframe) + // +-- New FP (after allocframe) + // + // MCCFIInstruction::createDefCfa subtracts the offset from the register. + // MCCFIInstruction::createOffset takes the offset without sign change. 
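[Editor's aside] The frame layout drawn in the comment above boils down to this arithmetic (an illustrative sketch; addresses are arbitrary):

#include <cassert>
#include <cstdint>

struct FrameLayout {
  uint64_t FP;  // value of FP after allocframe: it points at the saved FP/LR pair
  uint64_t cfa()     const { return FP + 8; }    // createDefCfa(FP, -8)
  uint64_t savedLR() const { return cfa() - 4; } // createOffset(RA, -4)
  uint64_t savedFP() const { return cfa() - 8; } // createOffset(FP, -8)
};

int main() {
  FrameLayout F{0x1000};
  assert(F.cfa() == 0x1008);      // CFA is the old SP, 8 bytes above FP
  assert(F.savedLR() == 0x1004);  // LR sits just below the CFA
  assert(F.savedFP() == 0x1000);  // old FP sits where FP now points
}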
+ auto DefCfa = MCCFIInstruction::createDefCfa(FrameLabel, DwFPReg, -8); + BuildMI(MBB, At, DL, CFID) + .addCFIIndex(MMI.addFrameInst(DefCfa)); + // R31 (return addr) = CFA - 4 + auto OffR31 = MCCFIInstruction::createOffset(FrameLabel, DwRAReg, -4); + BuildMI(MBB, At, DL, CFID) + .addCFIIndex(MMI.addFrameInst(OffR31)); + // R30 (frame ptr) = CFA - 8 + auto OffR30 = MCCFIInstruction::createOffset(FrameLabel, DwFPReg, -8); + BuildMI(MBB, At, DL, CFID) + .addCFIIndex(MMI.addFrameInst(OffR30)); + } + + static unsigned int RegsToMove[] = { + Hexagon::R1, Hexagon::R0, Hexagon::R3, Hexagon::R2, + Hexagon::R17, Hexagon::R16, Hexagon::R19, Hexagon::R18, + Hexagon::R21, Hexagon::R20, Hexagon::R23, Hexagon::R22, + Hexagon::R25, Hexagon::R24, Hexagon::R27, Hexagon::R26, + Hexagon::D0, Hexagon::D1, Hexagon::D8, Hexagon::D9, + Hexagon::D10, Hexagon::D11, Hexagon::D12, Hexagon::D13, + Hexagon::NoRegister + }; + + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + + for (unsigned i = 0; RegsToMove[i] != Hexagon::NoRegister; ++i) { + unsigned Reg = RegsToMove[i]; + auto IfR = [Reg] (const CalleeSavedInfo &C) -> bool { + return C.getReg() == Reg; + }; + auto F = std::find_if(CSI.begin(), CSI.end(), IfR); + if (F == CSI.end()) + continue; + + // Subtract 8 to make room for R30 and R31, which are added above. + unsigned FrameReg; + int64_t Offset = getFrameIndexReference(MF, F->getFrameIdx(), FrameReg) - 8; + + if (Reg < Hexagon::D0 || Reg > Hexagon::D15) { + unsigned DwarfReg = HRI.getDwarfRegNum(Reg, true); + auto OffReg = MCCFIInstruction::createOffset(FrameLabel, DwarfReg, + Offset); + BuildMI(MBB, At, DL, CFID) + .addCFIIndex(MMI.addFrameInst(OffReg)); + } else { + // Split the double regs into subregs, and generate appropriate + // cfi_offsets. + // The only reason, we are split double regs is, llvm-mc does not + // understand paired registers for cfi_offset. + // Eg .cfi_offset r1:0, -64 + + unsigned HiReg = HRI.getSubReg(Reg, Hexagon::subreg_hireg); + unsigned LoReg = HRI.getSubReg(Reg, Hexagon::subreg_loreg); + unsigned HiDwarfReg = HRI.getDwarfRegNum(HiReg, true); + unsigned LoDwarfReg = HRI.getDwarfRegNum(LoReg, true); + auto OffHi = MCCFIInstruction::createOffset(FrameLabel, HiDwarfReg, + Offset+4); + BuildMI(MBB, At, DL, CFID) + .addCFIIndex(MMI.addFrameInst(OffHi)); + auto OffLo = MCCFIInstruction::createOffset(FrameLabel, LoDwarfReg, + Offset); + BuildMI(MBB, At, DL, CFID) + .addCFIIndex(MMI.addFrameInst(OffLo)); + } + } +} + + bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const HexagonMachineFunctionInfo *FuncInfo = - MF.getInfo<HexagonMachineFunctionInfo>(); - return MFI->hasCalls() || MFI->getStackSize() > 0 || - FuncInfo->hasClobberLR(); + auto &MFI = *MF.getFrameInfo(); + auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + + bool HasFixed = MFI.getNumFixedObjects(); + bool HasPrealloc = const_cast<MachineFrameInfo&>(MFI) + .getLocalFrameObjectCount(); + bool HasExtraAlign = HRI.needsStackRealignment(MF); + bool HasAlloca = MFI.hasVarSizedObjects(); + + // Insert ALLOCFRAME if we need to or at -O0 for the debugger. Think + // that this shouldn't be required, but doing so now because gcc does and + // gdb can't break at the start of the function without it. Will remove if + // this turns out to be a gdb bug. + // + if (MF.getTarget().getOptLevel() == CodeGenOpt::None) + return true; + + // By default we want to use SP (since it's always there). FP requires + // some setup (i.e. 
ALLOCFRAME). + // Fixed and preallocated objects need FP if the distance from them to + // the SP is unknown (as is with alloca or aligna). + if ((HasFixed || HasPrealloc) && (HasAlloca || HasExtraAlign)) + return true; + + if (MFI.getStackSize() > 0) { + if (UseAllocframe) + return true; + } + + if (MFI.hasCalls() || + MF.getInfo<HexagonMachineFunctionInfo>()->hasClobberLR()) + return true; + + return false; } @@ -718,9 +825,89 @@ static void addCalleeSaveRegistersAsImpOperand(MachineInstr *Inst, } -int HexagonFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { - return MF.getFrameInfo()->getObjectOffset(FI); +int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, unsigned &FrameReg) const { + auto &MFI = *MF.getFrameInfo(); + auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + + // Large parts of this code are shared with HRI::eliminateFrameIndex. + int Offset = MFI.getObjectOffset(FI); + bool HasAlloca = MFI.hasVarSizedObjects(); + bool HasExtraAlign = HRI.needsStackRealignment(MF); + bool NoOpt = MF.getTarget().getOptLevel() == CodeGenOpt::None; + + unsigned SP = HRI.getStackRegister(), FP = HRI.getFrameRegister(); + unsigned AP = 0; + if (const MachineInstr *AI = getAlignaInstr(MF)) + AP = AI->getOperand(0).getReg(); + unsigned FrameSize = MFI.getStackSize(); + + bool UseFP = false, UseAP = false; // Default: use SP (except at -O0). + // Use FP at -O0, except when there are objects with extra alignment. + // That additional alignment requirement may cause a pad to be inserted, + // which will make it impossible to use FP to access objects located + // past the pad. + if (NoOpt && !HasExtraAlign) + UseFP = true; + if (MFI.isFixedObjectIndex(FI) || MFI.isObjectPreAllocated(FI)) { + // Fixed and preallocated objects will be located before any padding + // so FP must be used to access them. + UseFP |= (HasAlloca || HasExtraAlign); + } else { + if (HasAlloca) { + if (HasExtraAlign) + UseAP = true; + else + UseFP = true; + } + } + + // If FP was picked, then there had better be FP. + bool HasFP = hasFP(MF); + assert((HasFP || !UseFP) && "This function must have frame pointer"); + + // Having FP implies allocframe. Allocframe will store extra 8 bytes: + // FP/LR. If the base register is used to access an object across these + // 8 bytes, then the offset will need to be adjusted by 8. + // + // After allocframe: + // HexagonISelLowering adds 8 to ---+ + // the offsets of all stack-based | + // arguments (*) | + // | + // getObjectOffset < 0 0 8 getObjectOffset >= 8 + // ------------------------+-----+------------------------> increasing + // <local objects> |FP/LR| <input arguments> addresses + // -----------------+------+-----+------------------------> + // | | + // SP/AP point --+ +-- FP points here (**) + // somewhere on + // this side of FP/LR + // + // (*) See LowerFormalArguments. The FP/LR is assumed to be present. + // (**) *FP == old-FP. FP+0..7 are the bytes of FP/LR. + + // The lowering assumes that FP/LR is present, and so the offsets of + // the formal arguments start at 8. If FP/LR is not there we need to + // reduce the offset by 8. + if (Offset > 0 && !HasFP) + Offset -= 8; + + if (UseFP) + FrameReg = FP; + else if (UseAP) + FrameReg = AP; + else + FrameReg = SP; + + // Calculate the actual offset in the instruction. If there is no FP + // (in other words, no allocframe), then SP will not be adjusted (i.e. 
+ // there will be no SP -= FrameSize), so the frame size should not be + // added to the calculated offset. + int RealOffset = Offset; + if (!UseFP && !UseAP && HasFP) + RealOffset = FrameSize+Offset; + return RealOffset; } @@ -731,7 +918,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI = MBB.begin(); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); if (useSpillFunction(MF, CSI)) { unsigned MaxReg = getMaxCalleeSavedReg(CSI, HRI); @@ -739,7 +926,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, // Call spill function. DebugLoc DL = MI != MBB.end() ? MI->getDebugLoc() : DebugLoc(); MachineInstr *SaveRegsCall = - BuildMI(MBB, MI, DL, TII.get(Hexagon::SAVE_REGISTERS_CALL_V4)) + BuildMI(MBB, MI, DL, HII.get(Hexagon::SAVE_REGISTERS_CALL_V4)) .addExternalSymbol(SpillFun); // Add callee-saved registers as use. addCalleeSaveRegistersAsImpOperand(SaveRegsCall, MaxReg, false); @@ -757,7 +944,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB, bool IsKill = !HRI.isEHReturnCalleeSaveReg(Reg); int FI = CSI[i].getFrameIdx(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); - TII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI); + HII.storeRegToStackSlot(MBB, MI, Reg, IsKill, FI, RC, &HRI); if (IsKill) MBB.addLiveIn(Reg); } @@ -772,7 +959,7 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI = MBB.getFirstTerminator(); MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); if (useRestoreFunction(MF, CSI)) { bool HasTC = hasTailCall(MBB) || !hasReturn(MBB); @@ -787,14 +974,14 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, if (HasTC) { unsigned ROpc = Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4; - DeallocCall = BuildMI(MBB, MI, DL, TII.get(ROpc)) + DeallocCall = BuildMI(MBB, MI, DL, HII.get(ROpc)) .addExternalSymbol(RestoreFn); } else { // The block has a return. MachineBasicBlock::iterator It = MBB.getFirstTerminator(); assert(It->isReturn() && std::next(It) == MBB.end()); unsigned ROpc = Hexagon::RESTORE_DEALLOC_RET_JMP_V4; - DeallocCall = BuildMI(MBB, It, DL, TII.get(ROpc)) + DeallocCall = BuildMI(MBB, It, DL, HII.get(ROpc)) .addExternalSymbol(RestoreFn); // Transfer the function live-out registers. DeallocCall->copyImplicitOps(MF, It); @@ -807,7 +994,7 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB, unsigned Reg = CSI[i].getReg(); const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg); int FI = CSI[i].getFrameIdx(); - TII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI); + HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI); } return true; } @@ -832,9 +1019,9 @@ void HexagonFrameLowering::processFunctionBeforeFrameFinalized( // via AP, which may not be available at the particular place in the program. 
MachineFrameInfo *MFI = MF.getFrameInfo(); bool HasAlloca = MFI->hasVarSizedObjects(); - bool HasAligna = (MFI->getMaxAlignment() > getStackAlignment()); + bool NeedsAlign = (MFI->getMaxAlignment() > getStackAlignment()); - if (!HasAlloca || !HasAligna) + if (!HasAlloca || !NeedsAlign) return; unsigned LFS = MFI->getLocalFrameSize(); @@ -864,13 +1051,13 @@ static bool needToReserveScavengingSpillSlots(MachineFunction &MF, // Check for an unused caller-saved register. for ( ; *CallerSavedRegs; ++CallerSavedRegs) { MCPhysReg FreeReg = *CallerSavedRegs; - if (MRI.isPhysRegUsed(FreeReg)) + if (!MRI.reg_nodbg_empty(FreeReg)) continue; // Check aliased register usage. bool IsCurrentRegUsed = false; for (MCRegAliasIterator AI(FreeReg, &HRI, false); AI.isValid(); ++AI) - if (MRI.isPhysRegUsed(*AI)) { + if (!MRI.reg_nodbg_empty(*AI)) { IsCurrentRegUsed = true; break; } @@ -896,7 +1083,7 @@ bool HexagonFrameLowering::replacePredRegPseudoSpillCode(MachineFunction &MF) // Loop over all of the basic blocks. for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; // Traverse the basic block. MachineBasicBlock::iterator NextII; for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end(); @@ -1210,7 +1397,8 @@ bool HexagonFrameLowering::needsAligna(const MachineFunction &MF) const { } -MachineInstr *HexagonFrameLowering::getAlignaInstr(MachineFunction &MF) const { +const MachineInstr *HexagonFrameLowering::getAlignaInstr( + const MachineFunction &MF) const { for (auto &B : MF) for (auto &I : B) if (I.getOpcode() == Hexagon::ALIGNA) @@ -1219,6 +1407,7 @@ MachineInstr *HexagonFrameLowering::getAlignaInstr(MachineFunction &MF) const { } +// FIXME: Use Function::optForSize(). 
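[Editor's aside] The base-register and offset selection in getFrameIndexReference above can be summarized as follows (an illustrative model, not the actual HexagonFrameLowering code; parameter names are made up and "AP" stands for the aligna result register):

enum class Base { SP, FP, AP };
struct FIRef { Base Reg; int Offset; };

FIRef frameIndexReference(int ObjOffset, unsigned FrameSize, bool HasFP,
                          bool FixedOrPrealloc, bool HasAlloca,
                          bool HasExtraAlign, bool AtO0) {
  bool UseFP = false, UseAP = false;     // default: SP
  if (AtO0 && !HasExtraAlign)
    UseFP = true;                        // FP is stable at -O0
  if (FixedOrPrealloc) {
    // Fixed/preallocated objects sit before any alignment padding.
    UseFP = UseFP || HasAlloca || HasExtraAlign;
  } else if (HasAlloca) {
    if (HasExtraAlign)
      UseAP = true;
    else
      UseFP = true;
  }

  // Formal-argument offsets assume an 8-byte FP/LR pair on the stack;
  // drop that assumption when there is no allocframe.
  if (ObjOffset > 0 && !HasFP)
    ObjOffset -= 8;

  int Real = ObjOffset;
  if (!UseFP && !UseAP && HasFP)         // SP-relative with allocframe present
    Real = static_cast<int>(FrameSize) + ObjOffset;

  Base Reg = UseFP ? Base::FP : (UseAP ? Base::AP : Base::SP);
  return {Reg, Real};
}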
inline static bool isOptSize(const MachineFunction &MF) { AttributeSet AF = MF.getFunction()->getAttributes(); return AF.hasAttribute(AttributeSet::FunctionIndex, @@ -1226,8 +1415,7 @@ inline static bool isOptSize(const MachineFunction &MF) { } inline static bool isMinSize(const MachineFunction &MF) { - AttributeSet AF = MF.getFunction()->getAttributes(); - return AF.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + return MF.getFunction()->optForMinSize(); } @@ -1289,4 +1477,3 @@ bool HexagonFrameLowering::useRestoreFunction(MachineFunction &MF, : SpillFuncThreshold; return Threshold < NumCSI; } - diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h index d39ee2c..683b303 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonFrameLowering.h @@ -51,7 +51,8 @@ public: bool targetHandlesStackFrameRounding() const override { return true; } - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; bool hasFP(const MachineFunction &MF) const override; const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) @@ -73,7 +74,9 @@ public: const override; bool needsAligna(const MachineFunction &MF) const; - MachineInstr *getAlignaInstr(MachineFunction &MF) const; + const MachineInstr *getAlignaInstr(const MachineFunction &MF) const; + + void insertCFIInstructions(MachineFunction &MF) const; private: typedef std::vector<CalleeSavedInfo> CSIVect; @@ -86,6 +89,8 @@ private: const HexagonRegisterInfo &HRI) const; bool insertCSRRestoresInBlock(MachineBasicBlock &MBB, const CSIVect &CSI, const HexagonRegisterInfo &HRI) const; + void insertCFIInstructionsAt(MachineBasicBlock &MBB, + MachineBasicBlock::iterator At) const; void adjustForCalleeSavedRegsSpillCall(MachineFunction &MF) const; bool replacePredRegPseudoSpillCode(MachineFunction &MF) const; @@ -94,7 +99,7 @@ private: void findShrunkPrologEpilog(MachineFunction &MF, MachineBasicBlock *&PrologB, MachineBasicBlock *&EpilogB) const; - bool shouldInlineCSR(llvm::MachineFunction&, const CSIVect&) const; + bool shouldInlineCSR(llvm::MachineFunction &MF, const CSIVect &CSI) const; bool useSpillFunction(MachineFunction &MF, const CSIVect &CSI) const; bool useRestoreFunction(MachineFunction &MF, const CSIVect &CSI) const; }; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp index 4d32208..f26e2ff 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -195,7 +195,7 @@ bool HexagonGenExtract::convert(Instruction *In) { return false; } - IRBuilder<> IRB(BB, In); + IRBuilder<> IRB(In); Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu : Intrinsic::hexagon_S2_extractup; Module *Mod = BB->getParent()->getParent(); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index 096da94..64a2b6c 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -77,9 +77,8 @@ namespace { namespace { // Set of virtual registers, based on BitVector. 
struct RegisterSet : private BitVector { - RegisterSet() : BitVector() {} + RegisterSet() = default; explicit RegisterSet(unsigned s, bool t = false) : BitVector(s, t) {} - RegisterSet(const RegisterSet &RS) : BitVector(RS) {} using BitVector::clear; @@ -1496,7 +1495,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { // version of DCE that preserves lifetime markers. Without it, merging // of stack objects can fail to recognize and merge disjoint objects // leading to unnecessary stack growth. - Changed |= removeDeadCode(MDT->getRootNode()); + Changed = removeDeadCode(MDT->getRootNode()); const HexagonEvaluator HE(*HRI, *MRI, *HII, MF); BitTracker BTLoc(HE, MF); @@ -1534,7 +1533,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { } if (IFMap.empty()) - return false; + return Changed; { NamedRegionTimer _T("pruning", "hexinsert", TimingDetail); @@ -1547,7 +1546,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { } if (IFMap.empty()) - return false; + return Changed; { NamedRegionTimer _T("selection", "hexinsert", TimingDetail); @@ -1572,13 +1571,15 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) { for (unsigned i = 0, n = Out.size(); i < n; ++i) IFMap.erase(Out[i]); } + if (IFMap.empty()) + return Changed; { NamedRegionTimer _T("generation", "hexinsert", TimingDetail); - Changed = generateInserts(); + generateInserts(); } - return Changed; + return true; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp new file mode 100644 index 0000000..c059d56 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenMux.cpp @@ -0,0 +1,319 @@ +//===--- HexagonGenMux.cpp ------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// During instruction selection, MUX instructions are generated for +// conditional assignments. Since such assignments often present an +// opportunity to predicate instructions, HexagonExpandCondsets +// expands MUXes into pairs of conditional transfers, and then proceeds +// with predication of the producers/consumers of the registers involved. +// This happens after exiting from the SSA form, but before the machine +// instruction scheduler. After the scheduler and after the register +// allocation there can be cases of pairs of conditional transfers +// resulting from a MUX where neither of them was further predicated. If +// these transfers are now placed far enough from the instruction defining +// the predicate register, they cannot use the .new form. In such cases it +// is better to collapse them back to a single MUX instruction. 
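A minimal sketch of the rewrite described in the comment above, using hypothetical register numbers; the conditional-transfer opcodes (A2_tfrt, A2_tfrf) and the C2_mux opcode are the ones matched and emitted by isCondTransfer and getMuxOpcode later in this file:

    p0 = cmp.eq(r9, #0)
    ...                          // p0 defined too far back for the .new forms
    if (p0) r5 = r3              // A2_tfrt
    if (!p0) r5 = r4             // A2_tfrf

  is collapsed back into

    p0 = cmp.eq(r9, #0)
    ...
    r5 = mux(p0, r3, r4)         // C2_mux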
+ +#define DEBUG_TYPE "hexmux" + +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "HexagonTargetMachine.h" + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonGenMux(); + void initializeHexagonGenMuxPass(PassRegistry& Registry); +} + +namespace { + class HexagonGenMux : public MachineFunctionPass { + public: + static char ID; + HexagonGenMux() : MachineFunctionPass(ID), HII(0), HRI(0) { + initializeHexagonGenMuxPass(*PassRegistry::getPassRegistry()); + } + const char *getPassName() const override { + return "Hexagon generate mux instructions"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; + + struct CondsetInfo { + unsigned PredR; + unsigned TrueX, FalseX; + CondsetInfo() : PredR(0), TrueX(UINT_MAX), FalseX(UINT_MAX) {} + }; + struct DefUseInfo { + BitVector Defs, Uses; + DefUseInfo() : Defs(), Uses() {} + DefUseInfo(const BitVector &D, const BitVector &U) : Defs(D), Uses(U) {} + }; + struct MuxInfo { + MachineBasicBlock::iterator At; + unsigned DefR, PredR; + MachineOperand *SrcT, *SrcF; + MachineInstr *Def1, *Def2; + MuxInfo(MachineBasicBlock::iterator It, unsigned DR, unsigned PR, + MachineOperand *TOp, MachineOperand *FOp, + MachineInstr *D1, MachineInstr *D2) + : At(It), DefR(DR), PredR(PR), SrcT(TOp), SrcF(FOp), Def1(D1), + Def2(D2) {} + }; + typedef DenseMap<MachineInstr*,unsigned> InstrIndexMap; + typedef DenseMap<unsigned,DefUseInfo> DefUseInfoMap; + typedef SmallVector<MuxInfo,4> MuxInfoList; + + bool isRegPair(unsigned Reg) const { + return Hexagon::DoubleRegsRegClass.contains(Reg); + } + void getSubRegs(unsigned Reg, BitVector &SRs) const; + void expandReg(unsigned Reg, BitVector &Set) const; + void getDefsUses(const MachineInstr *MI, BitVector &Defs, + BitVector &Uses) const; + void buildMaps(MachineBasicBlock &B, InstrIndexMap &I2X, + DefUseInfoMap &DUM); + bool isCondTransfer(unsigned Opc) const; + unsigned getMuxOpcode(const MachineOperand &Src1, + const MachineOperand &Src2) const; + bool genMuxInBlock(MachineBasicBlock &B); + }; + + char HexagonGenMux::ID = 0; +} + +INITIALIZE_PASS(HexagonGenMux, "hexagon-mux", + "Hexagon generate mux instructions", false, false) + + +void HexagonGenMux::getSubRegs(unsigned Reg, BitVector &SRs) const { + for (MCSubRegIterator I(Reg, HRI); I.isValid(); ++I) + SRs[*I] = true; +} + + +void HexagonGenMux::expandReg(unsigned Reg, BitVector &Set) const { + if (isRegPair(Reg)) + getSubRegs(Reg, Set); + else + Set[Reg] = true; +} + + +void HexagonGenMux::getDefsUses(const MachineInstr *MI, BitVector &Defs, + BitVector &Uses) const { + // First, get the implicit defs and uses for this instruction. + unsigned Opc = MI->getOpcode(); + const MCInstrDesc &D = HII->get(Opc); + if (const MCPhysReg *R = D.ImplicitDefs) + while (*R) + expandReg(*R++, Defs); + if (const MCPhysReg *R = D.ImplicitUses) + while (*R) + expandReg(*R++, Uses); + + // Look over all operands, and collect explicit defs and uses. + for (ConstMIOperands Mo(MI); Mo.isValid(); ++Mo) { + if (!Mo->isReg() || Mo->isImplicit()) + continue; + unsigned R = Mo->getReg(); + BitVector &Set = Mo->isDef() ? 
Defs : Uses; + expandReg(R, Set); + } +} + + +void HexagonGenMux::buildMaps(MachineBasicBlock &B, InstrIndexMap &I2X, + DefUseInfoMap &DUM) { + unsigned Index = 0; + unsigned NR = HRI->getNumRegs(); + BitVector Defs(NR), Uses(NR); + + for (MachineBasicBlock::iterator I = B.begin(), E = B.end(); I != E; ++I) { + MachineInstr *MI = &*I; + I2X.insert(std::make_pair(MI, Index)); + Defs.reset(); + Uses.reset(); + getDefsUses(MI, Defs, Uses); + DUM.insert(std::make_pair(Index, DefUseInfo(Defs, Uses))); + Index++; + } +} + + +bool HexagonGenMux::isCondTransfer(unsigned Opc) const { + switch (Opc) { + case Hexagon::A2_tfrt: + case Hexagon::A2_tfrf: + case Hexagon::C2_cmoveit: + case Hexagon::C2_cmoveif: + return true; + } + return false; +} + + +unsigned HexagonGenMux::getMuxOpcode(const MachineOperand &Src1, + const MachineOperand &Src2) const { + bool IsReg1 = Src1.isReg(), IsReg2 = Src2.isReg(); + if (IsReg1) + return IsReg2 ? Hexagon::C2_mux : Hexagon::C2_muxir; + if (IsReg2) + return Hexagon::C2_muxri; + + // Neither is a register. The first source is extendable, but the second + // is not (s8). + if (Src2.isImm() && isInt<8>(Src2.getImm())) + return Hexagon::C2_muxii; + + return 0; +} + + +bool HexagonGenMux::genMuxInBlock(MachineBasicBlock &B) { + bool Changed = false; + InstrIndexMap I2X; + DefUseInfoMap DUM; + buildMaps(B, I2X, DUM); + + typedef DenseMap<unsigned,CondsetInfo> CondsetMap; + CondsetMap CM; + MuxInfoList ML; + + MachineBasicBlock::iterator NextI, End = B.end(); + for (MachineBasicBlock::iterator I = B.begin(); I != End; I = NextI) { + MachineInstr *MI = &*I; + NextI = std::next(I); + unsigned Opc = MI->getOpcode(); + if (!isCondTransfer(Opc)) + continue; + unsigned DR = MI->getOperand(0).getReg(); + if (isRegPair(DR)) + continue; + + unsigned PR = MI->getOperand(1).getReg(); + unsigned Idx = I2X.lookup(MI); + CondsetMap::iterator F = CM.find(DR); + bool IfTrue = HII->isPredicatedTrue(Opc); + + // If there is no record of a conditional transfer for this register, + // or the predicate register differs, create a new record for it. + if (F != CM.end() && F->second.PredR != PR) { + CM.erase(F); + F = CM.end(); + } + if (F == CM.end()) { + auto It = CM.insert(std::make_pair(DR, CondsetInfo())); + F = It.first; + F->second.PredR = PR; + } + CondsetInfo &CI = F->second; + if (IfTrue) + CI.TrueX = Idx; + else + CI.FalseX = Idx; + if (CI.TrueX == UINT_MAX || CI.FalseX == UINT_MAX) + continue; + + // There is now a complete definition of DR, i.e. we have the predicate + // register, the definition if-true, and definition if-false. + + // First, check if both definitions are far enough from the definition + // of the predicate register. + unsigned MinX = std::min(CI.TrueX, CI.FalseX); + unsigned MaxX = std::max(CI.TrueX, CI.FalseX); + unsigned SearchX = (MaxX > 4) ? MaxX-4 : 0; + bool NearDef = false; + for (unsigned X = SearchX; X < MaxX; ++X) { + const DefUseInfo &DU = DUM.lookup(X); + if (!DU.Defs[PR]) + continue; + NearDef = true; + break; + } + if (NearDef) + continue; + + // The predicate register is not defined in the last few instructions. + // Check if the conversion to MUX is possible (either "up", i.e. at the + // place of the earlier partial definition, or "down", where the later + // definition is located). Examine all defs and uses between these two + // definitions. + // SR1, SR2 - source registers from the first and the second definition. 
+ MachineBasicBlock::iterator It1 = B.begin(), It2 = B.begin(); + std::advance(It1, MinX); + std::advance(It2, MaxX); + MachineInstr *Def1 = It1, *Def2 = It2; + MachineOperand *Src1 = &Def1->getOperand(2), *Src2 = &Def2->getOperand(2); + unsigned SR1 = Src1->isReg() ? Src1->getReg() : 0; + unsigned SR2 = Src2->isReg() ? Src2->getReg() : 0; + bool Failure = false, CanUp = true, CanDown = true; + for (unsigned X = MinX+1; X < MaxX; X++) { + const DefUseInfo &DU = DUM.lookup(X); + if (DU.Defs[PR] || DU.Defs[DR] || DU.Uses[DR]) { + Failure = true; + break; + } + if (CanDown && DU.Defs[SR1]) + CanDown = false; + if (CanUp && DU.Defs[SR2]) + CanUp = false; + } + if (Failure || (!CanUp && !CanDown)) + continue; + + MachineOperand *SrcT = (MinX == CI.TrueX) ? Src1 : Src2; + MachineOperand *SrcF = (MinX == CI.FalseX) ? Src1 : Src2; + // Prefer "down", since this will move the MUX farther away from the + // predicate definition. + MachineBasicBlock::iterator At = CanDown ? Def2 : Def1; + ML.push_back(MuxInfo(At, DR, PR, SrcT, SrcF, Def1, Def2)); + } + + for (unsigned I = 0, N = ML.size(); I < N; ++I) { + MuxInfo &MX = ML[I]; + MachineBasicBlock &B = *MX.At->getParent(); + DebugLoc DL = MX.At->getDebugLoc(); + unsigned MxOpc = getMuxOpcode(*MX.SrcT, *MX.SrcF); + if (!MxOpc) + continue; + BuildMI(B, MX.At, DL, HII->get(MxOpc), MX.DefR) + .addReg(MX.PredR) + .addOperand(*MX.SrcT) + .addOperand(*MX.SrcF); + B.erase(MX.Def1); + B.erase(MX.Def2); + Changed = true; + } + + return Changed; +} + +bool HexagonGenMux::runOnMachineFunction(MachineFunction &MF) { + HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + bool Changed = false; + for (auto &I : MF) + Changed |= genMuxInBlock(I); + return Changed; +} + +FunctionPass *llvm::createHexagonGenMux() { + return new HexagonGenMux(); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp index 6905c4f..d9675b5 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -250,7 +250,7 @@ Register HexagonGenPredicate::getPredRegFor(const Register &Reg) { unsigned NewPR = MRI->createVirtualRegister(PredRC); // For convertible instructions, do not modify them, so that they can - // be coverted later. Generate a copy from Reg to NewPR. + // be converted later. Generate a copy from Reg to NewPR. if (isConvertibleToPredForm(DefI)) { MachineBasicBlock::iterator DefIt = DefI; BuildMI(B, std::next(DefIt), DL, TII->get(TargetOpcode::COPY), NewPR) diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp index 53b6bf6..d20a809 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -727,9 +727,9 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop, // Phis that may feed into the loop. LoopFeederMap LoopFeederPhi; - // Check if the inital value may be zero and can be decremented in the first + // Check if the initial value may be zero and can be decremented in the first // iteration. If the value is zero, the endloop instruction will not decrement - // the loop counter, so we shoudn't generate a hardware loop in this case. + // the loop counter, so we shouldn't generate a hardware loop in this case. 
if (loopCountMayWrapOrUnderFlow(Start, End, Loop->getLoopPreheader(), Loop, LoopFeederPhi)) return nullptr; @@ -1288,14 +1288,14 @@ bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI, typedef MachineBasicBlock::instr_iterator instr_iterator; // Check if things are in order to begin with. - for (instr_iterator I = BumpI, E = BB->instr_end(); I != E; ++I) + for (instr_iterator I(BumpI), E = BB->instr_end(); I != E; ++I) if (&*I == CmpI) return true; // Out of order. unsigned PredR = CmpI->getOperand(0).getReg(); bool FoundBump = false; - instr_iterator CmpIt = CmpI, NextIt = std::next(CmpIt); + instr_iterator CmpIt = CmpI->getIterator(), NextIt = std::next(CmpIt); for (instr_iterator I = NextIt, E = BB->instr_end(); I != E; ++I) { MachineInstr *In = &*I; for (unsigned i = 0, n = In->getNumOperands(); i < n; ++i) { @@ -1307,9 +1307,7 @@ bool HexagonHardwareLoops::orderBumpCompare(MachineInstr *BumpI, } if (In == BumpI) { - instr_iterator After = BumpI; - instr_iterator From = CmpI; - BB->splice(std::next(After), BB, From); + BB->splice(++BumpI->getIterator(), BB, CmpI->getIterator()); FoundBump = true; break; } @@ -1440,7 +1438,7 @@ bool HexagonHardwareLoops::loopCountMayWrapOrUnderFlow( if (Comparison::isSigned(Cmp)) return false; - // Check if there is a comparison of the inital value. If the initial value + // Check if there is a comparison of the initial value. If the initial value // is greater than or not equal to another value, then assume this is a // range check. if ((Cmp & Comparison::G) || Cmp == Comparison::NE) @@ -1850,7 +1848,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop( } MachineBasicBlock *NewPH = MF->CreateMachineBasicBlock(); - MF->insert(Header, NewPH); + MF->insert(Header->getIterator(), NewPH); if (Header->pred_size() > 2) { // Ensure that the header has only two predecessors: the preheader and diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 9123057..a0da945 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -50,16 +50,21 @@ namespace { class HexagonDAGToDAGISel : public SelectionDAGISel { const HexagonTargetMachine& HTM; const HexagonSubtarget *HST; + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; public: explicit HexagonDAGToDAGISel(HexagonTargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), HTM(tm) { + : SelectionDAGISel(tm, OptLevel), HTM(tm), HST(nullptr), HII(nullptr), + HRI(nullptr) { initializeHexagonDAGToDAGISelPass(*PassRegistry::getPassRegistry()); } bool runOnMachineFunction(MachineFunction &MF) override { // Reset the subtarget each time through. HST = &MF.getSubtarget<HexagonSubtarget>(); + HII = HST->getInstrInfo(); + HRI = HST->getRegisterInfo(); SelectionDAGISel::runOnMachineFunction(MF); return true; } @@ -104,7 +109,6 @@ public: SDNode *SelectConstantFP(SDNode *N); SDNode *SelectAdd(SDNode *N); SDNode *SelectBitOp(SDNode *N); - bool isConstExtProfitable(SDNode *N) const; // XformMskToBitPosU5Imm - Returns the bit position which // the single bit 32 bit mask represents. @@ -139,8 +143,8 @@ public: // type i32 where the negative literal is transformed into a positive literal // for use in -= memops. 
inline SDValue XformM5ToU5Imm(signed Imm, SDLoc DL) { - assert( (Imm >= -31 && Imm <= -1) && "Constant out of range for Memops"); - return CurDAG->getTargetConstant( - Imm, DL, MVT::i32); + assert((Imm >= -31 && Imm <= -1) && "Constant out of range for Memops"); + return CurDAG->getTargetConstant(-Imm, DL, MVT::i32); } // XformU7ToU7M1Imm - Return a target constant decremented by 1, in range @@ -203,11 +207,10 @@ void llvm::initializeHexagonDAGToDAGISelPass(PassRegistry &Registry) { // Intrinsics that return a a predicate. -static unsigned doesIntrinsicReturnPredicate(unsigned ID) -{ +static bool doesIntrinsicReturnPredicate(unsigned ID) { switch (ID) { default: - return 0; + return false; case Intrinsic::hexagon_C2_cmpeq: case Intrinsic::hexagon_C2_cmpgt: case Intrinsic::hexagon_C2_cmpgtu: @@ -244,7 +247,7 @@ static unsigned doesIntrinsicReturnPredicate(unsigned ID) case Intrinsic::hexagon_C2_tfrrp: case Intrinsic::hexagon_S2_tstbit_i: case Intrinsic::hexagon_S2_tstbit_r: - return 1; + return true; } } @@ -258,8 +261,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD, SDNode *OffsetNode = Offset.getNode(); int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue(); - const HexagonInstrInfo &TII = *HST->getInstrInfo(); - if (TII.isValidAutoIncImm(LoadedVT, Val)) { + if (HII->isValidAutoIncImm(LoadedVT, Val)) { SDValue TargetConst = CurDAG->getTargetConstant(Val, dl, MVT::i32); SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32, MVT::Other, Base, TargetConst, @@ -312,8 +314,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD, SDNode *OffsetNode = Offset.getNode(); int32_t Val = cast<ConstantSDNode>(OffsetNode)->getSExtValue(); - const HexagonInstrInfo &TII = *HST->getInstrInfo(); - if (TII.isValidAutoIncImm(LoadedVT, Val)) { + if (HII->isValidAutoIncImm(LoadedVT, Val)) { SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32); SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32); SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, @@ -378,29 +379,46 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) { // loads. ISD::LoadExtType ExtType = LD->getExtensionType(); bool IsZeroExt = (ExtType == ISD::ZEXTLOAD || ExtType == ISD::EXTLOAD); + bool HasVecOffset = false; // Figure out the opcode. - const HexagonInstrInfo &TII = *HST->getInstrInfo(); if (LoadedVT == MVT::i64) { - if (TII.isValidAutoIncImm(LoadedVT, Val)) + if (HII->isValidAutoIncImm(LoadedVT, Val)) Opcode = Hexagon::L2_loadrd_pi; else Opcode = Hexagon::L2_loadrd_io; } else if (LoadedVT == MVT::i32) { - if (TII.isValidAutoIncImm(LoadedVT, Val)) + if (HII->isValidAutoIncImm(LoadedVT, Val)) Opcode = Hexagon::L2_loadri_pi; else Opcode = Hexagon::L2_loadri_io; } else if (LoadedVT == MVT::i16) { - if (TII.isValidAutoIncImm(LoadedVT, Val)) + if (HII->isValidAutoIncImm(LoadedVT, Val)) Opcode = IsZeroExt ? Hexagon::L2_loadruh_pi : Hexagon::L2_loadrh_pi; else Opcode = IsZeroExt ? Hexagon::L2_loadruh_io : Hexagon::L2_loadrh_io; } else if (LoadedVT == MVT::i8) { - if (TII.isValidAutoIncImm(LoadedVT, Val)) + if (HII->isValidAutoIncImm(LoadedVT, Val)) Opcode = IsZeroExt ? Hexagon::L2_loadrub_pi : Hexagon::L2_loadrb_pi; else Opcode = IsZeroExt ? 
Hexagon::L2_loadrub_io : Hexagon::L2_loadrb_io; + } else if (LoadedVT == MVT::v16i32 || LoadedVT == MVT::v8i64 || + LoadedVT == MVT::v32i16 || LoadedVT == MVT::v64i8) { + HasVecOffset = true; + if (HII->isValidAutoIncImm(LoadedVT, Val)) { + Opcode = Hexagon::V6_vL32b_pi; + } + else + Opcode = Hexagon::V6_vL32b_ai; + // 128B + } else if (LoadedVT == MVT::v32i32 || LoadedVT == MVT::v16i64 || + LoadedVT == MVT::v64i16 || LoadedVT == MVT::v128i8) { + HasVecOffset = true; + if (HII->isValidAutoIncImm(LoadedVT, Val)) { + Opcode = Hexagon::V6_vL32b_pi_128B; + } + else + Opcode = Hexagon::V6_vL32b_ai_128B; } else llvm_unreachable("unknown memory type"); @@ -411,7 +429,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) { if (LD->getValueType(0) == MVT::i64 && ExtType == ISD::SEXTLOAD) return SelectIndexedLoadSignExtend64(LD, Opcode, dl); - if (TII.isValidAutoIncImm(LoadedVT, Val)) { + if (HII->isValidAutoIncImm(LoadedVT, Val)) { SDValue TargetConstVal = CurDAG->getTargetConstant(Val, dl, MVT::i32); SDNode* Result = CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0), @@ -420,15 +438,25 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) { MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = LD->getMemOperand(); cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1); - const SDValue Froms[] = { SDValue(LD, 0), - SDValue(LD, 1), - SDValue(LD, 2) - }; - const SDValue Tos[] = { SDValue(Result, 0), - SDValue(Result, 1), - SDValue(Result, 2) - }; - ReplaceUses(Froms, Tos, 3); + if (HasVecOffset) { + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 2) + }; + const SDValue Tos[] = { SDValue(Result, 0), + SDValue(Result, 2) + }; + ReplaceUses(Froms, Tos, 2); + } else { + const SDValue Froms[] = { SDValue(LD, 0), + SDValue(LD, 1), + SDValue(LD, 2) + }; + const SDValue Tos[] = { SDValue(Result, 0), + SDValue(Result, 1), + SDValue(Result, 2) + }; + ReplaceUses(Froms, Tos, 3); + } return Result; } else { SDValue TargetConst0 = CurDAG->getTargetConstant(0, dl, MVT::i32); @@ -487,8 +515,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) { // Offset value must be within representable range // and must have correct alignment properties. - const HexagonInstrInfo &TII = *HST->getInstrInfo(); - if (TII.isValidAutoIncImm(StoredVT, Val)) { + if (HII->isValidAutoIncImm(StoredVT, Val)) { unsigned Opcode = 0; // Figure out the post inc version of opcode. 
@@ -496,7 +523,15 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) { else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_pi; else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_pi; else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_pi; - else llvm_unreachable("unknown memory type"); + else if (StoredVT == MVT::v16i32 || StoredVT == MVT::v8i64 || + StoredVT == MVT::v32i16 || StoredVT == MVT::v64i8) { + Opcode = Hexagon::V6_vS32b_pi; + } + // 128B + else if (StoredVT == MVT::v32i32 || StoredVT == MVT::v16i64 || + StoredVT == MVT::v64i16 || StoredVT == MVT::v128i8) { + Opcode = Hexagon::V6_vS32b_pi_128B; + } else llvm_unreachable("unknown memory type"); if (ST->isTruncatingStore() && ValueVT.getSizeInBits() == 64) { assert(StoredVT.getSizeInBits() < 64 && "Not a truncating store"); @@ -530,6 +565,13 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) { else if (StoredVT == MVT::i32) Opcode = Hexagon::S2_storeri_io; else if (StoredVT == MVT::i16) Opcode = Hexagon::S2_storerh_io; else if (StoredVT == MVT::i8) Opcode = Hexagon::S2_storerb_io; + else if (StoredVT == MVT::v16i32 || StoredVT == MVT::v8i64 || + StoredVT == MVT::v32i16 || StoredVT == MVT::v64i8) + Opcode = Hexagon::V6_vS32b_ai; + // 128B + else if (StoredVT == MVT::v32i32 || StoredVT == MVT::v16i64 || + StoredVT == MVT::v64i16 || StoredVT == MVT::v128i8) + Opcode = Hexagon::V6_vS32b_ai_128B; else llvm_unreachable("unknown memory type"); // Build regular store. @@ -1113,14 +1155,12 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { } if (Opc == ISD::AND) { - if (((ValueVT == MVT::i32) && - (!((Val & 0x80000000) || (Val & 0x7fffffff)))) || - ((ValueVT == MVT::i64) && - (!((Val & 0x8000000000000000) || (Val & 0x7fffffff))))) - // If it's simple AND, do the normal op. - return SelectCode(N); - else + // Check if this is a bit-clearing AND, if not select code the usual way. + if ((ValueVT == MVT::i32 && isPowerOf2_32(~Val)) || + (ValueVT == MVT::i64 && isPowerOf2_64(~Val))) Val = ~Val; + else + return SelectCode(N); } // If OR or AND is being fed by shl, srl and, sra don't do this change, @@ -1128,7 +1168,8 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { // Traverse the DAG to see if there is shl, srl and sra. if (Opc == ISD::OR || Opc == ISD::AND) { switch (N->getOperand(0)->getOpcode()) { - default: break; + default: + break; case ISD::SRA: case ISD::SRL: case ISD::SHL: @@ -1137,23 +1178,24 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { } // Make sure it's power of 2. - unsigned bitpos = 0; + unsigned BitPos = 0; if (Opc != ISD::FABS && Opc != ISD::FNEG) { - if (((ValueVT == MVT::i32) && !isPowerOf2_32(Val)) || - ((ValueVT == MVT::i64) && !isPowerOf2_64(Val))) + if ((ValueVT == MVT::i32 && !isPowerOf2_32(Val)) || + (ValueVT == MVT::i64 && !isPowerOf2_64(Val))) return SelectCode(N); // Get the bit position. - bitpos = countTrailingZeros(uint64_t(Val)); + BitPos = countTrailingZeros(uint64_t(Val)); } else { // For fabs and fneg, it's always the 31st bit. - bitpos = 31; + BitPos = 31; } unsigned BitOpc = 0; // Set the right opcode for bitwise operations. - switch(Opc) { - default: llvm_unreachable("Only bit-wise/abs/neg operations are allowed."); + switch (Opc) { + default: + llvm_unreachable("Only bit-wise/abs/neg operations are allowed."); case ISD::AND: case ISD::FABS: BitOpc = Hexagon::S2_clrbit_i; @@ -1169,7 +1211,7 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { SDNode *Result; // Get the right SDVal for the opcode. 
- SDValue SDVal = CurDAG->getTargetConstant(bitpos, dl, MVT::i32); + SDValue SDVal = CurDAG->getTargetConstant(BitPos, dl, MVT::i32); if (ValueVT == MVT::i32 || ValueVT == MVT::f32) { Result = CurDAG->getMachineNode(BitOpc, dl, ValueVT, @@ -1198,7 +1240,7 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { MVT::i32, SDValue(Reg, 0)); // Clear/set/toggle hi or lo registers depending on the bit position. - if (SubValueVT != MVT::f32 && bitpos < 32) { + if (SubValueVT != MVT::f32 && BitPos < 32) { SDNode *Result0 = CurDAG->getMachineNode(BitOpc, dl, SubValueVT, SubregLO, SDVal); const SDValue Ops[] = { RegClass, SubregHI, SubregHiIdx, @@ -1207,7 +1249,7 @@ SDNode *HexagonDAGToDAGISel::SelectBitOp(SDNode *N) { dl, ValueVT, Ops); } else { if (Opc != ISD::FABS && Opc != ISD::FNEG) - SDVal = CurDAG->getTargetConstant(bitpos - 32, dl, MVT::i32); + SDVal = CurDAG->getTargetConstant(BitPos-32, dl, MVT::i32); SDNode *Result0 = CurDAG->getMachineNode(BitOpc, dl, SubValueVT, SubregHI, SDVal); const SDValue Ops[] = { RegClass, SDValue(Result0, 0), SubregHiIdx, @@ -1328,25 +1370,12 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, return false; } -bool HexagonDAGToDAGISel::isConstExtProfitable(SDNode *N) const { - unsigned UseCount = 0; - unsigned CallCount = 0; - for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) { - // Ignore call instructions. - if (I->getOpcode() == ISD::CopyToReg) - ++CallCount; - UseCount++; - } - - return (UseCount <= 1) || (CallCount > 1); - -} void HexagonDAGToDAGISel::PreprocessISelDAG() { SelectionDAG &DAG = *CurDAG; std::vector<SDNode*> Nodes; - for (auto I = DAG.allnodes_begin(), E = DAG.allnodes_end(); I != E; ++I) - Nodes.push_back(I); + for (SDNode &Node : DAG.allnodes()) + Nodes.push_back(&Node); // Simplify: (or (select c x 0) z) -> (select c (or x z) z) // (or (select c 0 y) z) -> (select c z (or y z)) @@ -1397,11 +1426,10 @@ void HexagonDAGToDAGISel::EmitFunctionEntryCode() { return; MachineFrameInfo *MFI = MF->getFrameInfo(); - MachineBasicBlock *EntryBB = MF->begin(); + MachineBasicBlock *EntryBB = &MF->front(); unsigned AR = FuncInfo->CreateReg(MVT::i32); unsigned MaxA = MFI->getMaxAlignment(); - auto &HII = *HST.getInstrInfo(); - BuildMI(EntryBB, DebugLoc(), HII.get(Hexagon::ALIGNA), AR) + BuildMI(EntryBB, DebugLoc(), HII->get(Hexagon::ALIGNA), AR) .addImm(MaxA); MF->getInfo<HexagonMachineFunctionInfo>()->setStackAlignBaseVReg(AR); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index c739afb..0167090 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -41,8 +41,8 @@ using namespace llvm; #define DEBUG_TYPE "hexagon-lowering" -static cl::opt<bool> -EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden, +static cl::opt<bool> EmitJumpTables("hexagon-emit-jump-tables", + cl::init(true), cl::Hidden, cl::desc("Control jump table emission on Hexagon target")); static cl::opt<bool> EnableHexSDNodeSched("enable-hexagon-sdnode-sched", @@ -98,6 +98,9 @@ public: } // Implement calling convention for Hexagon. 
+ +static bool IsHvxVectorType(MVT ty); + static bool CC_Hexagon(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, @@ -114,6 +117,11 @@ CC_Hexagon64(unsigned ValNo, MVT ValVT, ISD::ArgFlagsTy ArgFlags, CCState &State); static bool +CC_HexagonVector(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); + +static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); @@ -129,6 +137,11 @@ RetCC_Hexagon64(unsigned ValNo, MVT ValVT, ISD::ArgFlagsTy ArgFlags, CCState &State); static bool +RetCC_HexagonVector(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); + +static bool CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { @@ -169,15 +182,43 @@ CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT, State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); return false; } + if (LocVT == MVT::v2i64 || LocVT == MVT::v4i32 || LocVT == MVT::v8i16 || + LocVT == MVT::v16i8) { + ofst = State.AllocateStack(16, 16); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::v4i64 || LocVT == MVT::v8i32 || LocVT == MVT::v16i16 || + LocVT == MVT::v32i8) { + ofst = State.AllocateStack(32, 32); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::v8i64 || LocVT == MVT::v16i32 || LocVT == MVT::v32i16 || + LocVT == MVT::v64i8 || LocVT == MVT::v512i1) { + ofst = State.AllocateStack(64, 64); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 || + LocVT == MVT::v128i8 || LocVT == MVT::v1024i1) { + ofst = State.AllocateStack(128, 128); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + if (LocVT == MVT::v32i64 || LocVT == MVT::v64i32 || LocVT == MVT::v128i16 || + LocVT == MVT::v256i8) { + ofst = State.AllocateStack(256, 256); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo)); + return false; + } + llvm_unreachable(nullptr); } -static bool -CC_Hexagon (unsigned ValNo, MVT ValVT, - MVT LocVT, CCValAssign::LocInfo LocInfo, - ISD::ArgFlagsTy ArgFlags, CCState &State) { - +static bool CC_Hexagon (unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { if (ArgFlags.isByVal()) { // Passed on stack. unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), @@ -213,6 +254,17 @@ CC_Hexagon (unsigned ValNo, MVT ValVT, return false; } + if (LocVT == MVT::v8i32 || LocVT == MVT::v16i16 || LocVT == MVT::v32i8) { + unsigned Offset = State.AllocateStack(ArgFlags.getByValSize(), 32); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + + if (IsHvxVectorType(LocVT)) { + if (!CC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) + return false; + } + return true; // CC didn't match. 
} @@ -260,10 +312,82 @@ static bool CC_Hexagon64(unsigned ValNo, MVT ValVT, return false; } +static bool CC_HexagonVector(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + + static const MCPhysReg VecLstS[] = { Hexagon::V0, Hexagon::V1, + Hexagon::V2, Hexagon::V3, + Hexagon::V4, Hexagon::V5, + Hexagon::V6, Hexagon::V7, + Hexagon::V8, Hexagon::V9, + Hexagon::V10, Hexagon::V11, + Hexagon::V12, Hexagon::V13, + Hexagon::V14, Hexagon::V15}; + static const MCPhysReg VecLstD[] = { Hexagon::W0, Hexagon::W1, + Hexagon::W2, Hexagon::W3, + Hexagon::W4, Hexagon::W5, + Hexagon::W6, Hexagon::W7}; + auto &MF = State.getMachineFunction(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + bool UseHVX = HST.useHVXOps(); + bool UseHVXDbl = HST.useHVXDblOps(); + + if ((UseHVX && !UseHVXDbl) && + (LocVT == MVT::v8i64 || LocVT == MVT::v16i32 || LocVT == MVT::v32i16 || + LocVT == MVT::v64i8 || LocVT == MVT::v512i1)) { + if (unsigned Reg = State.AllocateReg(VecLstS)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + unsigned Offset = State.AllocateStack(64, 64); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + if ((UseHVX && !UseHVXDbl) && + (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 || + LocVT == MVT::v128i8)) { + if (unsigned Reg = State.AllocateReg(VecLstD)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + unsigned Offset = State.AllocateStack(128, 128); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + // 128B Mode + if ((UseHVX && UseHVXDbl) && + (LocVT == MVT::v32i64 || LocVT == MVT::v64i32 || LocVT == MVT::v128i16 || + LocVT == MVT::v256i8)) { + if (unsigned Reg = State.AllocateReg(VecLstD)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + unsigned Offset = State.AllocateStack(256, 256); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + if ((UseHVX && UseHVXDbl) && + (LocVT == MVT::v16i64 || LocVT == MVT::v32i32 || LocVT == MVT::v64i16 || + LocVT == MVT::v128i8 || LocVT == MVT::v1024i1)) { + if (unsigned Reg = State.AllocateReg(VecLstS)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + unsigned Offset = State.AllocateStack(128, 128); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; + } + return true; +} + static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State) { - + auto &MF = State.getMachineFunction(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + bool UseHVX = HST.useHVXOps(); + bool UseHVXDbl = HST.useHVXDblOps(); if (LocVT == MVT::i1 || LocVT == MVT::i8 || @@ -282,8 +406,24 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, } else if (LocVT == MVT::v8i8 || LocVT == MVT::v4i16 || LocVT == MVT::v2i32) { LocVT = MVT::i64; LocInfo = CCValAssign::BCvt; + } else if (LocVT == MVT::v64i8 || LocVT == MVT::v32i16 || + LocVT == MVT::v16i32 || LocVT == MVT::v8i64 || + LocVT == MVT::v512i1) { + LocVT = MVT::v16i32; + ValVT = MVT::v16i32; + LocInfo = CCValAssign::Full; + } else if (LocVT == MVT::v128i8 || LocVT == MVT::v64i16 || + LocVT == MVT::v32i32 || LocVT == MVT::v16i64 || + (LocVT == MVT::v1024i1 && UseHVX && UseHVXDbl)) { + LocVT = MVT::v32i32; + ValVT = 
MVT::v32i32; + LocInfo = CCValAssign::Full; + } else if (LocVT == MVT::v256i8 || LocVT == MVT::v128i16 || + LocVT == MVT::v64i32 || LocVT == MVT::v32i64) { + LocVT = MVT::v64i32; + ValVT = MVT::v64i32; + LocInfo = CCValAssign::Full; } - if (LocVT == MVT::i32 || LocVT == MVT::f32) { if (!RetCC_Hexagon32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; @@ -293,7 +433,10 @@ static bool RetCC_Hexagon(unsigned ValNo, MVT ValVT, if (!RetCC_Hexagon64(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) return false; } - + if (LocVT == MVT::v16i32 || LocVT == MVT::v32i32 || LocVT == MVT::v64i32) { + if (!RetCC_HexagonVector(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State)) + return false; + } return true; // CC didn't match. } @@ -328,6 +471,52 @@ static bool RetCC_Hexagon64(unsigned ValNo, MVT ValVT, return false; } +static bool RetCC_HexagonVector(unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + auto &MF = State.getMachineFunction(); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); + bool UseHVX = HST.useHVXOps(); + bool UseHVXDbl = HST.useHVXDblOps(); + + unsigned OffSiz = 64; + if (LocVT == MVT::v16i32) { + if (unsigned Reg = State.AllocateReg(Hexagon::V0)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + } else if (LocVT == MVT::v32i32) { + unsigned Req = (UseHVX && UseHVXDbl) ? Hexagon::V0 : Hexagon::W0; + if (unsigned Reg = State.AllocateReg(Req)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + OffSiz = 128; + } else if (LocVT == MVT::v64i32) { + if (unsigned Reg = State.AllocateReg(Hexagon::W0)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } + OffSiz = 256; + } + + unsigned Offset = State.AllocateStack(OffSiz, OffSiz); + State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo)); + return false; +} + +void HexagonTargetLowering::promoteLdStType(EVT VT, EVT PromotedLdStVT) { + if (VT != PromotedLdStVT) { + setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), + PromotedLdStVT.getSimpleVT()); + + setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); + AddPromotedToType(ISD::STORE, VT.getSimpleVT(), + PromotedLdStVT.getSimpleVT()); + } +} + SDValue HexagonTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { @@ -351,6 +540,15 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, MachinePointerInfo(), MachinePointerInfo()); } +static bool IsHvxVectorType(MVT ty) { + return (ty == MVT::v8i64 || ty == MVT::v16i32 || ty == MVT::v32i16 || + ty == MVT::v64i8 || + ty == MVT::v16i64 || ty == MVT::v32i32 || ty == MVT::v64i16 || + ty == MVT::v128i8 || + ty == MVT::v32i64 || ty == MVT::v64i32 || ty == MVT::v128i16 || + ty == MVT::v256i8 || + ty == MVT::v512i1 || ty == MVT::v1024i1); +} // LowerReturn - Lower ISD::RET. If a struct is larger than 8 bytes and is // passed by value, the function prototype is modified to return void and @@ -463,19 +661,15 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Check for varargs. 
int NumNamedVarArgParams = -1; - if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) - { - const Function* CalleeFn = nullptr; - Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, MVT::i32); - if ((CalleeFn = dyn_cast<Function>(GA->getGlobal()))) - { + if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = GAN->getGlobal(); + Callee = DAG.getTargetGlobalAddress(GV, dl, MVT::i32); + if (const Function* F = dyn_cast<Function>(GV)) { // If a function has zero args and is a vararg function, that's // disallowed so it must be an undeclared function. Do not assume // varargs if the callee is undefined. - if (CalleeFn->isVarArg() && - CalleeFn->getFunctionType()->getNumParams() != 0) { - NumNamedVarArgParams = CalleeFn->getFunctionType()->getNumParams(); - } + if (F->isVarArg() && F->getFunctionType()->getNumParams() != 0) + NumNamedVarArgParams = F->getFunctionType()->getNumParams(); } } @@ -519,11 +713,16 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT); + bool NeedsArgAlign = false; + unsigned LargestAlignSeen = 0; // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; + // Record if we need > 8 byte alignment on an argument. + bool ArgAlign = IsHvxVectorType(VA.getValVT()); + NeedsArgAlign |= ArgAlign; // Promote the value if needed. switch (VA.getLocInfo()) { @@ -549,13 +748,17 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue MemAddr = DAG.getConstant(LocMemOffset, dl, StackPtr.getValueType()); MemAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, MemAddr); + if (ArgAlign) + LargestAlignSeen = std::max(LargestAlignSeen, + VA.getLocVT().getStoreSizeInBits() >> 3); if (Flags.isByVal()) { // The argument is a struct passed by value. According to LLVM, "Arg" // is is pointer. MemOpChains.push_back(CreateCopyOfByValArgument(Arg, MemAddr, Chain, Flags, DAG, dl)); } else { - MachinePointerInfo LocPI = MachinePointerInfo::getStack(LocMemOffset); + MachinePointerInfo LocPI = MachinePointerInfo::getStack( + DAG.getMachineFunction(), LocMemOffset); SDValue S = DAG.getStore(Chain, dl, Arg, MemAddr, LocPI, false, false, 0); MemOpChains.push_back(S); @@ -569,6 +772,17 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } + if (NeedsArgAlign && Subtarget.hasV60TOps()) { + DEBUG(dbgs() << "Function needs byte stack align due to call args\n"); + MachineFrameInfo* MFI = DAG.getMachineFunction().getFrameInfo(); + // V6 vectors passed by value have 64 or 128 byte alignment depending + // on whether we are 64 byte vector mode or 128 byte. + bool UseHVXDbl = Subtarget.useHVXDblOps(); + assert(Subtarget.useHVXOps()); + const unsigned ObjAlign = UseHVXDbl ? 128 : 64; + LargestAlignSeen = std::max(LargestAlignSeen, ObjAlign); + MFI->ensureMaxAlignment(LargestAlignSeen); + } // Transform all store nodes into one single node because all store // nodes are independent of each other. 
if (!MemOpChains.empty()) @@ -613,12 +827,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. - if (flag_aligned_memcpy) { - const char *MemcpyName = - "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes"; - Callee = DAG.getTargetExternalSymbol(MemcpyName, PtrVT); - flag_aligned_memcpy = false; - } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, PtrVT); } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { @@ -668,7 +877,19 @@ static bool getIndexedAddressParts(SDNode *Ptr, EVT VT, if (Ptr->getOpcode() != ISD::ADD) return false; - if (VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) { + auto &HST = static_cast<const HexagonSubtarget&>(DAG.getSubtarget()); + bool UseHVX = HST.useHVXOps(); + bool UseHVXDbl = HST.useHVXDblOps(); + + bool ValidHVXDblType = + (UseHVX && UseHVXDbl) && (VT == MVT::v32i32 || VT == MVT::v16i64 || + VT == MVT::v64i16 || VT == MVT::v128i8); + bool ValidHVXType = + UseHVX && !UseHVXDbl && (VT == MVT::v16i32 || VT == MVT::v8i64 || + VT == MVT::v32i16 || VT == MVT::v64i8); + + if (ValidHVXDblType || ValidHVXType || + VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) { isInc = (Ptr->getOpcode() == ISD::ADD); Base = Ptr->getOperand(0); Offset = Ptr->getOperand(1); @@ -679,23 +900,6 @@ static bool getIndexedAddressParts(SDNode *Ptr, EVT VT, return false; } -// TODO: Put this function along with the other isS* functions in -// HexagonISelDAGToDAG.cpp into a common file. Or better still, use the -// functions defined in HexagonOperands.td. -static bool Is_PostInc_S4_Offset(SDNode * S, int ShiftAmount) { - ConstantSDNode *N = cast<ConstantSDNode>(S); - - // immS4 predicate - True if the immediate fits in a 4-bit sign extended. - // field. - int64_t v = (int64_t)N->getSExtValue(); - int64_t m = 0; - if (ShiftAmount > 0) { - m = v % ShiftAmount; - v = v >> ShiftAmount; - } - return (v <= 7) && (v >= -8) && (m == 0); -} - /// getPostIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if this node can be /// combined with a load / store to form a post-indexed load / store. @@ -724,18 +928,20 @@ bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isInc = false; bool isLegal = getIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, isInc, DAG); - // ShiftAmount = number of left-shifted bits in the Hexagon instruction. - int ShiftAmount = VT.getSizeInBits() / 16; - if (isLegal && Is_PostInc_S4_Offset(Offset.getNode(), ShiftAmount)) { - AM = isInc ? ISD::POST_INC : ISD::POST_DEC; - return true; + if (isLegal) { + auto &HII = *Subtarget.getInstrInfo(); + int32_t OffsetVal = cast<ConstantSDNode>(Offset.getNode())->getSExtValue(); + if (HII.isValidAutoIncImm(VT, OffsetVal)) { + AM = isInc ? 
ISD::POST_INC : ISD::POST_DEC; + return true; + } } return false; } -SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op, - SelectionDAG &DAG) const { +SDValue +HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); MachineFunction &MF = DAG.getMachineFunction(); auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>(); @@ -784,47 +990,6 @@ SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op, return Op; } - -// -// Taken from the XCore backend. -// -SDValue HexagonTargetLowering:: -LowerBR_JT(SDValue Op, SelectionDAG &DAG) const -{ - SDValue Chain = Op.getOperand(0); - SDValue Table = Op.getOperand(1); - SDValue Index = Op.getOperand(2); - SDLoc dl(Op); - JumpTableSDNode *JT = cast<JumpTableSDNode>(Table); - unsigned JTI = JT->getIndex(); - MachineFunction &MF = DAG.getMachineFunction(); - const MachineJumpTableInfo *MJTI = MF.getJumpTableInfo(); - SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32); - - // Mark all jump table targets as address taken. - const std::vector<MachineJumpTableEntry> &JTE = MJTI->getJumpTables(); - const std::vector<MachineBasicBlock*> &JTBBs = JTE[JTI].MBBs; - for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) { - MachineBasicBlock *MBB = JTBBs[i]; - MBB->setHasAddressTaken(); - // This line is needed to set the hasAddressTaken flag on the BasicBlock - // object. - BlockAddress::get(const_cast<BasicBlock *>(MBB->getBasicBlock())); - } - - SDValue JumpTableBase = DAG.getNode( - HexagonISD::JT, dl, getPointerTy(DAG.getDataLayout()), TargetJT); - SDValue ShiftIndex = DAG.getNode(ISD::SHL, dl, MVT::i32, Index, - DAG.getConstant(2, dl, MVT::i32)); - SDValue JTAddress = DAG.getNode(ISD::ADD, dl, MVT::i32, JumpTableBase, - ShiftIndex); - SDValue LoadTarget = DAG.getLoad(MVT::i32, dl, Chain, JTAddress, - MachinePointerInfo(), false, false, false, - 0); - return DAG.getNode(HexagonISD::BR_JT, dl, MVT::Other, Chain, LoadTarget); -} - - SDValue HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { @@ -850,7 +1015,10 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue AC = DAG.getConstant(A, dl, MVT::i32); SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); - return DAG.getNode(HexagonISD::ALLOCA, dl, VTs, Chain, Size, AC); + SDValue AA = DAG.getNode(HexagonISD::ALLOCA, dl, VTs, Chain, Size, AC); + if (Op.getNode()->getHasDebugValue()) + DAG.TransferDbgValues(Op, AA); + return AA; } SDValue @@ -882,7 +1050,8 @@ const { // equal to) 8 bytes. If not, no address will be passed into callee and // callee return the result direclty through R0/R1. 
- SmallVector<SDValue, 4> MemOps; + SmallVector<SDValue, 8> MemOps; + bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -908,6 +1077,42 @@ const { RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass); RegInfo.addLiveIn(VA.getLocReg(), VReg); InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + + // Single Vector + } else if ((RegVT == MVT::v8i64 || RegVT == MVT::v16i32 || + RegVT == MVT::v32i16 || RegVT == MVT::v64i8)) { + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VectorRegsRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + } else if (UseHVX && UseHVXDbl && + ((RegVT == MVT::v16i64 || RegVT == MVT::v32i32 || + RegVT == MVT::v64i16 || RegVT == MVT::v128i8))) { + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VectorRegs128BRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + + // Double Vector + } else if ((RegVT == MVT::v16i64 || RegVT == MVT::v32i32 || + RegVT == MVT::v64i16 || RegVT == MVT::v128i8)) { + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VecDblRegsRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + } else if (UseHVX && UseHVXDbl && + ((RegVT == MVT::v32i64 || RegVT == MVT::v64i32 || + RegVT == MVT::v128i16 || RegVT == MVT::v256i8))) { + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VecDblRegs128BRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); + } else if (RegVT == MVT::v512i1 || RegVT == MVT::v1024i1) { + assert(0 && "need to support VecPred regs"); + unsigned VReg = + RegInfo.createVirtualRegister(&Hexagon::VecPredRegsRegClass); + RegInfo.addLiveIn(VA.getLocReg(), VReg); + InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT)); } else { assert (0); } @@ -1056,8 +1261,8 @@ SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) - const { +SDValue +HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue PredOp = Op.getOperand(0); SDValue Op1 = Op.getOperand(1), Op2 = Op.getOperand(2); EVT OpVT = Op1.getValueType(); @@ -1163,16 +1368,33 @@ SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT ValTy = Op.getValueType(); - SDLoc dl(Op); - ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); - SDValue Res; - if (CP->isMachineConstantPoolEntry()) - Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), ValTy, - CP->getAlignment()); + ConstantPoolSDNode *CPN = cast<ConstantPoolSDNode>(Op); + unsigned Align = CPN->getAlignment(); + Reloc::Model RM = HTM.getRelocationModel(); + unsigned char TF = (RM == Reloc::PIC_) ? 
HexagonII::MO_PCREL : 0; + + SDValue T; + if (CPN->isMachineConstantPoolEntry()) + T = DAG.getTargetConstantPool(CPN->getMachineCPVal(), ValTy, Align, TF); else - Res = DAG.getTargetConstantPool(CP->getConstVal(), ValTy, - CP->getAlignment()); - return DAG.getNode(HexagonISD::CP, dl, ValTy, Res); + T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, TF); + if (RM == Reloc::PIC_) + return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), ValTy, T); + return DAG.getNode(HexagonISD::CP, SDLoc(Op), ValTy, T); +} + +SDValue +HexagonTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + int Idx = cast<JumpTableSDNode>(Op)->getIndex(); + Reloc::Model RM = HTM.getRelocationModel(); + if (RM == Reloc::PIC_) { + SDValue T = DAG.getTargetJumpTable(Idx, VT, HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), VT, T); + } + + SDValue T = DAG.getTargetJumpTable(Idx, VT); + return DAG.getNode(HexagonISD::JT, SDLoc(Op), VT, T); } SDValue @@ -1219,52 +1441,70 @@ HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { return FrameAddr; } -SDValue HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op, - SelectionDAG& DAG) const { +SDValue +HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const { SDLoc dl(Op); return DAG.getNode(HexagonISD::BARRIER, dl, MVT::Other, Op.getOperand(0)); } -SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, - SelectionDAG &DAG) const { - SDValue Result; - const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); - int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); +SDValue +HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + auto *GAN = cast<GlobalAddressSDNode>(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); - Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); + auto *GV = GAN->getGlobal(); + int64_t Offset = GAN->getOffset(); + + auto &HLOF = *HTM.getObjFileLowering(); + Reloc::Model RM = HTM.getRelocationModel(); - const HexagonTargetObjectFile *TLOF = - static_cast<const HexagonTargetObjectFile *>( - getTargetMachine().getObjFileLowering()); - if (TLOF->IsGlobalInSmallSection(GV, getTargetMachine())) { - return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, Result); + if (RM == Reloc::Static) { + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset); + if (HLOF.IsGlobalInSmallSection(GV, HTM)) + return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, GA); + return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, GA); } - return DAG.getNode(HexagonISD::CONST32, dl, PtrVT, Result); + bool UsePCRel = GV->hasInternalLinkage() || GV->hasHiddenVisibility() || + (GV->hasLocalLinkage() && !isa<Function>(GV)); + if (UsePCRel) { + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset, + HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, dl, PtrVT, GA); + } + + // Use GOT index. + SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT); + SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, HexagonII::MO_GOT); + SDValue Off = DAG.getConstant(Offset, dl, MVT::i32); + return DAG.getNode(HexagonISD::AT_GOT, dl, PtrVT, GOT, GA, Off); } // Specifies that for loads and stores VT can be promoted to PromotedLdStVT. 
-void HexagonTargetLowering::promoteLdStType(EVT VT, EVT PromotedLdStVT) { - if (VT != PromotedLdStVT) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), - PromotedLdStVT.getSimpleVT()); +SDValue +HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { + const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + SDLoc dl(Op); + EVT PtrVT = getPointerTy(DAG.getDataLayout()); - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), - PromotedLdStVT.getSimpleVT()); + Reloc::Model RM = HTM.getRelocationModel(); + if (RM == Reloc::Static) { + SDValue A = DAG.getTargetBlockAddress(BA, PtrVT); + return DAG.getNode(HexagonISD::CONST32_GP, dl, PtrVT, A); } + + SDValue A = DAG.getTargetBlockAddress(BA, PtrVT, 0, HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, dl, PtrVT, A); } SDValue -HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { - const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); - SDValue BA_SD = DAG.getTargetBlockAddress(BA, MVT::i32); - SDLoc dl(Op); - return DAG.getNode(HexagonISD::CONST32_GP, dl, - getPointerTy(DAG.getDataLayout()), BA_SD); +HexagonTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) + const { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue GOTSym = DAG.getTargetExternalSymbol(HEXAGON_GOT_SYM_NAME, PtrVT, + HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Op), PtrVT, GOTSym); } //===----------------------------------------------------------------------===// @@ -1272,18 +1512,19 @@ HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { //===----------------------------------------------------------------------===// HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, - const HexagonSubtarget &STI) + const HexagonSubtarget &ST) : TargetLowering(TM), HTM(static_cast<const HexagonTargetMachine&>(TM)), - Subtarget(STI) { + Subtarget(ST) { bool IsV4 = !Subtarget.hasV5TOps(); auto &HRI = *Subtarget.getRegisterInfo(); + bool UseHVX = Subtarget.useHVXOps(); + bool UseHVXSgl = Subtarget.useHVXSglOps(); + bool UseHVXDbl = Subtarget.useHVXDblOps(); setPrefLoopAlignment(4); setPrefFunctionAlignment(4); setMinFunctionAlignment(2); setInsertFencesForAtomic(false); - setExceptionPointerRegister(Hexagon::R0); - setExceptionSelectorRegister(Hexagon::R1); setStackPointerRegisterToSaveRestore(HRI.getStackRegister()); if (EnableHexSDNodeSched) @@ -1320,6 +1561,31 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::f64, &Hexagon::DoubleRegsRegClass); } + if (Subtarget.hasV60TOps()) { + if (Subtarget.useHVXSglOps()) { + addRegisterClass(MVT::v64i8, &Hexagon::VectorRegsRegClass); + addRegisterClass(MVT::v32i16, &Hexagon::VectorRegsRegClass); + addRegisterClass(MVT::v16i32, &Hexagon::VectorRegsRegClass); + addRegisterClass(MVT::v8i64, &Hexagon::VectorRegsRegClass); + addRegisterClass(MVT::v128i8, &Hexagon::VecDblRegsRegClass); + addRegisterClass(MVT::v64i16, &Hexagon::VecDblRegsRegClass); + addRegisterClass(MVT::v32i32, &Hexagon::VecDblRegsRegClass); + addRegisterClass(MVT::v16i64, &Hexagon::VecDblRegsRegClass); + addRegisterClass(MVT::v512i1, &Hexagon::VecPredRegsRegClass); + } else if (Subtarget.useHVXDblOps()) { + addRegisterClass(MVT::v128i8, &Hexagon::VectorRegs128BRegClass); + addRegisterClass(MVT::v64i16, 
&Hexagon::VectorRegs128BRegClass); + addRegisterClass(MVT::v32i32, &Hexagon::VectorRegs128BRegClass); + addRegisterClass(MVT::v16i64, &Hexagon::VectorRegs128BRegClass); + addRegisterClass(MVT::v256i8, &Hexagon::VecDblRegs128BRegClass); + addRegisterClass(MVT::v128i16, &Hexagon::VecDblRegs128BRegClass); + addRegisterClass(MVT::v64i32, &Hexagon::VecDblRegs128BRegClass); + addRegisterClass(MVT::v32i64, &Hexagon::VecDblRegs128BRegClass); + addRegisterClass(MVT::v1024i1, &Hexagon::VecPredRegs128BRegClass); + } + + } + // // Handling of scalar operations. // @@ -1336,10 +1602,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ConstantFP, MVT::f64, Legal); // Default: expand setOperationAction(ISD::ConstantPool, MVT::i32, Custom); + setOperationAction(ISD::JumpTable, MVT::i32, Custom); setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); setOperationAction(ISD::INLINEASM, MVT::Other, Custom); setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); + setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); // Custom legalize GlobalAddress nodes into CONST32. @@ -1361,11 +1629,10 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); if (EmitJumpTables) - setOperationAction(ISD::BR_JT, MVT::Other, Custom); + setMinimumJumpTableEntries(2); else - setOperationAction(ISD::BR_JT, MVT::Other, Expand); - // Increase jump tables cutover to 5, was 4. - setMinimumJumpTableEntries(MinimumJumpTables); + setMinimumJumpTableEntries(MinimumJumpTables); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); // Hexagon has instructions for add/sub with carry. The problem with // modeling these instructions is that they produce 2 results: Rdd and Px. @@ -1420,9 +1687,10 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::MULHS, MVT::i64, Expand); for (unsigned IntExpOp : - {ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, - ISD::ROTL, ISD::ROTR, ISD::BSWAP, ISD::SHL_PARTS, ISD::SRA_PARTS, - ISD::SRL_PARTS, ISD::SMUL_LOHI, ISD::UMUL_LOHI}) { + { ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, + ISD::SDIVREM, ISD::UDIVREM, ISD::ROTL, ISD::ROTR, + ISD::BSWAP, ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS, + ISD::SMUL_LOHI, ISD::UMUL_LOHI }) { setOperationAction(IntExpOp, MVT::i32, Expand); setOperationAction(IntExpOp, MVT::i64, Expand); } @@ -1475,7 +1743,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, // Set the action for vector operations to "expand", then override it with // either "custom" or "legal" for specific cases. 
- static unsigned VectExpOps[] = { + static const unsigned VectExpOps[] = { // Integer arithmetic: ISD::ADD, ISD::SUB, ISD::MUL, ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::SDIVREM, ISD::UDIVREM, ISD::ADDC, @@ -1539,7 +1807,21 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v2i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); - + if (UseHVX) { + if (UseHVXSgl) { + setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i8, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i64, Custom); + } else if (UseHVXDbl) { + setOperationAction(ISD::CONCAT_VECTORS, MVT::v256i8, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v128i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i32, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i64, Custom); + } else { + llvm_unreachable("Unrecognized HVX mode"); + } + } // Subtarget-specific operation actions. // if (Subtarget.hasV5TOps()) { @@ -1586,7 +1868,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, for (ISD::CondCode FPExpCCV4 : {ISD::SETOEQ, ISD::SETOGT, ISD::SETOLT, ISD::SETOGE, ISD::SETOLE, - ISD::SETUO, ISD::SETO}) { + ISD::SETUO, ISD::SETO}) { setCondCodeAction(FPExpCCV4, MVT::f32, Expand); setCondCodeAction(FPExpCCV4, MVT::f64, Expand); } @@ -1599,6 +1881,13 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setIndexedStoreAction(ISD::POST_INC, LSXTy, Legal); } + if (UseHVXDbl) { + for (MVT VT : {MVT::v128i8, MVT::v64i16, MVT::v32i32, MVT::v16i64}) { + setIndexedLoadAction(ISD::POST_INC, VT, Legal); + setIndexedStoreAction(ISD::POST_INC, VT, Legal); + } + } + computeRegisterProperties(&HRI); // @@ -1720,7 +2009,6 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::AT_GOT: return "HexagonISD::AT_GOT"; case HexagonISD::AT_PCREL: return "HexagonISD::AT_PCREL"; case HexagonISD::BARRIER: return "HexagonISD::BARRIER"; - case HexagonISD::BR_JT: return "HexagonISD::BR_JT"; case HexagonISD::CALLR: return "HexagonISD::CALLR"; case HexagonISD::CALLv3nr: return "HexagonISD::CALLv3nr"; case HexagonISD::CALLv3: return "HexagonISD::CALLv3"; @@ -1737,7 +2025,6 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::INSERTRP: return "HexagonISD::INSERTRP"; case HexagonISD::JT: return "HexagonISD::JT"; case HexagonISD::PACKHL: return "HexagonISD::PACKHL"; - case HexagonISD::PIC_ADD: return "HexagonISD::PIC_ADD"; case HexagonISD::POPCOUNT: return "HexagonISD::POPCOUNT"; case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG"; case HexagonISD::SHUFFEB: return "HexagonISD::SHUFFEB"; @@ -1754,6 +2041,7 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::VCMPWEQ: return "HexagonISD::VCMPWEQ"; case HexagonISD::VCMPWGT: return "HexagonISD::VCMPWGT"; case HexagonISD::VCMPWGTU: return "HexagonISD::VCMPWGTU"; + case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE"; case HexagonISD::VSHLH: return "HexagonISD::VSHLH"; case HexagonISD::VSHLW: return "HexagonISD::VSHLW"; case HexagonISD::VSPLATB: return "HexagonISD::VSPLTB"; @@ -1923,8 +2211,7 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned Size = VT.getSizeInBits(); - // A vector larger than 64 
bits cannot be represented in Hexagon. - // Expand will split the vector. + // Only handle vectors of 64 bits or shorter. if (Size > 64) return SDValue(); @@ -2058,58 +2345,61 @@ SDValue HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); + bool UseHVX = Subtarget.useHVXOps(); EVT VT = Op.getValueType(); unsigned NElts = Op.getNumOperands(); - SDValue Vec = Op.getOperand(0); - EVT VecVT = Vec.getValueType(); - SDValue Width = DAG.getConstant(VecVT.getSizeInBits(), dl, MVT::i64); - SDValue Shifted = DAG.getNode(ISD::SHL, dl, MVT::i64, Width, - DAG.getConstant(32, dl, MVT::i64)); - SDValue ConstVal = DAG.getConstant(0, dl, MVT::i64); - - ConstantSDNode *W = dyn_cast<ConstantSDNode>(Width); - ConstantSDNode *S = dyn_cast<ConstantSDNode>(Shifted); - - if ((VecVT.getSimpleVT() == MVT::v2i16) && (NElts == 2) && W && S) { - if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) { - // We are trying to concat two v2i16 to a single v4i16. - SDValue Vec0 = Op.getOperand(1); - SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec); - return DAG.getNode(ISD::BITCAST, dl, VT, Combined); + SDValue Vec0 = Op.getOperand(0); + EVT VecVT = Vec0.getValueType(); + unsigned Width = VecVT.getSizeInBits(); + + if (NElts == 2) { + MVT ST = VecVT.getSimpleVT(); + // We are trying to concat two v2i16 to a single v4i16, or two v4i8 + // into a single v8i8. + if (ST == MVT::v2i16 || ST == MVT::v4i8) + return DAG.getNode(HexagonISD::COMBINE, dl, VT, Op.getOperand(1), Vec0); + + if (UseHVX) { + assert((Width == 64*8 && Subtarget.useHVXSglOps()) || + (Width == 128*8 && Subtarget.useHVXDblOps())); + SDValue Vec1 = Op.getOperand(1); + MVT OpTy = Subtarget.useHVXSglOps() ? MVT::v16i32 : MVT::v32i32; + MVT ReTy = Subtarget.useHVXSglOps() ? MVT::v32i32 : MVT::v64i32; + SDValue B0 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec0); + SDValue B1 = DAG.getNode(ISD::BITCAST, dl, OpTy, Vec1); + SDValue VC = DAG.getNode(HexagonISD::VCOMBINE, dl, ReTy, B1, B0); + return DAG.getNode(ISD::BITCAST, dl, VT, VC); } } - if ((VecVT.getSimpleVT() == MVT::v4i8) && (NElts == 2) && W && S) { - if ((W->getZExtValue() == 32) && ((S->getZExtValue() >> 32) == 32)) { - // We are trying to concat two v4i8 to a single v8i8. - SDValue Vec0 = Op.getOperand(1); - SDValue Combined = DAG.getNode(HexagonISD::COMBINE, dl, VT, Vec0, Vec); - return DAG.getNode(ISD::BITCAST, dl, VT, Combined); - } - } + if (VT.getSizeInBits() != 32 && VT.getSizeInBits() != 64) + return SDValue(); + + SDValue C0 = DAG.getConstant(0, dl, MVT::i64); + SDValue C32 = DAG.getConstant(32, dl, MVT::i64); + SDValue W = DAG.getConstant(Width, dl, MVT::i64); + // Create the "width" part of the argument to insert_rp/insertp_rp. 
+ SDValue S = DAG.getNode(ISD::SHL, dl, MVT::i64, W, C32); + SDValue V = C0; for (unsigned i = 0, e = NElts; i != e; ++i) { - unsigned OpIdx = NElts - i - 1; - SDValue Operand = Op.getOperand(OpIdx); + unsigned N = NElts-i-1; + SDValue OpN = Op.getOperand(N); - if (VT.getSizeInBits() == 64 && - Operand.getValueType().getSizeInBits() == 32) { + if (VT.getSizeInBits() == 64 && OpN.getValueType().getSizeInBits() == 32) { SDValue C = DAG.getConstant(0, dl, MVT::i32); - Operand = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, Operand); + OpN = DAG.getNode(HexagonISD::COMBINE, dl, VT, C, OpN); } - - SDValue Idx = DAG.getConstant(OpIdx, dl, MVT::i64); - SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, Width); - SDValue Combined = DAG.getNode(ISD::OR, dl, MVT::i64, Shifted, Offset); - const SDValue Ops[] = {ConstVal, Operand, Combined}; - + SDValue Idx = DAG.getConstant(N, dl, MVT::i64); + SDValue Offset = DAG.getNode(ISD::MUL, dl, MVT::i64, Idx, W); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, S, Offset); if (VT.getSizeInBits() == 32) - ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, Ops); + V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i32, {V, OpN, Or}); else - ConstVal = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, Ops); + V = DAG.getNode(HexagonISD::INSERTRP, dl, MVT::i64, {V, OpN, Or}); } - return DAG.getNode(ISD::BITCAST, dl, VT, ConstVal); + return DAG.getNode(ISD::BITCAST, dl, VT, V); } SDValue @@ -2301,6 +2591,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SHL: case ISD::SRL: return LowerVECTOR_SHIFT(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); + case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); // Frame & Return address. Currently unimplemented. case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); @@ -2308,8 +2599,8 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG); case ISD::GlobalAddress: return LowerGLOBALADDRESS(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); + case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); - case ISD::BR_JT: return LowerBR_JT(Op, DAG); // Custom lower some vector loads. case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); @@ -2321,6 +2612,16 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } +/// Returns relocation base for the given PIC jumptable. 
+SDValue +HexagonTargetLowering::getPICJumpTableRelocBase(SDValue Table, + SelectionDAG &DAG) const { + int Idx = cast<JumpTableSDNode>(Table)->getIndex(); + EVT VT = Table.getValueType(); + SDValue T = DAG.getTargetJumpTable(Idx, VT, HexagonII::MO_PCREL); + return DAG.getNode(HexagonISD::AT_PCREL, SDLoc(Table), VT, T); +} + MachineBasicBlock * HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) @@ -2343,6 +2644,8 @@ HexagonTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, std::pair<unsigned, const TargetRegisterClass *> HexagonTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { + bool UseHVX = Subtarget.useHVXOps(), UseHVXDbl = Subtarget.useHVXDblOps(); + if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': // R0-R31 @@ -2358,6 +2661,42 @@ HexagonTargetLowering::getRegForInlineAsmConstraint( case MVT::f64: return std::make_pair(0U, &Hexagon::DoubleRegsRegClass); } + case 'q': // q0-q3 + switch (VT.SimpleTy) { + default: + llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type"); + case MVT::v1024i1: + case MVT::v512i1: + case MVT::v32i16: + case MVT::v16i32: + case MVT::v64i8: + case MVT::v8i64: + return std::make_pair(0U, &Hexagon::VecPredRegsRegClass); + } + case 'v': // V0-V31 + switch (VT.SimpleTy) { + default: + llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type"); + case MVT::v16i32: + case MVT::v32i16: + case MVT::v64i8: + case MVT::v8i64: + return std::make_pair(0U, &Hexagon::VectorRegsRegClass); + case MVT::v32i32: + case MVT::v64i16: + case MVT::v16i64: + case MVT::v128i8: + if (Subtarget.hasV60TOps() && UseHVX && UseHVXDbl) + return std::make_pair(0U, &Hexagon::VectorRegs128BRegClass); + else + return std::make_pair(0U, &Hexagon::VecDblRegsRegClass); + case MVT::v256i8: + case MVT::v128i16: + case MVT::v64i32: + case MVT::v32i64: + return std::make_pair(0U, &Hexagon::VecDblRegs128BRegClass); + } + default: llvm_unreachable("Unknown asm register class"); } @@ -2397,6 +2736,14 @@ bool HexagonTargetLowering::isLegalAddressingMode(const DataLayout &DL, return true; } +/// Return true if folding a constant offset with the given GlobalAddress is +/// legal. It is frequently not legal in PIC relocation models. +bool HexagonTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) + const { + return HTM.getRelocationModel() == Reloc::Static; +} + + /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can compare /// a register against the immediate without having to materialize the @@ -2428,8 +2775,8 @@ bool HexagonTargetLowering::IsEligibleForTailCallOptimization( // *************************************************************************** // If this is a tail call via a function pointer, then don't do it! 
- if (!(dyn_cast<GlobalAddressSDNode>(Callee)) - && !(dyn_cast<ExternalSymbolSDNode>(Callee))) { + if (!(isa<GlobalAddressSDNode>(Callee)) && + !(isa<ExternalSymbolSDNode>(Callee))) { return false; } @@ -2467,6 +2814,41 @@ bool llvm::isPositiveHalfWord(SDNode *N) { } } +std::pair<const TargetRegisterClass*, uint8_t> +HexagonTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, + MVT VT) const { + const TargetRegisterClass *RRC = nullptr; + + uint8_t Cost = 1; + switch (VT.SimpleTy) { + default: + return TargetLowering::findRepresentativeClass(TRI, VT); + case MVT::v64i8: + case MVT::v32i16: + case MVT::v16i32: + case MVT::v8i64: + RRC = &Hexagon::VectorRegsRegClass; + break; + case MVT::v128i8: + case MVT::v64i16: + case MVT::v32i32: + case MVT::v16i64: + if (Subtarget.hasV60TOps() && Subtarget.useHVXOps() && + Subtarget.useHVXDblOps()) + RRC = &Hexagon::VectorRegs128BRegClass; + else + RRC = &Hexagon::VecDblRegsRegClass; + break; + case MVT::v256i8: + case MVT::v128i16: + case MVT::v64i32: + case MVT::v32i64: + RRC = &Hexagon::VecDblRegs128BRegClass; + break; + } + return std::make_pair(RRC, Cost); +} + Value *HexagonTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { BasicBlock *BB = Builder.GetInsertBlock(); @@ -2498,13 +2880,15 @@ Value *HexagonTargetLowering::emitStoreConditional(IRBuilder<> &Builder, return Ext; } -bool HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +HexagonTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { // Do not expand loads and stores that don't exceed 64 bits. - return LI->getType()->getPrimitiveSizeInBits() > 64; + return LI->getType()->getPrimitiveSizeInBits() > 64 + ? AtomicExpansionKind::LLOnly + : AtomicExpansionKind::None; } bool HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // Do not expand loads and stores that don't exceed 64 bits. return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() > 64; } - diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 2642abf..bf378b9 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -35,16 +35,14 @@ bool isPositiveHalfWord(SDNode *N); ALLOCA, ARGEXTEND, - PIC_ADD, - AT_GOT, - AT_PCREL, + AT_GOT, // Index in GOT. + AT_PCREL, // Offset relative to PC. CALLv3, // A V3+ call instruction. CALLv3nr, // A V3+ call instruction that doesn't return. CALLR, RET_FLAG, // Return with a flag operand. - BR_JT, // Branch through jump table. BARRIER, // Memory barrier. JT, // Jump table. CP, // Constant pool. 
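The 'q' and 'v' inline-assembly constraints added in getRegForInlineAsmConstraint above map asm operands onto the new HVX predicate and vector register classes. As a rough sketch of how the 'v' constraint could be exercised from C-level code built for a 64-byte HVX target — the HVX_Vector typedef, the hvx_or function name, and the vor mnemonic are illustrative assumptions, not taken from this change:

// Illustrative sketch only; assumes an HVX-enabled Hexagon target with
// 64-byte vectors, so the type below lowers to MVT::v16i32.
typedef int HVX_Vector __attribute__((__vector_size__(64)));

HVX_Vector hvx_or(HVX_Vector a, HVX_Vector b) {
  HVX_Vector r;
  // "=v"/"v" request registers from Hexagon::VectorRegsRegClass, per the
  // 'v' constraint handling added above.
  asm("%0 = vor(%1,%2)" : "=v"(r) : "v"(a), "v"(b));
  return r;
}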
@@ -80,6 +78,7 @@ bool isPositiveHalfWord(SDNode *N); INSERTRP, EXTRACTU, EXTRACTURP, + VCOMBINE, TC_RETURN, EH_RETURN, DCFETCH, @@ -127,7 +126,6 @@ bool isPositiveHalfWord(SDNode *N); SDValue LowerEXTRACT_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const; @@ -137,6 +135,7 @@ bool isPositiveHalfWord(SDNode *N); SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const override; SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const override; @@ -163,8 +162,23 @@ bool isPositiveHalfWord(SDNode *N); MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return Hexagon::R0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return Hexagon::R1; + } + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; EVT getSetCCResultType(const DataLayout &, LLVMContext &C, EVT VT) const override { if (!VT.isVector()) @@ -200,6 +214,10 @@ bool isPositiveHalfWord(SDNode *N); /// TODO: Handle pre/postinc as well. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; + /// Return true if folding a constant offset with the given GlobalAddress + /// is legal. It is frequently not legal in PIC relocation models. + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; /// isLegalICmpImmediate - Return true if the specified immediate is legal @@ -208,20 +226,26 @@ bool isPositiveHalfWord(SDNode *N); /// the immediate into a register. bool isLegalICmpImmediate(int64_t Imm) const override; + /// Returns relocation base for the given PIC jumptable. + SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) + const override; + // Handling of atomic RMW instructions. 
- bool hasLoadLinkedStoreConditional() const override { - return true; - } Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; + AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - AtomicRMWExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) - const override { - return AtomicRMWExpansionKind::LLSC; + AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override { + return AtomicExpansionKind::LLSC; } + + protected: + std::pair<const TargetRegisterClass*, uint8_t> + findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) + const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td new file mode 100644 index 0000000..5a1a69b --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrAlias.td @@ -0,0 +1,462 @@ +//==- HexagonInstrAlias.td - Hexagon Instruction Aliases ---*- tablegen -*--==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Hexagon Instruction Mappings +//===----------------------------------------------------------------------===// + + +def : InstAlias<"memb({GP}+#$addr) = $Nt.new", + (S2_storerbnewgp u16_0Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memh({GP}+#$addr) = $Nt.new", + (S2_storerhnewgp u16_1Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memw({GP}+#$addr) = $Nt.new", + (S2_storerinewgp u16_2Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memb({GP}+#$addr) = $Nt", + (S2_storerbgp u16_0Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memh({GP}+#$addr) = $Nt", + (S2_storerhgp u16_1Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memh({GP}+#$addr) = $Nt.h", + (S2_storerfgp u16_1Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memw({GP}+#$addr) = $Nt", + (S2_storerigp u16_2Imm:$addr, IntRegs:$Nt)>; +def : InstAlias<"memd({GP}+#$addr) = $Nt", + (S2_storerdgp u16_3Imm:$addr, DoubleRegs:$Nt)>; + +def : InstAlias<"$Nt = memb({GP}+#$addr)", + (L2_loadrbgp IntRegs:$Nt, u16_0Imm:$addr)>; +def : InstAlias<"$Nt = memub({GP}+#$addr)", + (L2_loadrubgp IntRegs:$Nt, u16_0Imm:$addr)>; +def : InstAlias<"$Nt = memh({GP}+#$addr)", + (L2_loadrhgp IntRegs:$Nt, u16_1Imm:$addr)>; +def : InstAlias<"$Nt = memuh({GP}+#$addr)", + (L2_loadruhgp IntRegs:$Nt, u16_1Imm:$addr)>; +def : InstAlias<"$Nt = memw({GP}+#$addr)", + (L2_loadrigp IntRegs:$Nt, u16_2Imm:$addr)>; +def : InstAlias<"$Nt = memd({GP}+#$addr)", + (L2_loadrdgp DoubleRegs:$Nt, u16_3Imm:$addr)>; + +// Alias of: memXX($Rs+#XX) = $Rt to memXX($Rs) = $Rt +def : InstAlias<"memb($Rs) = $Rt", + (S2_storerb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memh($Rs) = $Rt", + (S2_storerh_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memh($Rs) = $Rt.h", + (S2_storerf_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memw($Rs) = $Rt", + (S2_storeri_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memb($Rs) = $Rt.new", + (S2_storerbnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memh($Rs) = $Rt.new", + (S2_storerhnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memw($Rs) 
= $Rt.new", + (S2_storerinew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"memb($Rs) = #$S8", + (S4_storeirb_io IntRegs:$Rs, 0, s8Ext:$S8), 0>; + +def : InstAlias<"memh($Rs) = #$S8", + (S4_storeirh_io IntRegs:$Rs, 0, s8Ext:$S8), 0>; + +def : InstAlias<"memw($Rs) = #$S8", + (S4_storeiri_io IntRegs:$Rs, 0, s8Ext:$S8), 0>; + +def : InstAlias<"memd($Rs) = $Rtt", + (S2_storerd_io IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"memb($Rs) = setbit(#$U5)", + (L4_ior_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memh($Rs) = setbit(#$U5)", + (L4_ior_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memw($Rs) = setbit(#$U5)", + (L4_ior_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memb($Rs) = clrbit(#$U5)", + (L4_iand_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memh($Rs) = clrbit(#$U5)", + (L4_iand_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +def : InstAlias<"memw($Rs) = clrbit(#$U5)", + (L4_iand_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>; + +// Alias of: $Rd = memXX($Rs+#XX) to $Rd = memXX($Rs) +def : InstAlias<"$Rd = memb($Rs)", + (L2_loadrb_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memub($Rs)", + (L2_loadrub_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memh($Rs)", + (L2_loadrh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memuh($Rs)", + (L2_loadruh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memw($Rs)", + (L2_loadri_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = memd($Rs)", + (L2_loadrd_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = memubh($Rs)", + (L2_loadbzw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = memubh($Rs)", + (L2_loadbzw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rd = membh($Rs)", + (L2_loadbsw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = membh($Rs)", + (L2_loadbsw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = memb_fifo($Rs)", + (L2_loadalignb_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"$Rdd = memh_fifo($Rs)", + (L2_loadalignh_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>; + +// Alias of: if ($Pt) $Rd = memXX($Rs + #$u6_X) +// to: if ($Pt) $Rd = memXX($Rs) +def : InstAlias<"if ($Pt) $Rd = memb($Rs)", + (L2_ploadrbt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rd = memub($Rs)", + (L2_ploadrubt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rd = memh($Rs)", + (L2_ploadrht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rd = memuh($Rs)", + (L2_ploadruht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rd = memw($Rs)", + (L2_ploadrit_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt) $Rdd = memd($Rs)", + (L2_ploadrdt_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +// Alias of: if ($Pt) memXX($Rs + #$u6_X) = $Rt +// to: if ($Pt) memXX($Rs) = $Rt +def : InstAlias<"if ($Pt) memb($Rs) = $Rt", + (S2_pstorerbt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memh($Rs) = $Rt", + (S2_pstorerht_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memh($Rs) = $Rt.h", + (S2_pstorerft_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memw($Rs) = $Rt", + (S2_pstorerit_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memd($Rs) = $Rtt", 
+ (S2_pstorerdt_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"if ($Pt) memb($Rs) = $Rt.new", + (S2_pstorerbnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memh($Rs) = $Rt.new", + (S2_pstorerhnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memw($Rs) = $Rt.new", + (S2_pstorerinewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt.new) memb($Rs) = $Rt.new", + (S4_pstorerbnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt.new) memh($Rs) = $Rt.new", + (S4_pstorerhnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt.new) memw($Rs) = $Rt.new", + (S4_pstorerinewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + + +// Alias of: if (!$Pt) $Rd = memXX($Rs + #$u6_X) +// to: if (!$Pt) $Rd = memXX($Rs) +def : InstAlias<"if (!$Pt) $Rd = memb($Rs)", + (L2_ploadrbf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rd = memub($Rs)", + (L2_ploadrubf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rd = memh($Rs)", + (L2_ploadrhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rd = memuh($Rs)", + (L2_ploadruhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rd = memw($Rs)", + (L2_ploadrif_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt) $Rdd = memd($Rs)", + (L2_ploadrdf_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +// Alias of: if (!$Pt) memXX($Rs + #$u6_X) = $Rt +// to: if (!$Pt) memXX($Rs) = $Rt +def : InstAlias<"if (!$Pt) memb($Rs) = $Rt", + (S2_pstorerbf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memh($Rs) = $Rt", + (S2_pstorerhf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.h", + (S2_pstorerff_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memw($Rs) = $Rt", + (S2_pstorerif_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memd($Rs) = $Rtt", + (S2_pstorerdf_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"if (!$Pt) memb($Rs) = $Rt.new", + (S2_pstorerbnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.new", + (S2_pstorerhnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt) memw($Rs) = $Rt.new", + (S2_pstorerinewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt.new) memb($Rs) = $Rt.new", + (S4_pstorerbnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt.new) memh($Rs) = $Rt.new", + (S4_pstorerhnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pt.new) memw($Rs) = $Rt.new", + (S4_pstorerinewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pt) memb($Rs) = #$S6", + (S4_storeirbt_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt) memh($Rs) = #$S6", + (S4_storeirht_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt) memw($Rs) = #$S6", + (S4_storeirit_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt.new) memb($Rs) = #$S6", + (S4_storeirbtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt.new) memh($Rs) = #$S6", + (S4_storeirhtnew_io 
PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if ($Pt.new) memw($Rs) = #$S6", + (S4_storeiritnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt) memb($Rs) = #$S6", + (S4_storeirbf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt) memh($Rs) = #$S6", + (S4_storeirhf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt) memw($Rs) = #$S6", + (S4_storeirif_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt.new) memb($Rs) = #$S6", + (S4_storeirbfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt.new) memh($Rs) = #$S6", + (S4_storeirhfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +def : InstAlias<"if (!$Pt.new) memw($Rs) = #$S6", + (S4_storeirifnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6Ext:$S6), 0>; + +// Alias of: memXX($Rs + $u6_X) |= $Rt, also &=, +=, -= +// to: memXX($Rs) |= $Rt +def : InstAlias<"memb($Rs) &= $Rt", + (L4_and_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) |= $Rt", + (L4_or_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) += $Rt", + (L4_add_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) -= $Rt", + (L4_sub_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) += #$U5", + (L4_iadd_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memb($Rs) -= #$U5", + (L4_isub_memopb_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) &= $Rt", + (L4_and_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) |= $Rt", + (L4_or_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) += $Rt", + (L4_add_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) -= $Rt", + (L4_sub_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) += #$U5", + (L4_iadd_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memh($Rs) -= #$U5", + (L4_isub_memoph_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) &= $Rt", + (L4_and_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) |= $Rt", + (L4_or_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) += $Rt", + (L4_add_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) -= $Rt", + (L4_sub_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) += #$U5", + (L4_iadd_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +def : InstAlias<"memw($Rs) -= #$U5", + (L4_isub_memopw_io IntRegs:$Rs, 0, u5Imm:$U5), 0>, + Requires<[UseMEMOP]>; + +// +// Alias of: if ($Pv.new) memX($Rs) = $Rt +// to: if (p3.new) memX(r17 + #0) = $Rt +def : InstAlias<"if ($Pv.new) memb($Rs) = $Rt", + (S4_pstorerbtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt", + (S4_pstorerhtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt.h", + (S4_pstorerftnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pv.new) memw($Rs) = 
$Rt", + (S4_pstoreritnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if ($Pv.new) memd($Rs) = $Rtt", + (S4_pstorerdtnew_io + PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"if (!$Pv.new) memb($Rs) = $Rt", + (S4_pstorerbfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt", + (S4_pstorerhfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt.h", + (S4_pstorerffnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pv.new) memw($Rs) = $Rt", + (S4_pstorerifnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>; + +def : InstAlias<"if (!$Pv.new) memd($Rs) = $Rtt", + (S4_pstorerdfnew_io + PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>; + +// +// Alias of: if ($Pt.new) $Rd = memub($Rs) -- And if (!$Pt.new) ... +// to: if ($Pt.new) $Rd = memub($Rs + #$u6_0) +def : InstAlias<"if ($Pt.new) $Rd = memub($Rs)", + (L2_ploadrubtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rd = memb($Rs)", + (L2_ploadrbtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rd = memh($Rs)", + (L2_ploadrhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rd = memuh($Rs)", + (L2_ploadruhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rd = memw($Rs)", + (L2_ploadritnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if ($Pt.new) $Rdd = memd($Rs)", + (L2_ploadrdtnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memub($Rs)", + (L2_ploadrubfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memb($Rs)", + (L2_ploadrbfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memh($Rs)", + (L2_ploadrhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memuh($Rs)", + (L2_ploadruhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rd = memw($Rs)", + (L2_ploadrifnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"if (!$Pt.new) $Rdd = memd($Rs)", + (L2_ploadrdfnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>; + +def : InstAlias<"dcfetch($Rs)", + (Y2_dcfetchbo IntRegs:$Rs, 0), 0>; + +// Alias of some insn mappings, others must be handled by the parser +def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)", + (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>; +def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)", + (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>; + +// Rd=neg(Rs) is aliased to Rd=sub(#0,Rs) +def : InstAlias<"$Rd = neg($Rs)", + (A2_subri IntRegs:$Rd, 0, IntRegs:$Rs), 0>; + +def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>; +def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>; +def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>; +def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>; + +def : InstAlias<"$Pd = $Ps", + (C2_or PredRegs:$Pd, PredRegs:$Ps, PredRegs:$Ps), 0>; + +def : InstAlias<"$Rdd = vaddb($Rss, $Rtt)", + (A2_vaddub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 1>; + +def : InstAlias<"$Rdd = vsubb($Rss,$Rtt)", + (A2_vsubub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 0>; + +def : InstAlias<"$Rd = mpyui($Rs,$Rt)", + (M2_mpyi IntRegs:$Rd, IntRegs:$Rs, IntRegs:$Rt), 0>; + +// Assembler mapped 
insns: cmp.lt(a,b) -> cmp.gt(b,a) +def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)", + (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>; +def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)", + (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>; + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td new file mode 100644 index 0000000..280832f --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrEnc.td @@ -0,0 +1,1019 @@ +class Enc_COPROC_VX_3op_v<bits<15> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { opc{14-4}, src2}; + let Inst{13-0} = { opc{3}, src1, opc{2-0}, dst}; +} + +class V6_vtmpyb_enc : Enc_COPROC_VX_3op_v<0b000110010000000>; +class V6_vtmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000001>; +class V6_vdmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110010000010>; +class V6_vrmpyub_enc : Enc_COPROC_VX_3op_v<0b000110010000011>; +class V6_vrmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000100>; +class V6_vdsaduh_enc : Enc_COPROC_VX_3op_v<0b000110010000101>; +class V6_vdmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000110>; +class V6_vdmpybus_dv_enc : Enc_COPROC_VX_3op_v<0b000110010000111>; +class V6_vtmpyb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001000>; +class V6_vtmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001001>; +class V6_vtmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001010>; +class V6_vdmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001011>; +class V6_vrmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001100>; +class V6_vrmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001101>; +class V6_vdmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001110>; +class V6_vdmpybus_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001111>; +class V6_vdmpyhsusat_enc : Enc_COPROC_VX_3op_v<0b000110010010000>; +class V6_vdmpyhsuisat_enc : Enc_COPROC_VX_3op_v<0b000110010010001>; +class V6_vdmpyhsat_enc : Enc_COPROC_VX_3op_v<0b000110010010010>; +class V6_vdmpyhisat_enc : Enc_COPROC_VX_3op_v<0b000110010010011>; +class V6_vdmpyhb_dv_enc : Enc_COPROC_VX_3op_v<0b000110010010100>; +class V6_vmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010010101>; +class V6_vmpabus_enc : Enc_COPROC_VX_3op_v<0b000110010010110>; +class V6_vmpahb_enc : Enc_COPROC_VX_3op_v<0b000110010010111>; +class V6_vdmpyhsusat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011000>; +class V6_vdmpyhsuisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011001>; +class V6_vdmpyhisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011010>; +class V6_vdmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011011>; +class V6_vdmpyhb_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011100>; +class V6_vmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011101>; +class V6_vmpabus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011110>; +class V6_vmpahb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011111>; +class V6_vmpyh_enc : Enc_COPROC_VX_3op_v<0b000110010100000>; +class V6_vmpyhss_enc : Enc_COPROC_VX_3op_v<0b000110010100001>; +class V6_vmpyhsrs_enc : Enc_COPROC_VX_3op_v<0b000110010100010>; +class V6_vmpyuh_enc : Enc_COPROC_VX_3op_v<0b000110010100011>; +class V6_vmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101000>; +class V6_vmpyuh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101001>; +class V6_vmpyiwb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101010>; +class V6_vmpyiwh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101011>; +class V6_vmpyihb_enc : Enc_COPROC_VX_3op_v<0b000110010110000>; +class V6_vror_enc : Enc_COPROC_VX_3op_v<0b000110010110001>; +class V6_vasrw_enc : 
Enc_COPROC_VX_3op_v<0b000110010110101>; +class V6_vasrh_enc : Enc_COPROC_VX_3op_v<0b000110010110110>; +class V6_vaslw_enc : Enc_COPROC_VX_3op_v<0b000110010110111>; +class V6_vdsaduh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111000>; +class V6_vmpyihb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111001>; +class V6_vaslw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111010>; +class V6_vasrw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111101>; +class V6_vaslh_enc : Enc_COPROC_VX_3op_v<0b000110011000000>; +class V6_vlsrw_enc : Enc_COPROC_VX_3op_v<0b000110011000001>; +class V6_vlsrh_enc : Enc_COPROC_VX_3op_v<0b000110011000010>; +class V6_vmpyiwh_enc : Enc_COPROC_VX_3op_v<0b000110011000111>; +class V6_vmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110011001000>; +class V6_vmpyiwb_enc : Enc_COPROC_VX_3op_v<0b000110011010000>; +class V6_vtmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110011010100>; +class V6_vmpyub_enc : Enc_COPROC_VX_3op_v<0b000110011100000>; +class V6_vrmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000000>; +class V6_vrmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000001>; +class V6_vrmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000010>; +class V6_vdmpyhvsat_enc : Enc_COPROC_VX_3op_v<0b000111000000011>; +class V6_vmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000100>; +class V6_vmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000101>; +class V6_vmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000110>; +class V6_vmpyhv_enc : Enc_COPROC_VX_3op_v<0b000111000000111>; +class V6_vrmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001000>; +class V6_vrmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001001>; +class V6_vrmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001010>; +class V6_vdmpyhvsat_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001011>; +class V6_vmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001100>; +class V6_vmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001101>; +class V6_vmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001110>; +class V6_vmpyhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001111>; +class V6_vmpyuhv_enc : Enc_COPROC_VX_3op_v<0b000111000010000>; +class V6_vmpyhvsrs_enc : Enc_COPROC_VX_3op_v<0b000111000010001>; +class V6_vmpyhus_enc : Enc_COPROC_VX_3op_v<0b000111000010010>; +class V6_vmpabusv_enc : Enc_COPROC_VX_3op_v<0b000111000010011>; +class V6_vmpyih_enc : Enc_COPROC_VX_3op_v<0b000111000010100>; +class V6_vand_enc : Enc_COPROC_VX_3op_v<0b000111000010101>; +class V6_vor_enc : Enc_COPROC_VX_3op_v<0b000111000010110>; +class V6_vxor_enc : Enc_COPROC_VX_3op_v<0b000111000010111>; +class V6_vmpyuhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011000>; +class V6_vmpyhus_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011001>; +class V6_vmpyih_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011100>; +class V6_vmpyiewuh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011101>; +class V6_vmpyowh_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011110>; +class V6_vmpyowh_rnd_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011111>; +class V6_vaddw_enc : Enc_COPROC_VX_3op_v<0b000111000100000>; +class V6_vaddubsat_enc : Enc_COPROC_VX_3op_v<0b000111000100001>; +class V6_vadduhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100010>; +class V6_vaddhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100011>; +class V6_vaddwsat_enc : Enc_COPROC_VX_3op_v<0b000111000100100>; +class V6_vsubb_enc : Enc_COPROC_VX_3op_v<0b000111000100101>; +class V6_vsubh_enc : Enc_COPROC_VX_3op_v<0b000111000100110>; +class V6_vsubw_enc : Enc_COPROC_VX_3op_v<0b000111000100111>; +class V6_vmpyiewh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000101000>; +class V6_vsububsat_enc : 
Enc_COPROC_VX_3op_v<0b000111000110000>; +class V6_vsubuhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110001>; +class V6_vsubhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110010>; +class V6_vsubwsat_enc : Enc_COPROC_VX_3op_v<0b000111000110011>; +class V6_vaddb_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110100>; +class V6_vaddh_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110101>; +class V6_vaddw_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110110>; +class V6_vaddubsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110111>; +class V6_vadduhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000000>; +class V6_vaddhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000001>; +class V6_vaddwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000010>; +class V6_vsubb_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000011>; +class V6_vsubh_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000100>; +class V6_vsubw_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000101>; +class V6_vsububsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000110>; +class V6_vsubuhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000111>; +class V6_vsubhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010000>; +class V6_vsubwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010001>; +class V6_vaddubh_enc : Enc_COPROC_VX_3op_v<0b000111001010010>; +class V6_vadduhw_enc : Enc_COPROC_VX_3op_v<0b000111001010011>; +class V6_vaddhw_enc : Enc_COPROC_VX_3op_v<0b000111001010100>; +class V6_vsububh_enc : Enc_COPROC_VX_3op_v<0b000111001010101>; +class V6_vsubuhw_enc : Enc_COPROC_VX_3op_v<0b000111001010110>; +class V6_vsubhw_enc : Enc_COPROC_VX_3op_v<0b000111001010111>; +class V6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b000111001100000>; +class V6_vabsdiffh_enc : Enc_COPROC_VX_3op_v<0b000111001100001>; +class V6_vabsdiffuh_enc : Enc_COPROC_VX_3op_v<0b000111001100010>; +class V6_vabsdiffw_enc : Enc_COPROC_VX_3op_v<0b000111001100011>; +class V6_vavgub_enc : Enc_COPROC_VX_3op_v<0b000111001100100>; +class V6_vavguh_enc : Enc_COPROC_VX_3op_v<0b000111001100101>; +class V6_vavgh_enc : Enc_COPROC_VX_3op_v<0b000111001100110>; +class V6_vavgw_enc : Enc_COPROC_VX_3op_v<0b000111001100111>; +class V6_vnavgub_enc : Enc_COPROC_VX_3op_v<0b000111001110000>; +class V6_vnavgh_enc : Enc_COPROC_VX_3op_v<0b000111001110001>; +class V6_vnavgw_enc : Enc_COPROC_VX_3op_v<0b000111001110010>; +class V6_vavgubrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110011>; +class V6_vavguhrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110100>; +class V6_vavghrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110101>; +class V6_vavgwrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110110>; +class V6_vmpabuuv_enc : Enc_COPROC_VX_3op_v<0b000111001110111>; +class V6_vminub_enc : Enc_COPROC_VX_3op_v<0b000111110000001>; +class V6_vminuh_enc : Enc_COPROC_VX_3op_v<0b000111110000010>; +class V6_vminh_enc : Enc_COPROC_VX_3op_v<0b000111110000011>; +class V6_vminw_enc : Enc_COPROC_VX_3op_v<0b000111110000100>; +class V6_vmaxub_enc : Enc_COPROC_VX_3op_v<0b000111110000101>; +class V6_vmaxuh_enc : Enc_COPROC_VX_3op_v<0b000111110000110>; +class V6_vmaxh_enc : Enc_COPROC_VX_3op_v<0b000111110000111>; +class V6_vmaxw_enc : Enc_COPROC_VX_3op_v<0b000111110010000>; +class V6_vdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010001>; +class V6_vrdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010011>; +class V6_vdealb4w_enc : Enc_COPROC_VX_3op_v<0b000111110010111>; +class V6_vmpyowh_rnd_enc : Enc_COPROC_VX_3op_v<0b000111110100000>; +class V6_vshuffeb_enc : Enc_COPROC_VX_3op_v<0b000111110100001>; +class V6_vshuffob_enc : Enc_COPROC_VX_3op_v<0b000111110100010>; +class V6_vshufeh_enc : Enc_COPROC_VX_3op_v<0b000111110100011>; +class 
V6_vshufoh_enc : Enc_COPROC_VX_3op_v<0b000111110100100>; +class V6_vshufoeh_enc : Enc_COPROC_VX_3op_v<0b000111110100101>; +class V6_vshufoeb_enc : Enc_COPROC_VX_3op_v<0b000111110100110>; +class V6_vcombine_enc : Enc_COPROC_VX_3op_v<0b000111110100111>; +class V6_vmpyieoh_enc : Enc_COPROC_VX_3op_v<0b000111110110000>; +class V6_vsathub_enc : Enc_COPROC_VX_3op_v<0b000111110110010>; +class V6_vsatwh_enc : Enc_COPROC_VX_3op_v<0b000111110110011>; +class V6_vroundwh_enc : Enc_COPROC_VX_3op_v<0b000111110110100>; +class V6_vroundwuh_enc : Enc_COPROC_VX_3op_v<0b000111110110101>; +class V6_vroundhb_enc : Enc_COPROC_VX_3op_v<0b000111110110110>; +class V6_vroundhub_enc : Enc_COPROC_VX_3op_v<0b000111110110111>; +class V6_vasrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010000>; +class V6_vlsrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010001>; +class V6_vlsrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010010>; +class V6_vasrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010011>; +class V6_vaslwv_enc : Enc_COPROC_VX_3op_v<0b000111111010100>; +class V6_vaslhv_enc : Enc_COPROC_VX_3op_v<0b000111111010101>; +class V6_vaddb_enc : Enc_COPROC_VX_3op_v<0b000111111010110>; +class V6_vaddh_enc : Enc_COPROC_VX_3op_v<0b000111111010111>; +class V6_vmpyiewuh_enc : Enc_COPROC_VX_3op_v<0b000111111100000>; +class V6_vmpyiowh_enc : Enc_COPROC_VX_3op_v<0b000111111100001>; +class V6_vpackeb_enc : Enc_COPROC_VX_3op_v<0b000111111100010>; +class V6_vpackeh_enc : Enc_COPROC_VX_3op_v<0b000111111100011>; +class V6_vpackhub_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100101>; +class V6_vpackhb_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100110>; +class V6_vpackwuh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100111>; +class V6_vpackwh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111110000>; +class V6_vpackob_enc : Enc_COPROC_VX_3op_v<0b000111111110001>; +class V6_vpackoh_enc : Enc_COPROC_VX_3op_v<0b000111111110010>; +class V6_vmpyewuh_enc : Enc_COPROC_VX_3op_v<0b000111111110101>; +class V6_vmpyowh_enc : Enc_COPROC_VX_3op_v<0b000111111110111>; +class V6_extractw_enc : Enc_COPROC_VX_3op_v<0b100100100000001>; +class M6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b111010001010000>; +class M6_vabsdiffb_enc : Enc_COPROC_VX_3op_v<0b111010001110000>; + +class Enc_COPROC_VX_cmp<bits<13> opc> : OpcodeHexagon { + bits<2> dst; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { 0b00011, opc{12-7}, src2{4-0} }; + let Inst{13-0} = { opc{6}, src1{4-0}, opc{5-0}, dst{1-0} }; +} + +class V6_vandvrt_acc_enc : Enc_COPROC_VX_cmp<0b0010111100000>; +class V6_vandvrt_enc : Enc_COPROC_VX_cmp<0b0011010010010>; +class V6_veqb_and_enc : Enc_COPROC_VX_cmp<0b1001001000000>; +class V6_veqh_and_enc : Enc_COPROC_VX_cmp<0b1001001000001>; +class V6_veqw_and_enc : Enc_COPROC_VX_cmp<0b1001001000010>; +class V6_vgtb_and_enc : Enc_COPROC_VX_cmp<0b1001001000100>; +class V6_vgth_and_enc : Enc_COPROC_VX_cmp<0b1001001000101>; +class V6_vgtw_and_enc : Enc_COPROC_VX_cmp<0b1001001000110>; +class V6_vgtub_and_enc : Enc_COPROC_VX_cmp<0b1001001001000>; +class V6_vgtuh_and_enc : Enc_COPROC_VX_cmp<0b1001001001001>; +class V6_vgtuw_and_enc : Enc_COPROC_VX_cmp<0b1001001001010>; +class V6_veqb_or_enc : Enc_COPROC_VX_cmp<0b1001001010000>; +class V6_veqh_or_enc : Enc_COPROC_VX_cmp<0b1001001010001>; +class V6_veqw_or_enc : Enc_COPROC_VX_cmp<0b1001001010010>; +class V6_vgtb_or_enc : Enc_COPROC_VX_cmp<0b1001001010100>; +class V6_vgth_or_enc : Enc_COPROC_VX_cmp<0b1001001010101>; +class V6_vgtw_or_enc : Enc_COPROC_VX_cmp<0b1001001010110>; +class V6_vgtub_or_enc : Enc_COPROC_VX_cmp<0b1001001011000>; +class V6_vgtuh_or_enc : 
Enc_COPROC_VX_cmp<0b1001001011001>; +class V6_vgtuw_or_enc : Enc_COPROC_VX_cmp<0b1001001011010>; +class V6_veqb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100000>; +class V6_veqh_xor_enc : Enc_COPROC_VX_cmp<0b1001001100001>; +class V6_veqw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100010>; +class V6_vgtb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100100>; +class V6_vgth_xor_enc : Enc_COPROC_VX_cmp<0b1001001100101>; +class V6_vgtw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100110>; +class V6_vgtub_xor_enc : Enc_COPROC_VX_cmp<0b1001001101000>; +class V6_vgtuh_xor_enc : Enc_COPROC_VX_cmp<0b1001001101001>; +class V6_vgtuw_xor_enc : Enc_COPROC_VX_cmp<0b1001001101010>; +class V6_veqb_enc : Enc_COPROC_VX_cmp<0b1111000000000>; +class V6_veqh_enc : Enc_COPROC_VX_cmp<0b1111000000001>; +class V6_veqw_enc : Enc_COPROC_VX_cmp<0b1111000000010>; +class V6_vgtb_enc : Enc_COPROC_VX_cmp<0b1111000000100>; +class V6_vgth_enc : Enc_COPROC_VX_cmp<0b1111000000101>; +class V6_vgtw_enc : Enc_COPROC_VX_cmp<0b1111000000110>; +class V6_vgtub_enc : Enc_COPROC_VX_cmp<0b1111000001000>; +class V6_vgtuh_enc : Enc_COPROC_VX_cmp<0b1111000001001>; +class V6_vgtuw_enc : Enc_COPROC_VX_cmp<0b1111000001010>; + +class Enc_COPROC_VX_p2op<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> dst; + bits<5> src2; + + let Inst{31-16} = { 0b00011110, src1{1-0}, 0b0000, opc{4-3} }; + let Inst{13-0} = { 1, src2{4-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vaddbq_enc : Enc_COPROC_VX_p2op<0b01000>; +class V6_vaddhq_enc : Enc_COPROC_VX_p2op<0b01001>; +class V6_vaddwq_enc : Enc_COPROC_VX_p2op<0b01010>; +class V6_vaddbnq_enc : Enc_COPROC_VX_p2op<0b01011>; +class V6_vaddhnq_enc : Enc_COPROC_VX_p2op<0b01100>; +class V6_vaddwnq_enc : Enc_COPROC_VX_p2op<0b01101>; +class V6_vsubbq_enc : Enc_COPROC_VX_p2op<0b01110>; +class V6_vsubhq_enc : Enc_COPROC_VX_p2op<0b01111>; +class V6_vsubwq_enc : Enc_COPROC_VX_p2op<0b10000>; +class V6_vsubbnq_enc : Enc_COPROC_VX_p2op<0b10001>; +class V6_vsubhnq_enc : Enc_COPROC_VX_p2op<0b10010>; +class V6_vsubwnq_enc : Enc_COPROC_VX_p2op<0b10011>; + +class Enc_COPROC_VX_2op<bits<6> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + + let Inst{31-16} = { 0b00011110000000, opc{5-4} }; + let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vabsh_enc : Enc_COPROC_VX_2op<0b000000>; +class V6_vabsh_sat_enc : Enc_COPROC_VX_2op<0b000001>; +class V6_vabsw_enc : Enc_COPROC_VX_2op<0b000010>; +class V6_vabsw_sat_enc : Enc_COPROC_VX_2op<0b000011>; +class V6_vnot_enc : Enc_COPROC_VX_2op<0b000100>; +class V6_vdealh_enc : Enc_COPROC_VX_2op<0b000110>; +class V6_vdealb_enc : Enc_COPROC_VX_2op<0b000111>; +class V6_vunpackob_enc : Enc_COPROC_VX_2op<0b001000>; +class V6_vunpackoh_enc : Enc_COPROC_VX_2op<0b001001>; +class V6_vunpackub_enc : Enc_COPROC_VX_2op<0b010000>; +class V6_vunpackuh_enc : Enc_COPROC_VX_2op<0b010001>; +class V6_vunpackb_enc : Enc_COPROC_VX_2op<0b010010>; +class V6_vunpackh_enc : Enc_COPROC_VX_2op<0b010011>; +class V6_vshuffh_enc : Enc_COPROC_VX_2op<0b010111>; +class V6_vshuffb_enc : Enc_COPROC_VX_2op<0b100000>; +class V6_vzb_enc : Enc_COPROC_VX_2op<0b100001>; +class V6_vzh_enc : Enc_COPROC_VX_2op<0b100010>; +class V6_vsb_enc : Enc_COPROC_VX_2op<0b100011>; +class V6_vsh_enc : Enc_COPROC_VX_2op<0b100100>; +class V6_vcl0w_enc : Enc_COPROC_VX_2op<0b100101>; +class V6_vpopcounth_enc : Enc_COPROC_VX_2op<0b100110>; +class V6_vcl0h_enc : Enc_COPROC_VX_2op<0b100111>; +class V6_vnormamtw_enc : Enc_COPROC_VX_2op<0b110100>; +class V6_vnormamth_enc : Enc_COPROC_VX_2op<0b110101>; +class V6_vassign_enc : 
Enc_COPROC_VX_2op<0b111111>; + +class Enc_COPROC_VMEM_vL32_b_ai<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<10> src2; + bits<4> src2_vector; + + let src2_vector = src2{9-6}; + let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0000>; +class V6_vL32b_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0001>; +class V6_vL32b_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0010>; +class V6_vL32Ub_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0111>; +class V6_vL32b_nt_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1000>; +class V6_vL32b_nt_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1001>; +class V6_vL32b_nt_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1010>; + +class Enc_COPROC_VMEM_vL32_b_ai_128B<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<11> src2; + bits<4> src2_vector; + + let src2_vector = src2{10-7}; + let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0000>; +class V6_vL32b_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0001>; +class V6_vL32b_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0010>; +class V6_vL32Ub_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0111>; +class V6_vL32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1000>; +class V6_vL32b_nt_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1001>; +class V6_vL32b_nt_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1010>; + +class Enc_COPROC_VMEM_vS32_b_ai_64B<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<10> src2; + bits<4> src2_vector; + bits<5> src3; + + let src2_vector = src2{9-6}; + let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} }; +} + +class Enc_COPROC_VMEM_vS32_b_ai_128B<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<11> src2; + bits<4> src2_vector; + bits<5> src3; + + let src2_vector = src2{10-7}; + let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} }; +} + +class V6_vS32b_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0000>; +class V6_vS32Ub_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0111>; +class V6_vS32b_nt_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b1000>; + +class V6_vS32b_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0000>; +class V6_vS32Ub_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0111>; +class V6_vS32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b1000>; + +class Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<bits<1> opc> : OpcodeHexagon { + bits<5> src1; + bits<10> src2; + bits<4> src2_vector; + bits<3> src3; + + let src2_vector = src2{9-6}; + let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} }; +} + +class V6_vS32b_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<0>; +class V6_vS32b_nt_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<1>; + +class Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<bits<1> opc> : OpcodeHexagon { + bits<5> src1; + bits<11> src2; + bits<4> src2_vector; + bits<3> src3; + + let src2_vector = src2{10-7}; + let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} }; + let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} }; +} + +class V6_vS32b_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<0>; 
+class V6_vS32b_nt_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<1>; + +class Enc_COPROC_VMEM_vS32_b_pred_ai<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<10> src3; + bits<4> src3_vector; + bits<5> src4; + + let src3_vector = src3{9-6}; + let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} }; + let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; +} + +class Enc_COPROC_VMEM_vS32_b_pred_ai_128B<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<11> src3; + bits<4> src3_vector; + bits<5> src4; + + let src3_vector = src3{10-7}; + let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} }; + let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; +} + +class V6_vS32b_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00000>; +class V6_vS32b_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00001>; +class V6_vS32b_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01000>; +class V6_vS32b_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01001>; +class V6_vS32Ub_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01110>; +class V6_vS32Ub_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01111>; +class V6_vS32b_nt_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10000>; +class V6_vS32b_nt_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10001>; +class V6_vS32b_nt_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11000>; +class V6_vS32b_nt_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11001>; + +class V6_vS32b_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00000>; +class V6_vS32b_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00001>; +class V6_vS32b_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01000>; +class V6_vS32b_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01001>; +class V6_vS32Ub_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01110>; +class V6_vS32Ub_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01111>; +class V6_vS32b_nt_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10000>; +class V6_vS32b_nt_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10001>; +class V6_vS32b_nt_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11000>; +class V6_vS32b_nt_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11001>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<10> src3; + bits<4> src3_vector; + bits<3> src4; + + let src3_vector = src3{9-6}; + let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0000>; +class V6_vS32b_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0101>; +class V6_vS32b_nt_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1010>; +class V6_vS32b_nt_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1111>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<11> src3; + bits<4> src3_vector; + bits<3> src4; + + let src3_vector = src3{10-7}; + let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0000>; +class V6_vS32b_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0101>; 
+class V6_vS32b_nt_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1010>; +class V6_vS32b_nt_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1111>; + +// TODO: Change script to generate dst, src1, src2 instead of +// dst, dst2, src1. +class Enc_COPROC_VMEM_vL32_b_pi<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<9> src2; + bits<3> src2_vector; + + let src2_vector = src2{8-6}; + let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} }; + let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0000>; +class V6_vL32b_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0001>; +class V6_vL32b_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0010>; +class V6_vL32Ub_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0111>; +class V6_vL32b_nt_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1000>; +class V6_vL32b_nt_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1001>; +class V6_vL32b_nt_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1010>; + +class Enc_COPROC_VMEM_vL32_b_pi_128B<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<10> src2; + bits<3> src2_vector; + + let src2_vector = src2{9-7}; + let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} }; + let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0000>; +class V6_vL32b_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0001>; +class V6_vL32b_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0010>; +class V6_vL32Ub_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0111>; +class V6_vL32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1000>; +class V6_vL32b_nt_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1001>; +class V6_vL32b_nt_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1010>; + + +// TODO: Change script to generate src1, src2 and src3 instead of +// dst, src1, src2. +class Enc_COPROC_VMEM_vS32_b_pi<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<9> src2; + bits<3> src2_vector; + bits<5> src3; + + let src2_vector = src2{8-6}; + let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} }; + let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} }; +} + +class V6_vS32b_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0000>; +class V6_vS32Ub_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0111>; +class V6_vS32b_nt_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b1000>; + +class Enc_COPROC_VMEM_vS32_b_pi_128B<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<10> src2; + bits<3> src2_vector; + bits<5> src3; + + let src2_vector = src2{9-7}; + let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} }; + let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} }; +} + +class V6_vS32b_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0000>; +class V6_vS32Ub_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0111>; +class V6_vS32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b1000>; + +// TODO: Change script to generate src1, src2 and src3 instead of +// dst, src1, src2. 
+class Enc_COPROC_VMEM_vS32b_n_ew_pi<bits<1> opc> : OpcodeHexagon { + bits<5> src1; + bits<9> src2; + bits<3> src2_vector; + bits<3> src3; + + let src2_vector = src2{8-6}; + let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} }; + let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} }; +} + +class V6_vS32b_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<0>; +class V6_vS32b_nt_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<1>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<bits<1> opc> : OpcodeHexagon { + bits<5> src1; + bits<10> src2; + bits<3> src2_vector; + bits<3> src3; + + let src2_vector = src2{9-7}; + let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} }; + let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} }; +} + +class V6_vS32b_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<0>; +class V6_vS32b_nt_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<1>; + +// TODO: Change script to generate src1, src2,src3 and src4 instead of +// dst, src1, src2, src3. +class Enc_COPROC_VMEM_vS32_b_pred_pi<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<9> src3; + bits<3> src3_vector; + bits<5> src4; + + let src3_vector = src3{8-6}; + let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} }; + let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; +} + +class V6_vS32b_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00000>; +class V6_vS32b_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00001>; +class V6_vS32b_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01000>; +class V6_vS32b_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01001>; +class V6_vS32Ub_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01110>; +class V6_vS32Ub_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01111>; +class V6_vS32b_nt_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10000>; +class V6_vS32b_nt_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10001>; +class V6_vS32b_nt_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11000>; +class V6_vS32b_nt_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11001>; + +// TODO: Change script to generate src1, src2,src3 and src4 instead of +// dst, src1, src2, src3. 
+class Enc_COPROC_VMEM_vS32_b_pred_pi_128B<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<10> src3; + bits<3> src3_vector; + bits<5> src4; + + let src3_vector = src3{9-7}; + let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} }; + let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} }; +} + +class V6_vS32b_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00000>; +class V6_vS32b_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00001>; +class V6_vS32b_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01000>; +class V6_vS32b_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01001>; +class V6_vS32Ub_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01110>; +class V6_vS32Ub_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01111>; +class V6_vS32b_nt_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10000>; +class V6_vS32b_nt_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10001>; +class V6_vS32b_nt_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11000>; +class V6_vS32b_nt_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11001>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<9> src3; + bits<3> src3_vector; + bits<3> src4; + + let src3_vector = src3{8-6}; + let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0000>; +class V6_vS32b_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0101>; +class V6_vS32b_nt_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1010>; +class V6_vS32b_nt_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1111>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<10> src3; + bits<3> src3_vector; + bits<3> src4; + + let src3_vector = src3{9-7}; + let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0000>; +class V6_vS32b_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0101>; +class V6_vS32b_nt_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1010>; +class V6_vS32b_nt_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1111>; + +class Enc_LD_load_m<bits<13> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<1> src2; + + let Inst{31-16} = { opc{12}, 0, opc{11-10}, 1, opc{9-4}, src1{4-0} }; + let Inst{13-0} = { src2{0}, 0b000, opc{3}, 0, opc{2-0}, dst{4-0} }; +} + +class V6_vL32b_ppu_enc : Enc_LD_load_m<0b0100110000000>; +class V6_vL32b_cur_ppu_enc : Enc_LD_load_m<0b0100110000001>; +class V6_vL32b_tmp_ppu_enc : Enc_LD_load_m<0b0100110000010>; +class V6_vL32Ub_ppu_enc : Enc_LD_load_m<0b0100110000111>; +class V6_vL32b_nt_ppu_enc : Enc_LD_load_m<0b0100110100000>; +class V6_vL32b_nt_cur_ppu_enc : Enc_LD_load_m<0b0100110100001>; +class V6_vL32b_nt_tmp_ppu_enc : Enc_LD_load_m<0b0100110100010>; + +class Enc_COPROC_VMEM_vS32_b_ppu<bits<4> opc> : OpcodeHexagon { + bits<5> src1; + bits<1> src2; + bits<5> src3; + + let Inst{31-16} = { 0b001010110, opc{3}, 1, src1{4-0} }; + let Inst{13-0} = { src2{0}, 0b00000, opc{2-0}, src3{4-0} }; +} + +class V6_vS32b_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0000>; +class V6_vS32Ub_ppu_enc : 
Enc_COPROC_VMEM_vS32_b_ppu<0b0111>; +class V6_vS32b_nt_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b1000>; + +class Enc_COPROC_VMEM_vS32b_new_ppu<bits<1> opc> : OpcodeHexagon { + bits<5> src1; + bits<1> src2; + bits<3> src3; + + let Inst{31-16} = { 0b001010110, opc{0}, 1, src1{4-0} }; + let Inst{13-0} = { src2{0}, 0b0000000100, src3{2-0} }; +} + +class V6_vS32b_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<0>; +class V6_vS32b_nt_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<1>; + +class Enc_COPROC_VMEM_vS32_b_pred_ppu<bits<5> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<1> src3; + bits<5> src4; + + let Inst{31-16} = { 0b001010111, opc{4-3}, src2{4-0} }; + let Inst{13-0} = { src3{0}, src1{1-0}, 0b000, opc{2-0}, src4{4-0} }; +} + +class V6_vS32b_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00000>; +class V6_vS32b_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00001>; +class V6_vS32b_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01000>; +class V6_vS32b_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01001>; +class V6_vS32Ub_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01110>; +class V6_vS32Ub_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01111>; +class V6_vS32b_nt_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10000>; +class V6_vS32b_nt_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10001>; +class V6_vS32b_nt_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11000>; +class V6_vS32b_nt_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11001>; + +class Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<bits<4> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> src2; + bits<1> src3; + bits<3> src4; + + let Inst{31-16} = { 0b001010111, opc{3}, 1, src2{4-0} }; + let Inst{13-0} = { src3{0}, src1{1-0}, 0b00001, opc{2-0}, src4{2-0} }; +} + +class V6_vS32b_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0000>; +class V6_vS32b_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0101>; +class V6_vS32b_nt_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1010>; +class V6_vS32b_nt_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1111>; + + +class Enc_COPROC_VX_4op_i<bits<5> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<1> src3; + + let Inst{31-16} = { 0b00011001, opc{4-2}, src2{4-0} }; + let Inst{13-0} = { opc{1}, src1{4-0}, 1, opc{0}, src3{0}, dst{4-0} }; +} + +class V6_vrmpybusi_enc : Enc_COPROC_VX_4op_i<0b01000>; +class V6_vrsadubi_enc : Enc_COPROC_VX_4op_i<0b01001>; +class V6_vrmpybusi_acc_enc : Enc_COPROC_VX_4op_i<0b01010>; +class V6_vrsadubi_acc_enc : Enc_COPROC_VX_4op_i<0b01011>; +class V6_vrmpyubi_acc_enc : Enc_COPROC_VX_4op_i<0b01111>; +class V6_vrmpyubi_enc : Enc_COPROC_VX_4op_i<0b10101>; + +class Enc_COPROC_VX_vandqrt<bits<5> opc> : OpcodeHexagon { + bits<5> dst; + bits<2> src1; + bits<5> src2; + + let Inst{31-16} = { 0b00011001, opc{4-3}, 1, src2{4-0} }; + let Inst{13-0} = { opc{2}, 0b000, src1{1-0}, opc{1-0}, 1, dst{4-0} }; +} + +class V6_vandqrt_acc_enc : Enc_COPROC_VX_vandqrt<0b01101>; +class V6_vandqrt_enc : Enc_COPROC_VX_vandqrt<0b10010>; + +class Enc_COPROC_VX_cards<bits<2> opc> : OpcodeHexagon { + bits<5> src1; + bits<5> src2; + bits<5> src3; + + let Inst{31-16} = { 0b00011001111, src3{4-0} }; + let Inst{13-0} = { 1, src1{4-0}, 0, opc{1-0}, src2{4-0} }; +} + +class V6_vshuff_enc : Enc_COPROC_VX_cards<0b01>; +class V6_vdeal_enc : Enc_COPROC_VX_cards<0b10>; + + +class Enc_COPROC_VX_v_cmov<bits<1> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> dst; + bits<5> src2; + + let Inst{31-16} = { 0b0001101000, opc{0}, 
0b00000 }; + let Inst{13-0} = { 0, src2{4-0}, 0, src1{1-0}, dst{4-0} }; +} + +class V6_vcmov_enc : Enc_COPROC_VX_v_cmov<0>; +class V6_vncmov_enc : Enc_COPROC_VX_v_cmov<1>; + +class Enc_X_p3op<bits<8> opc> : OpcodeHexagon { + bits<2> src1; + bits<5> dst; + bits<5> src2; + bits<5> src3; + + let Inst{31-16} = { opc{7-5}, 0b1101, opc{4}, 0, opc{3-2}, src3{4-0} }; + let Inst{13-0} = { opc{1}, src2{4-0}, opc{0}, src1{1-0}, dst{4-0} }; +} + +class V6_vnccombine_enc : Enc_X_p3op<0b00001000>; +class V6_vccombine_enc : Enc_X_p3op<0b00001100>; + +class Enc_COPROC_VX_4op_r<bits<4> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<3> src3; + + let Inst{31-16} = { 0b00011011, src2{4-0}, src3{2-0} }; + let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} }; +} + +class V6_valignb_enc : Enc_COPROC_VX_4op_r<0b0000>; +class V6_vlalignb_enc : Enc_COPROC_VX_4op_r<0b0001>; +class V6_vasrwh_enc : Enc_COPROC_VX_4op_r<0b0010>; +class V6_vasrwhsat_enc : Enc_COPROC_VX_4op_r<0b0011>; +class V6_vasrwhrndsat_enc : Enc_COPROC_VX_4op_r<0b0100>; +class V6_vasrwuhsat_enc : Enc_COPROC_VX_4op_r<0b0101>; +class V6_vasrhubsat_enc : Enc_COPROC_VX_4op_r<0b0110>; +class V6_vasrhubrndsat_enc : Enc_COPROC_VX_4op_r<0b0111>; +class V6_vasrhbrndsat_enc : Enc_COPROC_VX_4op_r<0b1000>; +class V6_vlutvvb_enc : Enc_COPROC_VX_4op_r<0b1001>; +class V6_vshuffvdd_enc : Enc_COPROC_VX_4op_r<0b1011>; +class V6_vdealvdd_enc : Enc_COPROC_VX_4op_r<0b1100>; +class V6_vlutvvb_oracc_enc : Enc_COPROC_VX_4op_r<0b1101>; +class V6_vlutvwh_enc : Enc_COPROC_VX_4op_r<0b1110>; +class V6_vlutvwh_oracc_enc : Enc_COPROC_VX_4op_r<0b1111>; + +class Enc_S_3op_valign_i<bits<9> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<3> src3; + + let Inst{31-16} = { opc{8-7}, 0, opc{6-3}, 0b00, opc{2-1}, src2{4-0} }; + let Inst{13-0} = { opc{0}, src1{4-0}, src3{2-0}, dst{4-0} }; +} + +class V6_vlutb_enc : Enc_S_3op_valign_i<0b001100000>; +class V6_vlutb_dv_enc : Enc_S_3op_valign_i<0b001100010>; +class V6_vlutb_acc_enc : Enc_S_3op_valign_i<0b001100100>; +class V6_vlutb_dv_acc_enc : Enc_S_3op_valign_i<0b001100110>; +class V6_valignbi_enc : Enc_S_3op_valign_i<0b001111011>; +class V6_vlalignbi_enc : Enc_S_3op_valign_i<0b001111111>; +class S2_valignib_enc : Enc_S_3op_valign_i<0b110000000>; +class S2_addasl_rrri_enc : Enc_S_3op_valign_i<0b110010000>; + +class Enc_COPROC_VX_3op_q<bits<3> opc> : OpcodeHexagon { + bits<2> dst; + bits<2> src1; + bits<2> src2; + + let Inst{31-16} = { 0b00011110, src2{1-0}, 0b000011 }; + let Inst{13-0} = { 0b0000, src1{1-0}, 0b000, opc{2-0}, dst{1-0} }; +} + +class V6_pred_and_enc : Enc_COPROC_VX_3op_q<0b000>; +class V6_pred_or_enc : Enc_COPROC_VX_3op_q<0b001>; +class V6_pred_xor_enc : Enc_COPROC_VX_3op_q<0b011>; +class V6_pred_or_n_enc : Enc_COPROC_VX_3op_q<0b100>; +class V6_pred_and_n_enc : Enc_COPROC_VX_3op_q<0b101>; + +class V6_pred_not_enc : OpcodeHexagon { + bits<2> dst; + bits<2> src1; + + let Inst{31-16} = { 0b0001111000000011 }; + let Inst{13-0} = { 0b0000, src1{1-0}, 0b000010, dst{1-0} }; +} + +class Enc_COPROC_VX_4op_q<bits<1> opc> : OpcodeHexagon { + bits<5> dst; + bits<2> src1; + bits<5> src2; + bits<5> src3; + + let Inst{31-16} = { 0b000111101, opc{0}, 1, src3{4-0} }; + let Inst{13-0} = { 1, src2{4-0}, 0, src1{1-0}, dst{4-0} }; +} + +class V6_vswap_enc : Enc_COPROC_VX_4op_q<0>; +class V6_vmux_enc : Enc_COPROC_VX_4op_q<1>; + +class Enc_X_2op<bits<16> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + + let Inst{31-16} = { opc{15-5}, src1{4-0} }; + let Inst{13-0} = { opc{4-3}, 
0b0000, opc{2-0}, dst{4-0} }; +} + +class V6_lvsplatw_enc : Enc_X_2op<0b0001100110100001>; +class V6_vinsertwr_enc : Enc_X_2op<0b0001100110110001>; +class S6_vsplatrbp_enc : Enc_X_2op<0b1000010001000100>; + + +class Enc_CR_2op_r<bits<12> opc> : OpcodeHexagon { + bits<2> dst; + bits<5> src1; + + let Inst{31-16} = { opc{11}, 0, opc{10-7}, 0, opc{6-3}, src1{4-0} }; + let Inst{13-0} = { opc{2}, 0b000000, opc{1}, 0b000, opc{0}, dst{1-0} }; +} + +class V6_pred_scalar2_enc : Enc_CR_2op_r<0b001101101011>; +class Y5_l2locka_enc : Enc_CR_2op_r<0b110000111100>; + +class Enc_S_3op_i6<bits<9> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<6> src2; + + let Inst{31-16} = { 0b1000, opc{8-6}, 0, opc{5-3}, src1{4-0} }; + let Inst{13-0} = { src2{5-0}, opc{2-0}, dst{4-0} }; +} + +class S6_rol_i_p_enc : Enc_S_3op_i6<0b000000011>; +class S6_rol_i_p_nac_enc : Enc_S_3op_i6<0b001000011>; +class S6_rol_i_p_acc_enc : Enc_S_3op_i6<0b001000111>; +class S6_rol_i_p_and_enc : Enc_S_3op_i6<0b001010011>; +class S6_rol_i_p_or_enc : Enc_S_3op_i6<0b001010111>; +class S6_rol_i_p_xacc_enc : Enc_S_3op_i6<0b001100011>; + +class Enc_X_3op_r<bits<15> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { opc{14-4}, src1{4-0} }; + let Inst{13-0} = { opc{3}, src2{4-0}, opc{2-0}, dst{4-0} }; +} + +class S6_rol_i_r_enc : Enc_X_3op_r<0b100011000000011>; +class S6_rol_i_r_nac_enc : Enc_X_3op_r<0b100011100000011>; +class S6_rol_i_r_acc_enc : Enc_X_3op_r<0b100011100000111>; +class S6_rol_i_r_and_enc : Enc_X_3op_r<0b100011100100011>; +class S6_rol_i_r_or_enc : Enc_X_3op_r<0b100011100100111>; +class S6_rol_i_r_xacc_enc : Enc_X_3op_r<0b100011101000011>; +class S6_vtrunehb_ppp_enc : Enc_X_3op_r<0b110000011000011>; +class S6_vtrunohb_ppp_enc : Enc_X_3op_r<0b110000011000101>; + +class Enc_no_operands<bits<25> opc> : OpcodeHexagon { + + let Inst{31-16} = { opc{24-10}, 0 }; + let Inst{13-0} = { opc{9-7}, 0b000, opc{6-0}, 0 }; +} + +class Y5_l2gunlock_enc : Enc_no_operands<0b1010100000100000010000000>; +class Y5_l2gclean_enc : Enc_no_operands<0b1010100000100000100000000>; +class Y5_l2gcleaninv_enc : Enc_no_operands<0b1010100000100000110000000>; +class V6_vhist_enc : Enc_no_operands<0b0001111000000001001000000>; + +class Enc_J_jumpr<bits<13> opc> : OpcodeHexagon { + bits<5> src1; + + let Inst{31-16} = { opc{12-6}, 0, opc{5-3}, src1{4-0} }; + let Inst{13-0} = { 0b00, opc{2}, 0b0000, opc{1-0}, 0b00000 }; +} + +class Y5_l2unlocka_enc : Enc_J_jumpr<0b1010011011000>; +class Y2_l2cleaninvidx_enc : Enc_J_jumpr<0b1010100011000>; + +class Enc_ST_l2gclean_pa<bits<2> opc> : OpcodeHexagon { + bits<5> src1; + + let Inst{31-16} = { 0b101001101, opc{1-0}, 0b00000 }; + let Inst{13-0} = { 0, src1{4-0}, 0b00000000 }; +} + +class Y6_l2gcleanpa_enc : Enc_ST_l2gclean_pa<0b01>; +class Y6_l2gcleaninvpa_enc : Enc_ST_l2gclean_pa<0b10>; + +class A5_ACS_enc : OpcodeHexagon { + bits<5> dst1; + bits<2> dst2; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { 0b11101010101, src1{4-0} }; + let Inst{13-0} = { 0, src2{4-0}, 0, dst2{1-0}, dst1{4-0} }; +} + +class Enc_X_4op_r<bits<8> opc> : OpcodeHexagon { + bits<5> dst; + bits<5> src1; + bits<5> src2; + bits<2> src3; + + let Inst{31-16} = { 0b11, opc{7}, 0, opc{6-5}, 1, opc{4-1}, src1{4-0} }; + let Inst{13-0} = { 0, src2{4-0}, opc{0}, src3{1-0}, dst{4-0} }; +} + +class S2_vsplicerb_enc : Enc_X_4op_r<0b00001000>; +class S2_cabacencbin_enc : Enc_X_4op_r<0b00001010>; +class F2_sffma_sc_enc : Enc_X_4op_r<0b11110111>; + +class V6_vhistq_enc : OpcodeHexagon { + bits<2> src1; + + 
let Inst{31-16} = { 0b00011110, src1{1-0}, 0b000010 }; + let Inst{13-0} = { 0b10000010000000 }; +} + +// TODO: Change script to generate dst1 instead of dst. +class A6_vminub_RdP_enc : OpcodeHexagon { + bits<5> dst1; + bits<2> dst2; + bits<5> src1; + bits<5> src2; + + let Inst{31-16} = { 0b11101010111, src2{4-0} }; + let Inst{13-0} = { 0, src1{4-0}, 0, dst2{1-0}, dst1{4-0} }; +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td index 44bab29..3c5ec17 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormats.td @@ -34,6 +34,8 @@ class SubTarget<bits<6> value> { def HasAnySubT : SubTarget<0x3f>; // 111111 def HasV5SubT : SubTarget<0x3e>; // 111110 +def HasV55SubT : SubTarget<0x3c>; // 111100 +def HasV60SubT : SubTarget<0x38>; // 111000 // Addressing modes for load/store instructions class AddrModeType<bits<3> value> { @@ -57,6 +59,8 @@ def ByteAccess : MemAccessSize<1>;// Byte access instruction (memb). def HalfWordAccess : MemAccessSize<2>;// Half word access instruction (memh). def WordAccess : MemAccessSize<3>;// Word access instruction (memw). def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd) +def Vector64Access : MemAccessSize<7>;// Vector access instruction (memv) +def Vector128Access : MemAccessSize<8>;// Vector access instruction (memv) //===----------------------------------------------------------------------===// @@ -167,14 +171,23 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, bits<1> isFP = 0; let TSFlags {48} = isFP; // Floating-point. + bits<1> hasNewValue2 = 0; + let TSFlags{50} = hasNewValue2; // Second New-value producer insn. + bits<3> opNewValue2 = 0; + let TSFlags{53-51} = opNewValue2; // Second New-value produced operand. + + bits<1> isAccumulator = 0; + let TSFlags{54} = isAccumulator; + // Fields used for relation models. + bit isNonTemporal = 0; + string isNT = ""; // set to "true" for non-temporal vector stores. string BaseOpcode = ""; string CextOpcode = ""; string PredSense = ""; string PNewValue = ""; string NValueST = ""; // Set to "true" for new-value stores. string InputType = ""; // Input is "imm" or "reg" type. - string isMEMri = "false"; // Set to "true" for load/store with MEMri operand. string isFloat = "false"; // Set to "true" for the floating-point load/store. string isBrTaken = !if(isTaken, "true", "false"); // Set to "true"/"false" for jump instructions @@ -182,6 +195,7 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern, ""); let PNewValue = !if(isPredicatedNew, "new", ""); let NValueST = !if(isNVStore, "true", "false"); + let isNT = !if(isNonTemporal, "true", "false"); // *** Must match MCTargetDesc/HexagonBaseInfo.h *** } @@ -217,6 +231,11 @@ class LD0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0> : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon; +let mayLoad = 1 in +class LD1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>; + // ST Instruction Class in V2/V3 can take SLOT0 only. // ST Instruction Class in V4 can take SLOT0 & SLOT1. // Definition of the instruction class CHANGED from V2/V3 to V4. 
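The TSFlags hunk above reserves three new fields in InstHexagon: hasNewValue2 in bit 50, a 3-bit opNewValue2 in bits 53-51, and isAccumulator in bit 54, alongside the existing note that the layout must match MCTargetDesc/HexagonBaseInfo.h. Below is a minimal C++ sketch of decoding a flags word laid out this way, assuming exactly those bit positions; the constant and helper names are invented for illustration and are not the accessors LLVM itself defines.

// Illustrative only: decode the three TSFlags fields added in the hunk above.
// Bit positions mirror the `let TSFlags{...}` assignments; names are made up.
#include <cassert>
#include <cstdint>

namespace sketch {
constexpr unsigned HasNewValue2Pos  = 50; // second new-value producer flag
constexpr unsigned OpNewValue2Pos   = 51; // produced operand index, TSFlags{53-51}
constexpr unsigned IsAccumulatorPos = 54; // accumulator instruction flag

inline bool hasNewValue2(uint64_t TSFlags) {
  return (TSFlags >> HasNewValue2Pos) & 1;
}
inline unsigned opNewValue2(uint64_t TSFlags) {
  return (TSFlags >> OpNewValue2Pos) & 0x7;
}
inline bool isAccumulator(uint64_t TSFlags) {
  return (TSFlags >> IsAccumulatorPos) & 1;
}
} // namespace sketch

int main() {
  // Flags for a hypothetical instruction whose second new-value result is
  // operand 2 and which also accumulates into its destination.
  uint64_t Flags = (1ull << sketch::HasNewValue2Pos) |
                   (2ull << sketch::OpNewValue2Pos) |
                   (1ull << sketch::IsAccumulatorPos);
  assert(sketch::hasNewValue2(Flags));
  assert(sketch::opNewValue2(Flags) == 2);
  assert(sketch::isAccumulator(Flags));
  return 0;
}

Packing works the same way in reverse: the flags word is just each field shifted into its declared range, which is why the .td bit assignments and the position constants in HexagonBaseInfo.h have to stay in sync, as the hunk's comment says.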
@@ -234,6 +253,12 @@ class ST0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], string cstr = "", InstrItinClass itin = ST_tc_ld_SLOT0> : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon; +// Same as ST0Inst but doesn't derive from OpcodeHexagon. +let mayStore = 1 in +class ST1Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>; + // ST Instruction Class in V2/V3 can take SLOT0 only. // ST Instruction Class in V4 can take SLOT0 & SLOT1. // Definition of the instruction class CHANGED from V2/V3 to V4. @@ -277,6 +302,11 @@ class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>, OpcodeHexagon; +// Same as above but doesn't derive from OpcodeHexagon +class MInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>; + // M Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. @@ -294,6 +324,10 @@ class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>, OpcodeHexagon; +class SInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>; + // S Instruction Class in V2/V3. // XTYPE Instruction Class in V4. // Definition of the instruction class NOT CHANGED. @@ -402,3 +436,13 @@ include "HexagonInstrFormatsV4.td" //===----------------------------------------------------------------------===// // V4 Instruction Format Definitions + //===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// V60 Instruction Format Definitions + +//===----------------------------------------------------------------------===// + +include "HexagonInstrFormatsV60.td" + +//===----------------------------------------------------------------------===// +// V60 Instruction Format Definitions + +//===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td index db83ef6..2d1dea5 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV4.td @@ -21,8 +21,6 @@ def TypeMEMOP : IType<9>; def TypeNV : IType<10>; def TypeDUPLEX : IType<11>; def TypeCOMPOUND : IType<12>; -def TypeAG_VX : IType<28>; -def TypeAG_VM : IType<29>; def TypePREFIX : IType<30>; // Duplex Instruction Class Declaration diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td new file mode 100644 index 0000000..f3d43de --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrFormatsV60.td @@ -0,0 +1,238 @@ +//==- HexagonInstrFormatsV60.td - Hexagon Instruction Formats -*- tablegen -==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon V60 instruction classes in TableGen format. +// +//===----------------------------------------------------------------------===// + +//----------------------------------------------------------------------------// +// Hexagon Instruction Flags + +// +// *** Must match BaseInfo.h *** +//----------------------------------------------------------------------------// + +def TypeCVI_VA : IType<13>; +def TypeCVI_VA_DV : IType<14>; +def TypeCVI_VX : IType<15>; +def TypeCVI_VX_DV : IType<16>; +def TypeCVI_VP : IType<17>; +def TypeCVI_VP_VS : IType<18>; +def TypeCVI_VS : IType<19>; +def TypeCVI_VINLANESAT : IType<20>; +def TypeCVI_VM_LD : IType<21>; +def TypeCVI_VM_TMP_LD : IType<22>; +def TypeCVI_VM_CUR_LD : IType<23>; +def TypeCVI_VM_VP_LDU : IType<24>; +def TypeCVI_VM_ST : IType<25>; +def TypeCVI_VM_NEW_ST : IType<26>; +def TypeCVI_VM_STU : IType<27>; +def TypeCVI_HIST : IType<28>; +//----------------------------------------------------------------------------// +// Instruction Classes Definitions + +//----------------------------------------------------------------------------// + +let validSubTargets = HasV60SubT in +{ +class CVI_VA_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VA> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VA_DV_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VA_DV> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA_DV>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VX_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VX_LONG> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VX_Resource_late<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VX_LATE> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>, + Requires<[HasV60T, UseHVX]>; + +class CVI_VX_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VX> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VX_DV_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VX_DV> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VX_DV_Slot2_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VX_DV_SLOT2> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VX_DV_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VX_DV_LONG> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VP_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VP_LONG> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP>, + OpcodeHexagon,
Requires<[HasV60T, UseHVX]>; + +class CVI_VP_VS_Resource_early<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VP_VS_EARLY> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VP_VS_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VP_VS_LONG> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VP_VS_Resource_long_early<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VP_VS_LONG_EARLY> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VP_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VS_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VS> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VINLANESAT_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VINLANESAT> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VINLANESAT>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VS_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VS> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VS>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_LD_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_LD_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_TMP_LD_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_TMP_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_TMP_LD_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_TMP_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_TMP_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_CUR_LD_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_CUR_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_CUR_LD>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_VP_LDU_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_VP_LDU> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_VP_LDU_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_VP_LDU> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_VP_LDU>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_ST_Resource<dag outs, 
dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_ST_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_ST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_NEW_ST_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_NEW_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_NEW_ST_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_NEW_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_NEW_ST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_STU_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_STU> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_VM_STU_Resource_long<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VM_STU> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VM_STU>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; + +class CVI_HIST_Resource<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_HIST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>, + OpcodeHexagon, Requires<[HasV60T, UseHVX]>; +} + +let validSubTargets = HasV60SubT in +{ +class CVI_VA_Resource1<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VA> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VA>, + Requires<[HasV60T, UseHVX]>; + +class CVI_VX_DV_Resource1<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_VX_DV> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_VX_DV>, + Requires<[HasV60T, UseHVX]>; + +class CVI_HIST_Resource1<dag outs, dag ins, string asmstr, + list<dag> pattern = [], string cstr = "", + InstrItinClass itin = CVI_HIST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCVI_HIST>, + Requires<[HasV60T, UseHVX]>; +} + + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp index 3cb0823..eb3590c 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -23,9 +23,11 @@ #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include <cctype> using namespace llvm; @@ -36,9 +38,41 @@ using namespace llvm; #include "HexagonGenInstrInfo.inc" #include "HexagonGenDFAPacketizer.inc" +using namespace llvm; + +cl::opt<bool> ScheduleInlineAsm("hexagon-sched-inline-asm", cl::Hidden, + cl::init(false), cl::desc("Do not consider inline-asm a scheduling/" + "packetization boundary.")); + +static cl::opt<bool> 
EnableBranchPrediction("hexagon-enable-branch-prediction", + cl::Hidden, cl::init(true), cl::desc("Enable branch prediction")); + +static cl::opt<bool> DisableNVSchedule("disable-hexagon-nv-schedule", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Disable schedule adjustment for new value stores.")); + +static cl::opt<bool> EnableTimingClassLatency( + "enable-timing-class-latency", cl::Hidden, cl::init(false), + cl::desc("Enable timing class latency")); + +static cl::opt<bool> EnableALUForwarding( + "enable-alu-forwarding", cl::Hidden, cl::init(true), + cl::desc("Enable vec alu forwarding")); + +static cl::opt<bool> EnableACCForwarding( + "enable-acc-forwarding", cl::Hidden, cl::init(true), + cl::desc("Enable vec acc forwarding")); + +static cl::opt<bool> BranchRelaxAsmLarge("branch-relax-asm-large", + cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("branch relax asm")); + /// /// Constants for Hexagon instructions. /// +const int Hexagon_MEMV_OFFSET_MAX_128B = 2047; // #s7 +const int Hexagon_MEMV_OFFSET_MIN_128B = -2048; // #s7 +const int Hexagon_MEMV_OFFSET_MAX = 1023; // #s6 +const int Hexagon_MEMV_OFFSET_MIN = -1024; // #s6 const int Hexagon_MEMW_OFFSET_MAX = 4095; const int Hexagon_MEMW_OFFSET_MIN = -4096; const int Hexagon_MEMD_OFFSET_MAX = 8191; @@ -57,71 +91,49 @@ const int Hexagon_MEMH_AUTOINC_MAX = 14; const int Hexagon_MEMH_AUTOINC_MIN = -16; const int Hexagon_MEMB_AUTOINC_MAX = 7; const int Hexagon_MEMB_AUTOINC_MIN = -8; +const int Hexagon_MEMV_AUTOINC_MAX = 192; +const int Hexagon_MEMV_AUTOINC_MIN = -256; +const int Hexagon_MEMV_AUTOINC_MAX_128B = 384; +const int Hexagon_MEMV_AUTOINC_MIN_128B = -512; // Pin the vtable to this file. void HexagonInstrInfo::anchor() {} HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST) : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP), - RI(), Subtarget(ST) {} + RI() {} -/// isLoadFromStackSlot - If the specified machine instruction is a direct -/// load from a stack slot, return the virtual or physical register number of -/// the destination along with the FrameIndex of the loaded stack slot. If -/// not, return 0. This predicate must return 0 if the instruction has -/// any side effects other than loading from the stack slot. -unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, - int &FrameIndex) const { +static bool isIntRegForSubInst(unsigned Reg) { + return (Reg >= Hexagon::R0 && Reg <= Hexagon::R7) || + (Reg >= Hexagon::R16 && Reg <= Hexagon::R23); +} - switch (MI->getOpcode()) { - default: break; - case Hexagon::L2_loadri_io: - case Hexagon::L2_loadrd_io: - case Hexagon::L2_loadrh_io: - case Hexagon::L2_loadrb_io: - case Hexagon::L2_loadrub_io: - if (MI->getOperand(2).isFI() && - MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { - FrameIndex = MI->getOperand(2).getIndex(); - return MI->getOperand(0).getReg(); - } - break; - } - return 0; + +static bool isDblRegForSubInst(unsigned Reg, const HexagonRegisterInfo &HRI) { + return isIntRegForSubInst(HRI.getSubReg(Reg, Hexagon::subreg_loreg)) && + isIntRegForSubInst(HRI.getSubReg(Reg, Hexagon::subreg_hireg)); } -/// isStoreToStackSlot - If the specified machine instruction is a direct -/// store to a stack slot, return the virtual or physical register number of -/// the source reg along with the FrameIndex of the loaded stack slot. If -/// not, return 0. This predicate must return 0 if the instruction has -/// any side effects other than storing to the stack slot. 
-unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI, - int &FrameIndex) const { - switch (MI->getOpcode()) { - default: break; - case Hexagon::S2_storeri_io: - case Hexagon::S2_storerd_io: - case Hexagon::S2_storerh_io: - case Hexagon::S2_storerb_io: - if (MI->getOperand(2).isFI() && - MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { - FrameIndex = MI->getOperand(0).getIndex(); - return MI->getOperand(2).getReg(); - } - break; +/// Calculate number of instructions excluding the debug instructions. +static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB, + MachineBasicBlock::const_instr_iterator MIE) { + unsigned Count = 0; + for (; MIB != MIE; ++MIB) { + if (!MIB->isDebugValue()) + ++Count; } - return 0; + return Count; } -// Find the hardware loop instruction used to set-up the specified loop. -// On Hexagon, we have two instructions used to set-up the hardware loop -// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions -// to indicate the end of a loop. -static MachineInstr * -findLoopInstr(MachineBasicBlock *BB, int EndLoopOp, - SmallPtrSet<MachineBasicBlock *, 8> &Visited) { + +/// Find the hardware loop instruction used to set-up the specified loop. +/// On Hexagon, we have two instructions used to set-up the hardware loop +/// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions +/// to indicate the end of a loop. +static MachineInstr *findLoopInstr(MachineBasicBlock *BB, int EndLoopOp, + SmallPtrSet<MachineBasicBlock *, 8> &Visited) { int LOOPi; int LOOPr; if (EndLoopOp == Hexagon::ENDLOOP0) { @@ -157,100 +169,108 @@ findLoopInstr(MachineBasicBlock *BB, int EndLoopOp, return 0; } -unsigned HexagonInstrInfo::InsertBranch( - MachineBasicBlock &MBB,MachineBasicBlock *TBB, MachineBasicBlock *FBB, - ArrayRef<MachineOperand> Cond, DebugLoc DL) const { - Opcode_t BOpc = Hexagon::J2_jump; - Opcode_t BccOpc = Hexagon::J2_jumpt; +/// Gather register def/uses from MI. +/// This treats possible (predicated) defs as actually happening ones +/// (conservatively). +static inline void parseOperands(const MachineInstr *MI, + SmallVector<unsigned, 4> &Defs, SmallVector<unsigned, 8> &Uses) { + Defs.clear(); + Uses.clear(); - assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); - // Check if ReverseBranchCondition has asked to reverse this branch - // If we want to reverse the branch an odd number of times, we want - // J2_jumpf. - if (!Cond.empty() && Cond[0].isImm()) - BccOpc = Cond[0].getImm(); + if (!MO.isReg()) + continue; - if (!FBB) { - if (Cond.empty()) { - // Due to a bug in TailMerging/CFG Optimization, we need to add a - // special case handling of a predicated jump followed by an - // unconditional jump. If not, Tail Merging and CFG Optimization go - // into an infinite loop. 
- MachineBasicBlock *NewTBB, *NewFBB; - SmallVector<MachineOperand, 4> Cond; - MachineInstr *Term = MBB.getFirstTerminator(); - if (Term != MBB.end() && isPredicated(Term) && - !AnalyzeBranch(MBB, NewTBB, NewFBB, Cond, false)) { - MachineBasicBlock *NextBB = - std::next(MachineFunction::iterator(&MBB)); - if (NewTBB == NextBB) { - ReverseBranchCondition(Cond); - RemoveBranch(MBB); - return InsertBranch(MBB, TBB, nullptr, Cond, DL); - } - } - BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB); - } else if (isEndLoopN(Cond[0].getImm())) { - int EndLoopOp = Cond[0].getImm(); - assert(Cond[1].isMBB()); - // Since we're adding an ENDLOOP, there better be a LOOP instruction. - // Check for it, and change the BB target if needed. - SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; - MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs); - assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP"); - Loop->getOperand(0).setMBB(TBB); - // Add the ENDLOOP after the finding the LOOP0. - BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB); - } else if (isNewValueJump(Cond[0].getImm())) { - assert((Cond.size() == 3) && "Only supporting rr/ri version of nvjump"); - // New value jump - // (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset) - // (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset) - unsigned Flags1 = getUndefRegState(Cond[1].isUndef()); - DEBUG(dbgs() << "\nInserting NVJump for BB#" << MBB.getNumber();); - if (Cond[2].isReg()) { - unsigned Flags2 = getUndefRegState(Cond[2].isUndef()); - BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1). - addReg(Cond[2].getReg(), Flags2).addMBB(TBB); - } else if(Cond[2].isImm()) { - BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1). - addImm(Cond[2].getImm()).addMBB(TBB); - } else - llvm_unreachable("Invalid condition for branching"); - } else { - assert((Cond.size() == 2) && "Malformed cond vector"); - const MachineOperand &RO = Cond[1]; - unsigned Flags = getUndefRegState(RO.isUndef()); - BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB); - } - return 1; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + + if (MO.isUse()) + Uses.push_back(MO.getReg()); + + if (MO.isDef()) + Defs.push_back(MO.getReg()); } - assert((!Cond.empty()) && - "Cond. cannot be empty when multiple branchings are required"); - assert((!isNewValueJump(Cond[0].getImm())) && - "NV-jump cannot be inserted with another branch"); - // Special case for hardware loops. The condition is a basic block. - if (isEndLoopN(Cond[0].getImm())) { - int EndLoopOp = Cond[0].getImm(); - assert(Cond[1].isMBB()); - // Since we're adding an ENDLOOP, there better be a LOOP instruction. - // Check for it, and change the BB target if needed. - SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; - MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs); - assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP"); - Loop->getOperand(0).setMBB(TBB); - // Add the ENDLOOP after the finding the LOOP0. - BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB); - } else { - const MachineOperand &RO = Cond[1]; - unsigned Flags = getUndefRegState(RO.isUndef()); - BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB); +} + + +// Position dependent, so check twice for swap. 
+static bool isDuplexPairMatch(unsigned Ga, unsigned Gb) { + switch (Ga) { + case HexagonII::HSIG_None: + default: + return false; + case HexagonII::HSIG_L1: + return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_L2: + return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 || + Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_S1: + return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 || + Gb == HexagonII::HSIG_S1 || Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_S2: + return (Gb == HexagonII::HSIG_L1 || Gb == HexagonII::HSIG_L2 || + Gb == HexagonII::HSIG_S1 || Gb == HexagonII::HSIG_S2 || + Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_A: + return (Gb == HexagonII::HSIG_A); + case HexagonII::HSIG_Compound: + return (Gb == HexagonII::HSIG_Compound); } - BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB); + return false; +} - return 2; + + +/// isLoadFromStackSlot - If the specified machine instruction is a direct +/// load from a stack slot, return the virtual or physical register number of +/// the destination along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than loading from the stack slot. +unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case Hexagon::L2_loadri_io: + case Hexagon::L2_loadrd_io: + case Hexagon::L2_loadrh_io: + case Hexagon::L2_loadrb_io: + case Hexagon::L2_loadrub_io: + if (MI->getOperand(2).isFI() && + MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { + FrameIndex = MI->getOperand(2).getIndex(); + return MI->getOperand(0).getReg(); + } + break; + } + return 0; +} + + +/// isStoreToStackSlot - If the specified machine instruction is a direct +/// store to a stack slot, return the virtual or physical register number of +/// the source reg along with the FrameIndex of the loaded stack slot. If +/// not, return 0. This predicate must return 0 if the instruction has +/// any side effects other than storing to the stack slot. +unsigned HexagonInstrInfo::isStoreToStackSlot(const MachineInstr *MI, + int &FrameIndex) const { + switch (MI->getOpcode()) { + default: break; + case Hexagon::S2_storeri_io: + case Hexagon::S2_storerd_io: + case Hexagon::S2_storerh_io: + case Hexagon::S2_storerb_io: + if (MI->getOperand(2).isFI() && + MI->getOperand(1).isImm() && (MI->getOperand(1).getImm() == 0)) { + FrameIndex = MI->getOperand(0).getIndex(); + return MI->getOperand(2).getReg(); + } + break; + } + return 0; } @@ -269,9 +289,6 @@ unsigned HexagonInstrInfo::InsertBranch( /// Cond[0] = Hexagon::CMPEQri_f_Jumpnv_t_V4 -- specific opcode /// Cond[1] = R /// Cond[2] = Imm -/// @note Related function is \fn findInstrPredicate which fills in -/// Cond. vector when a predicated instruction is passed to it. -/// We follow same protocol in that case too. /// bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, @@ -314,7 +331,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return false; --I; } - + bool JumpToBlock = I->getOpcode() == Hexagon::J2_jump && I->getOperand(0).isMBB(); // Delete the J2_jump if it's equivalent to a fall-through. @@ -327,17 +344,17 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return false; --I; } - if (!isUnpredicatedTerminator(I)) + if (!isUnpredicatedTerminator(&*I)) return false; // Get the last instruction in the block. 
- MachineInstr *LastInst = I; + MachineInstr *LastInst = &*I; MachineInstr *SecondLastInst = nullptr; // Find one more terminator if present. - do { - if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(I)) { + for (;;) { + if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(&*I)) { if (!SecondLastInst) - SecondLastInst = I; + SecondLastInst = &*I; else // This is a third branch. return true; @@ -345,7 +362,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, if (I == MBB.instr_begin()) break; --I; - } while(I); + } int LastOpcode = LastInst->getOpcode(); int SecLastOpcode = SecondLastInst ? SecondLastInst->getOpcode() : 0; @@ -418,7 +435,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, // executed, so remove it. if (SecLastOpcode == Hexagon::J2_jump && LastOpcode == Hexagon::J2_jump) { TBB = SecondLastInst->getOperand(0).getMBB(); - I = LastInst; + I = LastInst->getIterator(); if (AllowModify) I->eraseFromParent(); return false; @@ -438,6 +455,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, return true; } + unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { DEBUG(dbgs() << "\nRemoving branches out of BB#" << MBB.getNumber()); MachineBasicBlock::iterator I = MBB.end(); @@ -458,100 +476,127 @@ unsigned HexagonInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { return Count; } -/// \brief For a comparison instruction, return the source registers in -/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it -/// compares against in CmpValue. Return true if the comparison instruction -/// can be analyzed. -bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI, - unsigned &SrcReg, unsigned &SrcReg2, - int &Mask, int &Value) const { - unsigned Opc = MI->getOpcode(); - // Set mask and the first source register. - switch (Opc) { - case Hexagon::C2_cmpeq: - case Hexagon::C2_cmpeqp: - case Hexagon::C2_cmpgt: - case Hexagon::C2_cmpgtp: - case Hexagon::C2_cmpgtu: - case Hexagon::C2_cmpgtup: - case Hexagon::C4_cmpneq: - case Hexagon::C4_cmplte: - case Hexagon::C4_cmplteu: - case Hexagon::C2_cmpeqi: - case Hexagon::C2_cmpgti: - case Hexagon::C2_cmpgtui: - case Hexagon::C4_cmpneqi: - case Hexagon::C4_cmplteui: - case Hexagon::C4_cmpltei: - SrcReg = MI->getOperand(1).getReg(); - Mask = ~0; - break; - case Hexagon::A4_cmpbeq: - case Hexagon::A4_cmpbgt: - case Hexagon::A4_cmpbgtu: - case Hexagon::A4_cmpbeqi: - case Hexagon::A4_cmpbgti: - case Hexagon::A4_cmpbgtui: - SrcReg = MI->getOperand(1).getReg(); - Mask = 0xFF; - break; - case Hexagon::A4_cmpheq: - case Hexagon::A4_cmphgt: - case Hexagon::A4_cmphgtu: - case Hexagon::A4_cmpheqi: - case Hexagon::A4_cmphgti: - case Hexagon::A4_cmphgtui: - SrcReg = MI->getOperand(1).getReg(); - Mask = 0xFFFF; - break; - } +unsigned HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, DebugLoc DL) const { + unsigned BOpc = Hexagon::J2_jump; + unsigned BccOpc = Hexagon::J2_jumpt; + assert(validateBranchCond(Cond) && "Invalid branching condition"); + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); - // Set the value/second source register. 
- switch (Opc) { - case Hexagon::C2_cmpeq: - case Hexagon::C2_cmpeqp: - case Hexagon::C2_cmpgt: - case Hexagon::C2_cmpgtp: - case Hexagon::C2_cmpgtu: - case Hexagon::C2_cmpgtup: - case Hexagon::A4_cmpbeq: - case Hexagon::A4_cmpbgt: - case Hexagon::A4_cmpbgtu: - case Hexagon::A4_cmpheq: - case Hexagon::A4_cmphgt: - case Hexagon::A4_cmphgtu: - case Hexagon::C4_cmpneq: - case Hexagon::C4_cmplte: - case Hexagon::C4_cmplteu: - SrcReg2 = MI->getOperand(2).getReg(); - return true; + // Check if ReverseBranchCondition has asked to reverse this branch + // If we want to reverse the branch an odd number of times, we want + // J2_jumpf. + if (!Cond.empty() && Cond[0].isImm()) + BccOpc = Cond[0].getImm(); - case Hexagon::C2_cmpeqi: - case Hexagon::C2_cmpgtui: - case Hexagon::C2_cmpgti: - case Hexagon::C4_cmpneqi: - case Hexagon::C4_cmplteui: - case Hexagon::C4_cmpltei: - case Hexagon::A4_cmpbeqi: - case Hexagon::A4_cmpbgti: - case Hexagon::A4_cmpbgtui: - case Hexagon::A4_cmpheqi: - case Hexagon::A4_cmphgti: - case Hexagon::A4_cmphgtui: - SrcReg2 = 0; - Value = MI->getOperand(2).getImm(); - return true; + if (!FBB) { + if (Cond.empty()) { + // Due to a bug in TailMerging/CFG Optimization, we need to add a + // special case handling of a predicated jump followed by an + // unconditional jump. If not, Tail Merging and CFG Optimization go + // into an infinite loop. + MachineBasicBlock *NewTBB, *NewFBB; + SmallVector<MachineOperand, 4> Cond; + MachineInstr *Term = MBB.getFirstTerminator(); + if (Term != MBB.end() && isPredicated(Term) && + !AnalyzeBranch(MBB, NewTBB, NewFBB, Cond, false)) { + MachineBasicBlock *NextBB = &*++MBB.getIterator(); + if (NewTBB == NextBB) { + ReverseBranchCondition(Cond); + RemoveBranch(MBB); + return InsertBranch(MBB, TBB, nullptr, Cond, DL); + } + } + BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB); + } else if (isEndLoopN(Cond[0].getImm())) { + int EndLoopOp = Cond[0].getImm(); + assert(Cond[1].isMBB()); + // Since we're adding an ENDLOOP, there better be a LOOP instruction. + // Check for it, and change the BB target if needed. + SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; + MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs); + assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP"); + Loop->getOperand(0).setMBB(TBB); + // Add the ENDLOOP after the finding the LOOP0. + BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB); + } else if (isNewValueJump(Cond[0].getImm())) { + assert((Cond.size() == 3) && "Only supporting rr/ri version of nvjump"); + // New value jump + // (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset) + // (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset) + unsigned Flags1 = getUndefRegState(Cond[1].isUndef()); + DEBUG(dbgs() << "\nInserting NVJump for BB#" << MBB.getNumber();); + if (Cond[2].isReg()) { + unsigned Flags2 = getUndefRegState(Cond[2].isUndef()); + BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1). + addReg(Cond[2].getReg(), Flags2).addMBB(TBB); + } else if(Cond[2].isImm()) { + BuildMI(&MBB, DL, get(BccOpc)).addReg(Cond[1].getReg(), Flags1). + addImm(Cond[2].getImm()).addMBB(TBB); + } else + llvm_unreachable("Invalid condition for branching"); + } else { + assert((Cond.size() == 2) && "Malformed cond vector"); + const MachineOperand &RO = Cond[1]; + unsigned Flags = getUndefRegState(RO.isUndef()); + BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB); + } + return 1; } + assert((!Cond.empty()) && + "Cond. 
cannot be empty when multiple branchings are required"); + assert((!isNewValueJump(Cond[0].getImm())) && + "NV-jump cannot be inserted with another branch"); + // Special case for hardware loops. The condition is a basic block. + if (isEndLoopN(Cond[0].getImm())) { + int EndLoopOp = Cond[0].getImm(); + assert(Cond[1].isMBB()); + // Since we're adding an ENDLOOP, there better be a LOOP instruction. + // Check for it, and change the BB target if needed. + SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs; + MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs); + assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP"); + Loop->getOperand(0).setMBB(TBB); + // Add the ENDLOOP after the finding the LOOP0. + BuildMI(&MBB, DL, get(EndLoopOp)).addMBB(TBB); + } else { + const MachineOperand &RO = Cond[1]; + unsigned Flags = getUndefRegState(RO.isUndef()); + BuildMI(&MBB, DL, get(BccOpc)).addReg(RO.getReg(), Flags).addMBB(TBB); + } + BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB); - return false; + return 2; +} + + +bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCycles, unsigned ExtraPredCycles, + BranchProbability Probability) const { + return nonDbgBBSize(&MBB) <= 3; +} + + +bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, BranchProbability Probability) + const { + return nonDbgBBSize(&TMBB) <= 3 && nonDbgBBSize(&FMBB) <= 3; +} + + +bool HexagonInstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumInstrs, BranchProbability Probability) const { + return NumInstrs <= 4; } void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { + MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { + auto &HRI = getRegisterInfo(); if (Hexagon::IntRegsRegClass.contains(SrcReg, DestReg)) { BuildMI(MBB, I, DL, get(Hexagon::A2_tfr), DestReg).addReg(SrcReg); return; @@ -599,28 +644,74 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB, addReg(SrcReg, getKillRegState(KillSrc)); return; } + if (Hexagon::PredRegsRegClass.contains(SrcReg) && + Hexagon::IntRegsRegClass.contains(DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::C2_tfrpr), DestReg). + addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (Hexagon::VectorRegsRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::V6_vassign), DestReg). + addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (Hexagon::VecDblRegsRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::V6_vcombine), DestReg). + addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_hireg), + getKillRegState(KillSrc)). + addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_loreg), + getKillRegState(KillSrc)); + return; + } + if (Hexagon::VecPredRegsRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), DestReg). + addReg(SrcReg). 
+ addReg(SrcReg, getKillRegState(KillSrc)); + return; + } + if (Hexagon::VecPredRegsRegClass.contains(SrcReg) && + Hexagon::VectorRegsRegClass.contains(DestReg)) { + llvm_unreachable("Unimplemented pred to vec"); + return; + } + if (Hexagon::VecPredRegsRegClass.contains(DestReg) && + Hexagon::VectorRegsRegClass.contains(SrcReg)) { + llvm_unreachable("Unimplemented vec to pred"); + return; + } + if (Hexagon::VecPredRegs128BRegClass.contains(SrcReg, DestReg)) { + BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), + HRI.getSubReg(DestReg, Hexagon::subreg_hireg)). + addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_hireg), + getKillRegState(KillSrc)); + BuildMI(MBB, I, DL, get(Hexagon::V6_pred_and), + HRI.getSubReg(DestReg, Hexagon::subreg_loreg)). + addReg(HRI.getSubReg(SrcReg, Hexagon::subreg_loreg), + getKillRegState(KillSrc)); + return; + } +#ifndef NDEBUG + // Show the invalid registers to ease debugging. + dbgs() << "Invalid registers for copy in BB#" << MBB.getNumber() + << ": " << PrintReg(DestReg, &HRI) + << " = " << PrintReg(SrcReg, &HRI) << '\n'; +#endif llvm_unreachable("Unimplemented"); } -void HexagonInstrInfo:: -storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned SrcReg, bool isKill, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { - +void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI, + const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - Align); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), Align); if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) { BuildMI(MBB, I, DL, get(Hexagon::S2_storeri_io)) @@ -640,33 +731,17 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, } -void HexagonInstrInfo::storeRegToAddr( - MachineFunction &MF, unsigned SrcReg, - bool isKill, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const -{ - llvm_unreachable("Unimplemented"); -} - - -void HexagonInstrInfo:: -loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - unsigned DestReg, int FI, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { +void HexagonInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, unsigned DestReg, int FI, + const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(I); MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - Align); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), Align); if (RC == &Hexagon::IntRegsRegClass) { BuildMI(MBB, I, DL, get(Hexagon::L2_loadri_io), DestReg) .addFrameIndex(FI).addImm(0).addMemOperand(MMO); @@ 
-682,27 +757,136 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, } -void HexagonInstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const { - llvm_unreachable("Unimplemented"); -} -bool -HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - const HexagonRegisterInfo &TRI = getRegisterInfo(); +/// expandPostRAPseudo - This function is called for all pseudo instructions +/// that remain after register allocation. Many pseudo instructions are +/// created to help register allocation. This is the place to convert them +/// into real instructions. The target can edit MI in place, or it can insert +/// new instructions and erase MI. The function should return true if +/// anything was changed. +bool HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) + const { + const HexagonRegisterInfo &HRI = getRegisterInfo(); MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); unsigned Opc = MI->getOpcode(); + const unsigned VecOffset = 1; + bool Is128B = false; switch (Opc) { case Hexagon::ALIGNA: BuildMI(MBB, MI, DL, get(Hexagon::A2_andir), MI->getOperand(0).getReg()) - .addReg(TRI.getFrameRegister()) + .addReg(HRI.getFrameRegister()) .addImm(-MI->getOperand(1).getImm()); MBB.erase(MI); return true; + case Hexagon::HEXAGON_V6_vassignp_128B: + case Hexagon::HEXAGON_V6_vassignp: { + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned DstReg = MI->getOperand(0).getReg(); + if (SrcReg != DstReg) + copyPhysReg(MBB, MI, DL, DstReg, SrcReg, MI->getOperand(1).isKill()); + MBB.erase(MI); + return true; + } + case Hexagon::HEXAGON_V6_lo_128B: + case Hexagon::HEXAGON_V6_lo: { + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::subreg_loreg); + copyPhysReg(MBB, MI, DL, DstReg, SrcSubLo, MI->getOperand(1).isKill()); + MBB.erase(MI); + MRI.clearKillFlags(SrcSubLo); + return true; + } + case Hexagon::HEXAGON_V6_hi_128B: + case Hexagon::HEXAGON_V6_hi: { + unsigned SrcReg = MI->getOperand(1).getReg(); + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::subreg_hireg); + copyPhysReg(MBB, MI, DL, DstReg, SrcSubHi, MI->getOperand(1).isKill()); + MBB.erase(MI); + MRI.clearKillFlags(SrcSubHi); + return true; + } + case Hexagon::STrivv_indexed_128B: + Is128B = true; + case Hexagon::STrivv_indexed: { + unsigned SrcReg = MI->getOperand(2).getReg(); + unsigned SrcSubHi = HRI.getSubReg(SrcReg, Hexagon::subreg_hireg); + unsigned SrcSubLo = HRI.getSubReg(SrcReg, Hexagon::subreg_loreg); + unsigned NewOpcd = Is128B ? Hexagon::V6_vS32b_ai_128B + : Hexagon::V6_vS32b_ai; + unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6; + MachineInstr *MI1New = BuildMI(MBB, MI, DL, get(NewOpcd)) + .addOperand(MI->getOperand(0)) + .addImm(MI->getOperand(1).getImm()) + .addReg(SrcSubLo) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MI1New->getOperand(0).setIsKill(false); + BuildMI(MBB, MI, DL, get(NewOpcd)) + .addOperand(MI->getOperand(0)) + // The Vectors are indexed in multiples of vector size. 
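+      // (Offset above is VecOffset << 6, i.e. one 64-byte vector, or << 7 for
+      // the 128-byte _128B variants, so this addresses the second half of the
+      // vector pair.)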
+ .addImm(MI->getOperand(1).getImm()+Offset) + .addReg(SrcSubHi) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MBB.erase(MI); + return true; + } + case Hexagon::LDrivv_pseudo_V6_128B: + case Hexagon::LDrivv_indexed_128B: + Is128B = true; + case Hexagon::LDrivv_pseudo_V6: + case Hexagon::LDrivv_indexed: { + unsigned NewOpcd = Is128B ? Hexagon::V6_vL32b_ai_128B + : Hexagon::V6_vL32b_ai; + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned Offset = Is128B ? VecOffset << 7 : VecOffset << 6; + MachineInstr *MI1New = + BuildMI(MBB, MI, DL, get(NewOpcd), + HRI.getSubReg(DstReg, Hexagon::subreg_loreg)) + .addOperand(MI->getOperand(1)) + .addImm(MI->getOperand(2).getImm()); + MI1New->getOperand(1).setIsKill(false); + BuildMI(MBB, MI, DL, get(NewOpcd), + HRI.getSubReg(DstReg, Hexagon::subreg_hireg)) + .addOperand(MI->getOperand(1)) + // The Vectors are indexed in multiples of vector size. + .addImm(MI->getOperand(2).getImm() + Offset) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MBB.erase(MI); + return true; + } + case Hexagon::LDriv_pseudo_V6_128B: + Is128B = true; + case Hexagon::LDriv_pseudo_V6: { + unsigned DstReg = MI->getOperand(0).getReg(); + unsigned NewOpc = Is128B ? Hexagon::V6_vL32b_ai_128B + : Hexagon::V6_vL32b_ai; + int32_t Off = MI->getOperand(2).getImm(); + int32_t Idx = Off; + BuildMI(MBB, MI, DL, get(NewOpc), DstReg) + .addOperand(MI->getOperand(1)) + .addImm(Idx) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MBB.erase(MI); + return true; + } + case Hexagon::STriv_pseudo_V6_128B: + Is128B = true; + case Hexagon::STriv_pseudo_V6: { + unsigned NewOpc = Is128B ? Hexagon::V6_vS32b_ai_128B + : Hexagon::V6_vS32b_ai; + int32_t Off = MI->getOperand(1).getImm(); + int32_t Idx = Is128B ? (Off >> 7) : (Off >> 6); + BuildMI(MBB, MI, DL, get(NewOpc)) + .addOperand(MI->getOperand(0)) + .addImm(Idx) + .addOperand(MI->getOperand(2)) + .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MBB.erase(MI); + return true; + } case Hexagon::TFR_PdTrue: { unsigned Reg = MI->getOperand(0).getReg(); BuildMI(MBB, MI, DL, get(Hexagon::C2_orn), Reg) @@ -724,15 +908,15 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { unsigned DstReg = MI->getOperand(0).getReg(); unsigned Src1Reg = MI->getOperand(1).getReg(); unsigned Src2Reg = MI->getOperand(2).getReg(); - unsigned Src1SubHi = TRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); - unsigned Src1SubLo = TRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); - unsigned Src2SubHi = TRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); - unsigned Src2SubLo = TRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); + unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); + unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); + unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); + unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi), - TRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) + HRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) .addReg(Src2SubHi); BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_mpyi), - TRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) + HRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) .addReg(Src2SubLo); MBB.erase(MI); MRI.clearKillFlags(Src1SubHi); @@ -747,17 +931,17 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { unsigned Src1Reg = MI->getOperand(1).getReg(); 
unsigned Src2Reg = MI->getOperand(2).getReg(); unsigned Src3Reg = MI->getOperand(3).getReg(); - unsigned Src1SubHi = TRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); - unsigned Src1SubLo = TRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); - unsigned Src2SubHi = TRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); - unsigned Src2SubLo = TRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); - unsigned Src3SubHi = TRI.getSubReg(Src3Reg, Hexagon::subreg_hireg); - unsigned Src3SubLo = TRI.getSubReg(Src3Reg, Hexagon::subreg_loreg); + unsigned Src1SubHi = HRI.getSubReg(Src1Reg, Hexagon::subreg_hireg); + unsigned Src1SubLo = HRI.getSubReg(Src1Reg, Hexagon::subreg_loreg); + unsigned Src2SubHi = HRI.getSubReg(Src2Reg, Hexagon::subreg_hireg); + unsigned Src2SubLo = HRI.getSubReg(Src2Reg, Hexagon::subreg_loreg); + unsigned Src3SubHi = HRI.getSubReg(Src3Reg, Hexagon::subreg_hireg); + unsigned Src3SubLo = HRI.getSubReg(Src3Reg, Hexagon::subreg_loreg); BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci), - TRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) + HRI.getSubReg(DstReg, Hexagon::subreg_hireg)).addReg(Src1SubHi) .addReg(Src2SubHi).addReg(Src3SubHi); BuildMI(MBB, MI, MI->getDebugLoc(), get(Hexagon::M2_maci), - TRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) + HRI.getSubReg(DstReg, Hexagon::subreg_loreg)).addReg(Src1SubLo) .addReg(Src2SubLo).addReg(Src3SubLo); MBB.erase(MI); MRI.clearKillFlags(Src1SubHi); @@ -768,104 +952,168 @@ HexagonInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { MRI.clearKillFlags(Src3SubLo); return true; } + case Hexagon::MUX64_rr: { + const MachineOperand &Op0 = MI->getOperand(0); + const MachineOperand &Op1 = MI->getOperand(1); + const MachineOperand &Op2 = MI->getOperand(2); + const MachineOperand &Op3 = MI->getOperand(3); + unsigned Rd = Op0.getReg(); + unsigned Pu = Op1.getReg(); + unsigned Rs = Op2.getReg(); + unsigned Rt = Op3.getReg(); + DebugLoc DL = MI->getDebugLoc(); + unsigned K1 = getKillRegState(Op1.isKill()); + unsigned K2 = getKillRegState(Op2.isKill()); + unsigned K3 = getKillRegState(Op3.isKill()); + if (Rd != Rs) + BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrpt), Rd) + .addReg(Pu, (Rd == Rt) ? K1 : 0) + .addReg(Rs, K2); + if (Rd != Rt) + BuildMI(MBB, MI, DL, get(Hexagon::A2_tfrpf), Rd) + .addReg(Pu, K1) + .addReg(Rt, K3); + MBB.erase(MI); + return true; + } case Hexagon::TCRETURNi: MI->setDesc(get(Hexagon::J2_jump)); return true; case Hexagon::TCRETURNr: MI->setDesc(get(Hexagon::J2_jumpr)); return true; + case Hexagon::TFRI_f: + case Hexagon::TFRI_cPt_f: + case Hexagon::TFRI_cNotPt_f: { + unsigned Opx = (Opc == Hexagon::TFRI_f) ? 1 : 2; + APFloat FVal = MI->getOperand(Opx).getFPImm()->getValueAPF(); + APInt IVal = FVal.bitcastToAPInt(); + MI->RemoveOperand(Opx); + unsigned NewOpc = (Opc == Hexagon::TFRI_f) ? Hexagon::A2_tfrsi : + (Opc == Hexagon::TFRI_cPt_f) ? Hexagon::C2_cmoveit : + Hexagon::C2_cmoveif; + MI->setDesc(get(NewOpc)); + MI->addOperand(MachineOperand::CreateImm(IVal.getZExtValue())); + return true; + } } return false; } -MachineInstr *HexagonInstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, int FI) const { - // Hexagon_TODO: Implement. - return nullptr; + +// We indicate that we want to reverse the branch by +// inserting the reversed branching opcode. 
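+// The condition produced by AnalyzeBranch keeps the branch opcode in Cond[0]
+// as an immediate, so reversing a condition simply swaps that opcode for the
+// one with the opposite predicate sense (e.g. J2_jumpt <-> J2_jumpf), and
+// InsertBranch later uses Cond[0] directly as the conditional-jump opcode.
+// ENDLOOP hardware-loop branches cannot be reversed, so we report failure
+// for them.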
+bool HexagonInstrInfo::ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + if (Cond.empty()) + return true; + assert(Cond[0].isImm() && "First entry in the cond vector not imm-val"); + unsigned opcode = Cond[0].getImm(); + //unsigned temp; + assert(get(opcode).isBranch() && "Should be a branching condition."); + if (isEndLoopN(opcode)) + return true; + unsigned NewOpcode = getInvertedPredicatedOpcode(opcode); + Cond[0].setImm(NewOpcode); + return false; } -unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const { - MachineRegisterInfo &RegInfo = MF->getRegInfo(); - const TargetRegisterClass *TRC; - if (VT == MVT::i1) { - TRC = &Hexagon::PredRegsRegClass; - } else if (VT == MVT::i32 || VT == MVT::f32) { - TRC = &Hexagon::IntRegsRegClass; - } else if (VT == MVT::i64 || VT == MVT::f64) { - TRC = &Hexagon::DoubleRegsRegClass; - } else { - llvm_unreachable("Cannot handle this register class"); - } +void HexagonInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + DebugLoc DL; + BuildMI(MBB, MI, DL, get(Hexagon::A2_nop)); +} - unsigned NewReg = RegInfo.createVirtualRegister(TRC); - return NewReg; + +// Returns true if an instruction is predicated irrespective of the predicate +// sense. For example, all of the following will return true. +// if (p0) R1 = add(R2, R3) +// if (!p0) R1 = add(R2, R3) +// if (p0.new) R1 = add(R2, R3) +// if (!p0.new) R1 = add(R2, R3) +// Note: New-value stores are not included here as in the current +// implementation, we don't need to check their predicate sense. +bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask; } -bool HexagonInstrInfo::isExtendable(const MachineInstr *MI) const { - const MCInstrDesc &MID = MI->getDesc(); - const uint64_t F = MID.TSFlags; - if ((F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask) - return true; - // TODO: This is largely obsolete now. Will need to be removed - // in consecutive patches. - switch(MI->getOpcode()) { - // TFR_FI Remains a special case. - case Hexagon::TFR_FI: - return true; - default: - return false; +bool HexagonInstrInfo::PredicateInstruction(MachineInstr *MI, + ArrayRef<MachineOperand> Cond) const { + if (Cond.empty() || isNewValueJump(Cond[0].getImm()) || + isEndLoopN(Cond[0].getImm())) { + DEBUG(dbgs() << "\nCannot predicate:"; MI->dump();); + return false; } - return false; -} + int Opc = MI->getOpcode(); + assert (isPredicable(MI) && "Expected predicable instruction"); + bool invertJump = predOpcodeHasNot(Cond); -// This returns true in two cases: -// - The OP code itself indicates that this is an extended instruction. -// - One of MOs has been marked with HMOTF_ConstExtended flag. -bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const { - // First check if this is permanently extended op code. - const uint64_t F = MI->getDesc().TSFlags; - if ((F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask) - return true; - // Use MO operand flags to determine if one of MI's operands - // has HMOTF_ConstExtended flag set. - for (MachineInstr::const_mop_iterator I = MI->operands_begin(), - E = MI->operands_end(); I != E; ++I) { - if (I->getTargetFlags() && HexagonII::HMOTF_ConstExtended) - return true; + // We have to predicate MI "in place", i.e. after this function returns, + // MI will need to be transformed into a predicated form. 
To avoid com- + // plicated manipulations with the operands (handling tied operands, + // etc.), build a new temporary instruction, then overwrite MI with it. + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned PredOpc = getCondOpcode(Opc, invertJump); + MachineInstrBuilder T = BuildMI(B, MI, DL, get(PredOpc)); + unsigned NOp = 0, NumOps = MI->getNumOperands(); + while (NOp < NumOps) { + MachineOperand &Op = MI->getOperand(NOp); + if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) + break; + T.addOperand(Op); + NOp++; } - return false; -} -bool HexagonInstrInfo::isBranch (const MachineInstr *MI) const { - return MI->getDesc().isBranch(); -} + unsigned PredReg, PredRegPos, PredRegFlags; + bool GotPredReg = getPredReg(Cond, PredReg, PredRegPos, PredRegFlags); + (void)GotPredReg; + assert(GotPredReg); + T.addReg(PredReg, PredRegFlags); + while (NOp < NumOps) + T.addOperand(MI->getOperand(NOp++)); -bool HexagonInstrInfo::isNewValueInst(const MachineInstr *MI) const { - if (isNewValueJump(MI)) - return true; + MI->setDesc(get(PredOpc)); + while (unsigned n = MI->getNumOperands()) + MI->RemoveOperand(n-1); + for (unsigned i = 0, n = T->getNumOperands(); i < n; ++i) + MI->addOperand(T->getOperand(i)); - if (isNewValueStore(MI)) - return true; + MachineBasicBlock::instr_iterator TI = T->getIterator(); + B.erase(TI); - return false; + MachineRegisterInfo &MRI = B.getParent()->getRegInfo(); + MRI.clearKillFlags(PredReg); + return true; } -bool HexagonInstrInfo::isNewValue(const MachineInstr* MI) const { - const uint64_t F = MI->getDesc().TSFlags; - return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask); -} -bool HexagonInstrInfo::isNewValue(Opcode_t Opcode) const { - const uint64_t F = get(Opcode).TSFlags; - return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask); +bool HexagonInstrInfo::SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const { + // TODO: Fix this + return false; } -bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const { - return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4; + +bool HexagonInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + auto &HRI = getRegisterInfo(); + for (unsigned oper = 0; oper < MI->getNumOperands(); ++oper) { + MachineOperand MO = MI->getOperand(oper); + if (MO.isReg() && MO.isDef()) { + const TargetRegisterClass* RC = HRI.getMinimalPhysRegClass(MO.getReg()); + if (RC == &Hexagon::PredRegsRegClass) { + Pred.push_back(MO); + return true; + } + } + } + return false; } bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const { @@ -875,10 +1123,21 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const { return false; const int Opc = MI->getOpcode(); + int NumOperands = MI->getNumOperands(); + + // Keep a flag for upto 4 operands in the instructions, to indicate if + // that operand has been constant extended. 
+ bool OpCExtended[4]; + if (NumOperands > 4) + NumOperands = 4; + + for (int i = 0; i < NumOperands; i++) + OpCExtended[i] = (isOperandExtended(MI, i) && isConstExtended(MI)); switch(Opc) { case Hexagon::A2_tfrsi: - return (isOperandExtended(MI, 1) && isConstExtended(MI)) || isInt<12>(MI->getOperand(1).getImm()); + return (isOperandExtended(MI, 1) && isConstExtended(MI)) || + isInt<12>(MI->getOperand(1).getImm()); case Hexagon::S2_storerd_io: return isShiftedUInt<6,3>(MI->getOperand(1).getImm()); @@ -926,8 +1185,8 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const { case Hexagon::S4_storeirb_io: case Hexagon::S4_storeirh_io: case Hexagon::S4_storeiri_io: - return (isUInt<6>(MI->getOperand(1).getImm()) && - isInt<6>(MI->getOperand(2).getImm())); + return (OpCExtended[1] || isUInt<6>(MI->getOperand(1).getImm())) && + (OpCExtended[2] || isInt<6>(MI->getOperand(2).getImm())); case Hexagon::A2_addi: return isInt<8>(MI->getOperand(2).getImm()); @@ -944,269 +1203,1117 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const { return true; } -// This function performs the following inversiones: -// -// cPt ---> cNotPt -// cNotPt ---> cPt -// -unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { - int InvPredOpcode; - InvPredOpcode = isPredicatedTrue(Opc) ? Hexagon::getFalsePredOpcode(Opc) - : Hexagon::getTruePredOpcode(Opc); - if (InvPredOpcode >= 0) // Valid instruction with the inverted predicate. - return InvPredOpcode; - switch(Opc) { - default: llvm_unreachable("Unexpected predicated instruction"); - case Hexagon::C2_ccombinewt: - return Hexagon::C2_ccombinewf; +bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, const MachineFunction &MF) const { + // Debug info is never a scheduling boundary. It's necessary to be explicit + // due to the special treatment of IT instructions below, otherwise a + // dbg_value followed by an IT will result in the IT instruction being + // considered a scheduling hazard, which is wrong. It should be the actual + // instruction preceding the dbg_value instruction(s), just like it is + // when debug info is not present. + if (MI->isDebugValue()) + return false; + + // Throwing call is a boundary. + if (MI->isCall()) { + // If any of the block's successors is a landing pad, this could be a + // throwing call. + for (auto I : MBB->successors()) + if (I->isEHPad()) + return true; + } + + // Don't mess around with no return calls. + if (MI->getOpcode() == Hexagon::CALLv3nr) + return true; + + // Terminators and labels can't be scheduled around. + if (MI->getDesc().isTerminator() || MI->isPosition()) + return true; + + if (MI->isInlineAsm() && !ScheduleInlineAsm) + return true; + + return false; +} + + +/// Measure the specified inline asm to determine an approximation of its +/// length. +/// Comments (which run till the next SeparatorString or newline) do not +/// count as an instruction. +/// Any other non-whitespace text is considered an instruction, with +/// multiple instructions separated by SeparatorString or newlines. +/// Variable-length instructions are not handled here; this function +/// may be overloaded in the target code to do that. +/// Hexagon counts the number of ##'s and adjust for that many +/// constant exenders. +unsigned HexagonInstrInfo::getInlineAsmLength(const char *Str, + const MCAsmInfo &MAI) const { + StringRef AStr(Str); + // Count the number of instructions in the asm. 
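+  // Each "##" in the string marks a constant extender; after the instruction
+  // count below, four bytes are added for every occurrence.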
+ bool atInsnStart = true; + unsigned Length = 0; + for (; *Str; ++Str) { + if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(), + strlen(MAI.getSeparatorString())) == 0) + atInsnStart = true; + if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) { + Length += MAI.getMaxInstLength(); + atInsnStart = false; + } + if (atInsnStart && strncmp(Str, MAI.getCommentString(), + strlen(MAI.getCommentString())) == 0) + atInsnStart = false; + } + + // Add to size number of constant extenders seen * 4. + StringRef Occ("##"); + Length += AStr.count(Occ)*4; + return Length; +} + + +ScheduleHazardRecognizer* +HexagonInstrInfo::CreateTargetPostRAHazardRecognizer( + const InstrItineraryData *II, const ScheduleDAG *DAG) const { + return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG); +} + + +/// \brief For a comparison instruction, return the source registers in +/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it +/// compares against in CmpValue. Return true if the comparison instruction +/// can be analyzed. +bool HexagonInstrInfo::analyzeCompare(const MachineInstr *MI, + unsigned &SrcReg, unsigned &SrcReg2, int &Mask, int &Value) const { + unsigned Opc = MI->getOpcode(); + + // Set mask and the first source register. + switch (Opc) { + case Hexagon::C2_cmpeq: + case Hexagon::C2_cmpeqp: + case Hexagon::C2_cmpgt: + case Hexagon::C2_cmpgtp: + case Hexagon::C2_cmpgtu: + case Hexagon::C2_cmpgtup: + case Hexagon::C4_cmpneq: + case Hexagon::C4_cmplte: + case Hexagon::C4_cmplteu: + case Hexagon::C2_cmpeqi: + case Hexagon::C2_cmpgti: + case Hexagon::C2_cmpgtui: + case Hexagon::C4_cmpneqi: + case Hexagon::C4_cmplteui: + case Hexagon::C4_cmpltei: + SrcReg = MI->getOperand(1).getReg(); + Mask = ~0; + break; + case Hexagon::A4_cmpbeq: + case Hexagon::A4_cmpbgt: + case Hexagon::A4_cmpbgtu: + case Hexagon::A4_cmpbeqi: + case Hexagon::A4_cmpbgti: + case Hexagon::A4_cmpbgtui: + SrcReg = MI->getOperand(1).getReg(); + Mask = 0xFF; + break; + case Hexagon::A4_cmpheq: + case Hexagon::A4_cmphgt: + case Hexagon::A4_cmphgtu: + case Hexagon::A4_cmpheqi: + case Hexagon::A4_cmphgti: + case Hexagon::A4_cmphgtui: + SrcReg = MI->getOperand(1).getReg(); + Mask = 0xFFFF; + break; + } + + // Set the value/second source register. 
+  switch (Opc) {
+    case Hexagon::C2_cmpeq:
+    case Hexagon::C2_cmpeqp:
+    case Hexagon::C2_cmpgt:
+    case Hexagon::C2_cmpgtp:
+    case Hexagon::C2_cmpgtu:
+    case Hexagon::C2_cmpgtup:
+    case Hexagon::A4_cmpbeq:
+    case Hexagon::A4_cmpbgt:
+    case Hexagon::A4_cmpbgtu:
+    case Hexagon::A4_cmpheq:
+    case Hexagon::A4_cmphgt:
+    case Hexagon::A4_cmphgtu:
+    case Hexagon::C4_cmpneq:
+    case Hexagon::C4_cmplte:
+    case Hexagon::C4_cmplteu:
+      SrcReg2 = MI->getOperand(2).getReg();
+      return true;
+
+    case Hexagon::C2_cmpeqi:
+    case Hexagon::C2_cmpgtui:
+    case Hexagon::C2_cmpgti:
+    case Hexagon::C4_cmpneqi:
+    case Hexagon::C4_cmplteui:
+    case Hexagon::C4_cmpltei:
+    case Hexagon::A4_cmpbeqi:
+    case Hexagon::A4_cmpbgti:
+    case Hexagon::A4_cmpbgtui:
+    case Hexagon::A4_cmpheqi:
+    case Hexagon::A4_cmphgti:
+    case Hexagon::A4_cmphgtui:
+      SrcReg2 = 0;
+      Value = MI->getOperand(2).getImm();
+      return true;
+  }
+
+  return false;
+}
+
+
+unsigned HexagonInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+      const MachineInstr *MI, unsigned *PredCost) const {
+  return getInstrTimingClassLatency(ItinData, MI);
+}
+
+
+DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState(
+    const TargetSubtargetInfo &STI) const {
+  const InstrItineraryData *II = STI.getInstrItineraryData();
+  return static_cast<const HexagonSubtarget&>(STI).createDFAPacketizer(II);
+}
+
+
+// Inspired by this pair:
+//   %R13<def> = L2_loadri_io %R29, 136; mem:LD4[FixedStack0]
+//   S2_storeri_io %R29, 132, %R1<kill>; flags: mem:ST4[FixedStack1]
+// Currently AA considers the addresses in these instructions to be aliasing.
+bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
+      MachineInstr *MIb, AliasAnalysis *AA) const {
+  int OffsetA = 0, OffsetB = 0;
+  unsigned SizeA = 0, SizeB = 0;
+
+  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() ||
+      MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+    return false;
+
+  // Instructions that are pure loads, not loads and stores like memops, are
+  // not dependent.
+  if (MIa->mayLoad() && !isMemOp(MIa) && MIb->mayLoad() && !isMemOp(MIb))
+    return true;
+
+  // Get base, offset, and access size in MIa.
+  unsigned BaseRegA = getBaseAndOffset(MIa, OffsetA, SizeA);
+  if (!BaseRegA || !SizeA)
+    return false;
+
+  // Get base, offset, and access size in MIb.
+  unsigned BaseRegB = getBaseAndOffset(MIb, OffsetB, SizeB);
+  if (!BaseRegB || !SizeB)
+    return false;
+
+  if (BaseRegA != BaseRegB)
+    return false;
+
+  // This is a mem access with the same base register and known offsets from it.
+  // Reason about it.
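+  // The accesses cover [OffsetA, OffsetA+SizeA) and [OffsetB, OffsetB+SizeB).
+  // With a common base they are disjoint exactly when the lower interval ends
+  // at or before the higher one starts, which is what the checks below verify.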
+  if (OffsetA > OffsetB) {
+    uint64_t offDiff = (uint64_t)((int64_t)OffsetA - (int64_t)OffsetB);
+    return (SizeB <= offDiff);
+  } else if (OffsetA < OffsetB) {
+    uint64_t offDiff = (uint64_t)((int64_t)OffsetB - (int64_t)OffsetA);
+    return (SizeA <= offDiff);
+  }
+
+  return false;
+}
+
+
+unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterClass *TRC;
+  if (VT == MVT::i1) {
+    TRC = &Hexagon::PredRegsRegClass;
+  } else if (VT == MVT::i32 || VT == MVT::f32) {
+    TRC = &Hexagon::IntRegsRegClass;
+  } else if (VT == MVT::i64 || VT == MVT::f64) {
+    TRC = &Hexagon::DoubleRegsRegClass;
+  } else {
+    llvm_unreachable("Cannot handle this register class");
+  }
+
+  unsigned NewReg = MRI.createVirtualRegister(TRC);
+  return NewReg;
+}
+
+
+bool HexagonInstrInfo::isAbsoluteSet(const MachineInstr* MI) const {
+  return (getAddrMode(MI) == HexagonII::AbsoluteSet);
+}
+
+
+bool HexagonInstrInfo::isAccumulator(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+  return((F >> HexagonII::AccumulatorPos) & HexagonII::AccumulatorMask);
+}
+
+
+bool HexagonInstrInfo::isComplex(const MachineInstr *MI) const {
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+
+  if (!(isTC1(MI))
+      && !(QII->isTC2Early(MI))
+      && !(MI->getDesc().mayLoad())
+      && !(MI->getDesc().mayStore())
+      && (MI->getDesc().getOpcode() != Hexagon::S2_allocframe)
+      && (MI->getDesc().getOpcode() != Hexagon::L2_deallocframe)
+      && !(QII->isMemOp(MI))
+      && !(MI->isBranch())
+      && !(MI->isReturn())
+      && !MI->isCall())
+    return true;
+
+  return false;
+}
+
+
+// Return true if the instruction is a compound branch instruction.
+bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr *MI) const {
+  return (getType(MI) == HexagonII::TypeCOMPOUND && MI->isBranch());
+}
+
+
+bool HexagonInstrInfo::isCondInst(const MachineInstr *MI) const {
+  return (MI->isBranch() && isPredicated(MI)) ||
+         isConditionalTransfer(MI) ||
+         isConditionalALU32(MI)    ||
+         isConditionalLoad(MI)     ||
+         // Predicated stores which don't have a .new on any operands.
+ (MI->mayStore() && isPredicated(MI) && !isNewValueStore(MI) && + !isPredicatedNew(MI)); +} + + +bool HexagonInstrInfo::isConditionalALU32(const MachineInstr* MI) const { + switch (MI->getOpcode()) { + case Hexagon::A2_paddf: + case Hexagon::A2_paddfnew: + case Hexagon::A2_paddif: + case Hexagon::A2_paddifnew: + case Hexagon::A2_paddit: + case Hexagon::A2_padditnew: + case Hexagon::A2_paddt: + case Hexagon::A2_paddtnew: + case Hexagon::A2_pandf: + case Hexagon::A2_pandfnew: + case Hexagon::A2_pandt: + case Hexagon::A2_pandtnew: + case Hexagon::A2_porf: + case Hexagon::A2_porfnew: + case Hexagon::A2_port: + case Hexagon::A2_portnew: + case Hexagon::A2_psubf: + case Hexagon::A2_psubfnew: + case Hexagon::A2_psubt: + case Hexagon::A2_psubtnew: + case Hexagon::A2_pxorf: + case Hexagon::A2_pxorfnew: + case Hexagon::A2_pxort: + case Hexagon::A2_pxortnew: + case Hexagon::A4_paslhf: + case Hexagon::A4_paslhfnew: + case Hexagon::A4_paslht: + case Hexagon::A4_paslhtnew: + case Hexagon::A4_pasrhf: + case Hexagon::A4_pasrhfnew: + case Hexagon::A4_pasrht: + case Hexagon::A4_pasrhtnew: + case Hexagon::A4_psxtbf: + case Hexagon::A4_psxtbfnew: + case Hexagon::A4_psxtbt: + case Hexagon::A4_psxtbtnew: + case Hexagon::A4_psxthf: + case Hexagon::A4_psxthfnew: + case Hexagon::A4_psxtht: + case Hexagon::A4_psxthtnew: + case Hexagon::A4_pzxtbf: + case Hexagon::A4_pzxtbfnew: + case Hexagon::A4_pzxtbt: + case Hexagon::A4_pzxtbtnew: + case Hexagon::A4_pzxthf: + case Hexagon::A4_pzxthfnew: + case Hexagon::A4_pzxtht: + case Hexagon::A4_pzxthtnew: case Hexagon::C2_ccombinewf: - return Hexagon::C2_ccombinewt; + case Hexagon::C2_ccombinewt: + return true; + } + return false; +} + + +// FIXME - Function name and it's functionality don't match. +// It should be renamed to hasPredNewOpcode() +bool HexagonInstrInfo::isConditionalLoad(const MachineInstr* MI) const { + if (!MI->getDesc().mayLoad() || !isPredicated(MI)) + return false; + + int PNewOpcode = Hexagon::getPredNewOpcode(MI->getOpcode()); + // Instruction with valid predicated-new opcode can be promoted to .new. + return PNewOpcode >= 0; +} + - // Dealloc_return. - case Hexagon::L4_return_t: - return Hexagon::L4_return_f; - case Hexagon::L4_return_f: - return Hexagon::L4_return_t; +// Returns true if an instruction is a conditional store. +// +// Note: It doesn't include conditional new-value stores as they can't be +// converted to .new predicate. 
+bool HexagonInstrInfo::isConditionalStore(const MachineInstr* MI) const { + switch (MI->getOpcode()) { + default: return false; + case Hexagon::S4_storeirbt_io: + case Hexagon::S4_storeirbf_io: + case Hexagon::S4_pstorerbt_rr: + case Hexagon::S4_pstorerbf_rr: + case Hexagon::S2_pstorerbt_io: + case Hexagon::S2_pstorerbf_io: + case Hexagon::S2_pstorerbt_pi: + case Hexagon::S2_pstorerbf_pi: + case Hexagon::S2_pstorerdt_io: + case Hexagon::S2_pstorerdf_io: + case Hexagon::S4_pstorerdt_rr: + case Hexagon::S4_pstorerdf_rr: + case Hexagon::S2_pstorerdt_pi: + case Hexagon::S2_pstorerdf_pi: + case Hexagon::S2_pstorerht_io: + case Hexagon::S2_pstorerhf_io: + case Hexagon::S4_storeirht_io: + case Hexagon::S4_storeirhf_io: + case Hexagon::S4_pstorerht_rr: + case Hexagon::S4_pstorerhf_rr: + case Hexagon::S2_pstorerht_pi: + case Hexagon::S2_pstorerhf_pi: + case Hexagon::S2_pstorerit_io: + case Hexagon::S2_pstorerif_io: + case Hexagon::S4_storeirit_io: + case Hexagon::S4_storeirif_io: + case Hexagon::S4_pstorerit_rr: + case Hexagon::S4_pstorerif_rr: + case Hexagon::S2_pstorerit_pi: + case Hexagon::S2_pstorerif_pi: + + // V4 global address store before promoting to dot new. + case Hexagon::S4_pstorerdt_abs: + case Hexagon::S4_pstorerdf_abs: + case Hexagon::S4_pstorerbt_abs: + case Hexagon::S4_pstorerbf_abs: + case Hexagon::S4_pstorerht_abs: + case Hexagon::S4_pstorerhf_abs: + case Hexagon::S4_pstorerit_abs: + case Hexagon::S4_pstorerif_abs: + return true; + + // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded + // from the "Conditional Store" list. Because a predicated new value store + // would NOT be promoted to a double dot new store. + // This function returns yes for those stores that are predicated but not + // yet promoted to predicate dot new instructions. } } -// New Value Store instructions. -bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const { + +bool HexagonInstrInfo::isConditionalTransfer(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + case Hexagon::A2_tfrt: + case Hexagon::A2_tfrf: + case Hexagon::C2_cmoveit: + case Hexagon::C2_cmoveif: + case Hexagon::A2_tfrtnew: + case Hexagon::A2_tfrfnew: + case Hexagon::C2_cmovenewit: + case Hexagon::C2_cmovenewif: + case Hexagon::A2_tfrpt: + case Hexagon::A2_tfrpf: + return true; + + default: + return false; + } + return false; +} + + +// TODO: In order to have isExtendable for fpimm/f32Ext, we need to handle +// isFPImm and later getFPImm as well. +bool HexagonInstrInfo::isConstExtended(const MachineInstr *MI) const { const uint64_t F = MI->getDesc().TSFlags; + unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask; + if (isExtended) // Instruction must be extended. + return true; + + unsigned isExtendable = + (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask; + if (!isExtendable) + return false; + + if (MI->isCall()) + return false; + + short ExtOpNum = getCExtOpNum(MI); + const MachineOperand &MO = MI->getOperand(ExtOpNum); + // Use MO operand flags to determine if MO + // has the HMOTF_ConstExtended flag set. + if (MO.getTargetFlags() && HexagonII::HMOTF_ConstExtended) + return true; + // If this is a Machine BB address we are talking about, and it is + // not marked as extended, say so. + if (MO.isMBB()) + return false; + + // We could be using an instruction with an extendable immediate and shoehorn + // a global address into it. If it is a global address it will be constant + // extended. We do this for COMBINE. 
+ // We currently only handle isGlobal() because it is the only kind of + // object we are going to end up with here for now. + // In the future we probably should add isSymbol(), etc. + if (MO.isGlobal() || MO.isSymbol() || MO.isBlockAddress() || + MO.isJTI() || MO.isCPI()) + return true; - return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask); + // If the extendable operand is not 'Immediate' type, the instruction should + // have 'isExtended' flag set. + assert(MO.isImm() && "Extendable operand must be Immediate type"); + + int MinValue = getMinValue(MI); + int MaxValue = getMaxValue(MI); + int ImmValue = MO.getImm(); + + return (ImmValue < MinValue || ImmValue > MaxValue); } -bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const { + +bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + case Hexagon::L4_return : + case Hexagon::L4_return_t : + case Hexagon::L4_return_f : + case Hexagon::L4_return_tnew_pnt : + case Hexagon::L4_return_fnew_pnt : + case Hexagon::L4_return_tnew_pt : + case Hexagon::L4_return_fnew_pt : + return true; + } + return false; +} + + +// Return true when ConsMI uses a register defined by ProdMI. +bool HexagonInstrInfo::isDependent(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const { + const MCInstrDesc &ProdMCID = ProdMI->getDesc(); + if (!ProdMCID.getNumDefs()) + return false; + + auto &HRI = getRegisterInfo(); + + SmallVector<unsigned, 4> DefsA; + SmallVector<unsigned, 4> DefsB; + SmallVector<unsigned, 8> UsesA; + SmallVector<unsigned, 8> UsesB; + + parseOperands(ProdMI, DefsA, UsesA); + parseOperands(ConsMI, DefsB, UsesB); + + for (auto &RegA : DefsA) + for (auto &RegB : UsesB) { + // True data dependency. + if (RegA == RegB) + return true; + + if (Hexagon::DoubleRegsRegClass.contains(RegA)) + for (MCSubRegIterator SubRegs(RegA, &HRI); SubRegs.isValid(); ++SubRegs) + if (RegB == *SubRegs) + return true; + + if (Hexagon::DoubleRegsRegClass.contains(RegB)) + for (MCSubRegIterator SubRegs(RegB, &HRI); SubRegs.isValid(); ++SubRegs) + if (RegA == *SubRegs) + return true; + } + + return false; +} + + +// Returns true if the instruction is alread a .cur. +bool HexagonInstrInfo::isDotCurInst(const MachineInstr* MI) const { + switch (MI->getOpcode()) { + case Hexagon::V6_vL32b_cur_pi: + case Hexagon::V6_vL32b_cur_ai: + case Hexagon::V6_vL32b_cur_pi_128B: + case Hexagon::V6_vL32b_cur_ai_128B: + return true; + } + return false; +} + + +// Returns true, if any one of the operands is a dot new +// insn, whether it is predicated dot new or register dot new. +bool HexagonInstrInfo::isDotNewInst(const MachineInstr* MI) const { + if (isNewValueInst(MI) || + (isPredicated(MI) && isPredicatedNew(MI))) + return true; + + return false; +} + + +/// Symmetrical. See if these two instructions are fit for duplex pair. 
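+/// Group compatibility (isDuplexPairMatch) is position dependent, so the pair
+/// is accepted if either ordering of the two candidate groups matches.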
+bool HexagonInstrInfo::isDuplexPair(const MachineInstr *MIa, + const MachineInstr *MIb) const { + HexagonII::SubInstructionGroup MIaG = getDuplexCandidateGroup(MIa); + HexagonII::SubInstructionGroup MIbG = getDuplexCandidateGroup(MIb); + return (isDuplexPairMatch(MIaG, MIbG) || isDuplexPairMatch(MIbG, MIaG)); +} + + +bool HexagonInstrInfo::isEarlySourceInstr(const MachineInstr *MI) const { + if (!MI) + return false; + + if (MI->mayLoad() || MI->mayStore() || MI->isCompare()) + return true; + + // Multiply + unsigned SchedClass = MI->getDesc().getSchedClass(); + if (SchedClass == Hexagon::Sched::M_tc_3or4x_SLOT23) + return true; + return false; +} + + +bool HexagonInstrInfo::isEndLoopN(unsigned Opcode) const { + return (Opcode == Hexagon::ENDLOOP0 || + Opcode == Hexagon::ENDLOOP1); +} + + +bool HexagonInstrInfo::isExpr(unsigned OpType) const { + switch(OpType) { + case MachineOperand::MO_MachineBasicBlock: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_BlockAddress: + return true; + default: + return false; + } +} + + +bool HexagonInstrInfo::isExtendable(const MachineInstr *MI) const { + const MCInstrDesc &MID = MI->getDesc(); + const uint64_t F = MID.TSFlags; + if ((F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask) + return true; + + // TODO: This is largely obsolete now. Will need to be removed + // in consecutive patches. + switch(MI->getOpcode()) { + // TFR_FI Remains a special case. + case Hexagon::TFR_FI: + return true; + default: + return false; + } + return false; +} + + +// This returns true in two cases: +// - The OP code itself indicates that this is an extended instruction. +// - One of MOs has been marked with HMOTF_ConstExtended flag. +bool HexagonInstrInfo::isExtended(const MachineInstr *MI) const { + // First check if this is permanently extended op code. + const uint64_t F = MI->getDesc().TSFlags; + if ((F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask) + return true; + // Use MO operand flags to determine if one of MI's operands + // has HMOTF_ConstExtended flag set. + for (MachineInstr::const_mop_iterator I = MI->operands_begin(), + E = MI->operands_end(); I != E; ++I) { + if (I->getTargetFlags() && HexagonII::HMOTF_ConstExtended) + return true; + } + return false; +} + + +bool HexagonInstrInfo::isFloat(const MachineInstr *MI) const { + unsigned Opcode = MI->getOpcode(); const uint64_t F = get(Opcode).TSFlags; + return (F >> HexagonII::FPPos) & HexagonII::FPMask; +} - return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask); + +// No V60 HVX VMEM with A_INDIRECT. +bool HexagonInstrInfo::isHVXMemWithAIndirect(const MachineInstr *I, + const MachineInstr *J) const { + if (!isV60VectorInstruction(I)) + return false; + if (!I->mayLoad() && !I->mayStore()) + return false; + return J->isIndirectBranch() || isIndirectCall(J) || isIndirectL4Return(J); } -int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const { - enum Hexagon::PredSense inPredSense; - inPredSense = invertPredicate ? Hexagon::PredSense_false : - Hexagon::PredSense_true; - int CondOpcode = Hexagon::getPredOpcode(Opc, inPredSense); - if (CondOpcode >= 0) // Valid Conditional opcode/instruction - return CondOpcode; - // This switch case will be removed once all the instructions have been - // modified to use relation maps. - switch(Opc) { - case Hexagon::TFRI_f: - return !invertPredicate ? 
Hexagon::TFRI_cPt_f :
-                              Hexagon::TFRI_cNotPt_f;
-  case Hexagon::A2_combinew:
-    return !invertPredicate ? Hexagon::C2_ccombinewt :
-                              Hexagon::C2_ccombinewf;
+bool HexagonInstrInfo::isIndirectCall(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::J2_callr :
+  case Hexagon::J2_callrf :
+  case Hexagon::J2_callrt :
+    return true;
+  }
+  return false;
+}
-  // DEALLOC_RETURN.
-  case Hexagon::L4_return:
-    return !invertPredicate ? Hexagon::L4_return_t:
-                              Hexagon::L4_return_f;
+
+bool HexagonInstrInfo::isIndirectL4Return(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::L4_return :
+  case Hexagon::L4_return_t :
+  case Hexagon::L4_return_f :
+  case Hexagon::L4_return_fnew_pnt :
+  case Hexagon::L4_return_fnew_pt :
+  case Hexagon::L4_return_tnew_pnt :
+  case Hexagon::L4_return_tnew_pt :
+    return true;
   }
-  llvm_unreachable("Unexpected predicable instruction");
+  return false;
 }
-bool HexagonInstrInfo::
-PredicateInstruction(MachineInstr *MI,
-                     ArrayRef<MachineOperand> Cond) const {
-  if (Cond.empty() || isEndLoopN(Cond[0].getImm())) {
-    DEBUG(dbgs() << "\nCannot predicate:"; MI->dump(););
+bool HexagonInstrInfo::isJumpR(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case Hexagon::J2_jumpr :
+  case Hexagon::J2_jumprt :
+  case Hexagon::J2_jumprf :
+  case Hexagon::J2_jumprtnewpt :
+  case Hexagon::J2_jumprfnewpt :
+  case Hexagon::J2_jumprtnew :
+  case Hexagon::J2_jumprfnew :
+    return true;
+  }
+  return false;
+}
+
+
+// Return true if a given MI can accommodate a given offset.
+// Use an absolute estimate as opposed to the exact number.
+// TODO: This will need to be changed to use MC level
+// definition of instruction extendable field size.
+bool HexagonInstrInfo::isJumpWithinBranchRange(const MachineInstr *MI,
+      unsigned offset) const {
+  // This selection of jump instructions matches what AnalyzeBranch can parse,
+  // plus NVJ.
+  if (isNewValueJump(MI)) // r9:2
+    return isInt<11>(offset);
+
+  switch (MI->getOpcode()) {
+  // Still missing Jump to address condition on register value.
+  default: return false;
+  case Hexagon::J2_jump: // bits<24> dst; // r22:2
+  case Hexagon::J2_call:
+  case Hexagon::CALLv3nr:
+    return isInt<24>(offset);
+  case Hexagon::J2_jumpt: //bits<17> dst; // r15:2
+  case Hexagon::J2_jumpf:
+  case Hexagon::J2_jumptnew:
+  case Hexagon::J2_jumptnewpt:
+  case Hexagon::J2_jumpfnew:
+  case Hexagon::J2_jumpfnewpt:
+  case Hexagon::J2_callt:
+  case Hexagon::J2_callf:
+    return isInt<17>(offset);
+  case Hexagon::J2_loop0i:
+  case Hexagon::J2_loop0iext:
+  case Hexagon::J2_loop0r:
+  case Hexagon::J2_loop0rext:
+  case Hexagon::J2_loop1i:
+  case Hexagon::J2_loop1iext:
+  case Hexagon::J2_loop1r:
+  case Hexagon::J2_loop1rext:
+    return isInt<9>(offset);
+  // TODO: Add all the compound branches here. Can we do this in Relation model?
+  case Hexagon::J4_cmpeqi_tp0_jump_nt:
+  case Hexagon::J4_cmpeqi_tp1_jump_nt:
+    return isInt<11>(offset);
   }
-  int Opc = MI->getOpcode();
-  assert (isPredicable(MI) && "Expected predicable instruction");
-  bool invertJump = predOpcodeHasNot(Cond);
+}
-
-  // We have to predicate MI "in place", i.e. after this function returns,
-  // MI will need to be transformed into a predicated form. To avoid com-
-  // plicated manipulations with the operands (handling tied operands,
-  // etc.), build a new temporary instruction, then overwrite MI with it.
- MachineBasicBlock &B = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned PredOpc = getCondOpcode(Opc, invertJump); - MachineInstrBuilder T = BuildMI(B, MI, DL, get(PredOpc)); - unsigned NOp = 0, NumOps = MI->getNumOperands(); - while (NOp < NumOps) { - MachineOperand &Op = MI->getOperand(NOp); - if (!Op.isReg() || !Op.isDef() || Op.isImplicit()) - break; - T.addOperand(Op); - NOp++; +bool HexagonInstrInfo::isLateInstrFeedsEarlyInstr(const MachineInstr *LRMI, + const MachineInstr *ESMI) const { + if (!LRMI || !ESMI) + return false; + + bool isLate = isLateResultInstr(LRMI); + bool isEarly = isEarlySourceInstr(ESMI); + + DEBUG(dbgs() << "V60" << (isLate ? "-LR " : " -- ")); + DEBUG(LRMI->dump()); + DEBUG(dbgs() << "V60" << (isEarly ? "-ES " : " -- ")); + DEBUG(ESMI->dump()); + + if (isLate && isEarly) { + DEBUG(dbgs() << "++Is Late Result feeding Early Source\n"); + return true; } - unsigned PredReg, PredRegPos, PredRegFlags; - bool GotPredReg = getPredReg(Cond, PredReg, PredRegPos, PredRegFlags); - (void)GotPredReg; - assert(GotPredReg); - T.addReg(PredReg, PredRegFlags); - while (NOp < NumOps) - T.addOperand(MI->getOperand(NOp++)); + return false; +} - MI->setDesc(get(PredOpc)); - while (unsigned n = MI->getNumOperands()) - MI->RemoveOperand(n-1); - for (unsigned i = 0, n = T->getNumOperands(); i < n; ++i) - MI->addOperand(T->getOperand(i)); - MachineBasicBlock::instr_iterator TI = &*T; - B.erase(TI); +bool HexagonInstrInfo::isLateResultInstr(const MachineInstr *MI) const { + if (!MI) + return false; - MachineRegisterInfo &MRI = B.getParent()->getRegInfo(); - MRI.clearKillFlags(PredReg); + switch (MI->getOpcode()) { + case TargetOpcode::EXTRACT_SUBREG: + case TargetOpcode::INSERT_SUBREG: + case TargetOpcode::SUBREG_TO_REG: + case TargetOpcode::REG_SEQUENCE: + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::COPY: + case TargetOpcode::INLINEASM: + case TargetOpcode::PHI: + return false; + default: + break; + } + unsigned SchedClass = MI->getDesc().getSchedClass(); + + switch (SchedClass) { + case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123: + case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123: + case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123: + case Hexagon::Sched::ALU64_tc_1_SLOT23: + case Hexagon::Sched::EXTENDER_tc_1_SLOT0123: + case Hexagon::Sched::S_2op_tc_1_SLOT23: + case Hexagon::Sched::S_3op_tc_1_SLOT23: + case Hexagon::Sched::V2LDST_tc_ld_SLOT01: + case Hexagon::Sched::V2LDST_tc_st_SLOT0: + case Hexagon::Sched::V2LDST_tc_st_SLOT01: + case Hexagon::Sched::V4LDST_tc_ld_SLOT01: + case Hexagon::Sched::V4LDST_tc_st_SLOT0: + case Hexagon::Sched::V4LDST_tc_st_SLOT01: + return false; + } return true; } -bool -HexagonInstrInfo:: -isProfitableToIfCvt(MachineBasicBlock &MBB, - unsigned NumCycles, - unsigned ExtraPredCycles, - const BranchProbability &Probability) const { - return true; +bool HexagonInstrInfo::isLateSourceInstr(const MachineInstr *MI) const { + if (!MI) + return false; + + // Instructions with iclass A_CVI_VX and attribute A_CVI_LATE uses a multiply + // resource, but all operands can be received late like an ALU instruction. 
+ return MI->getDesc().getSchedClass() == Hexagon::Sched::CVI_VX_LATE; } -bool -HexagonInstrInfo:: -isProfitableToIfCvt(MachineBasicBlock &TMBB, - unsigned NumTCycles, - unsigned ExtraTCycles, - MachineBasicBlock &FMBB, - unsigned NumFCycles, - unsigned ExtraFCycles, - const BranchProbability &Probability) const { - return true; +bool HexagonInstrInfo::isLoopN(const MachineInstr *MI) const { + unsigned Opcode = MI->getOpcode(); + return Opcode == Hexagon::J2_loop0i || + Opcode == Hexagon::J2_loop0r || + Opcode == Hexagon::J2_loop0iext || + Opcode == Hexagon::J2_loop0rext || + Opcode == Hexagon::J2_loop1i || + Opcode == Hexagon::J2_loop1r || + Opcode == Hexagon::J2_loop1iext || + Opcode == Hexagon::J2_loop1rext; +} + + +bool HexagonInstrInfo::isMemOp(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + default: return false; + case Hexagon::L4_iadd_memopw_io : + case Hexagon::L4_isub_memopw_io : + case Hexagon::L4_add_memopw_io : + case Hexagon::L4_sub_memopw_io : + case Hexagon::L4_and_memopw_io : + case Hexagon::L4_or_memopw_io : + case Hexagon::L4_iadd_memoph_io : + case Hexagon::L4_isub_memoph_io : + case Hexagon::L4_add_memoph_io : + case Hexagon::L4_sub_memoph_io : + case Hexagon::L4_and_memoph_io : + case Hexagon::L4_or_memoph_io : + case Hexagon::L4_iadd_memopb_io : + case Hexagon::L4_isub_memopb_io : + case Hexagon::L4_add_memopb_io : + case Hexagon::L4_sub_memopb_io : + case Hexagon::L4_and_memopb_io : + case Hexagon::L4_or_memopb_io : + case Hexagon::L4_ior_memopb_io: + case Hexagon::L4_ior_memoph_io: + case Hexagon::L4_ior_memopw_io: + case Hexagon::L4_iand_memopb_io: + case Hexagon::L4_iand_memoph_io: + case Hexagon::L4_iand_memopw_io: + return true; + } + return false; } -// Returns true if an instruction is predicated irrespective of the predicate -// sense. For example, all of the following will return true. 
-// if (p0) R1 = add(R2, R3) -// if (!p0) R1 = add(R2, R3) -// if (p0.new) R1 = add(R2, R3) -// if (!p0.new) R1 = add(R2, R3) -bool HexagonInstrInfo::isPredicated(const MachineInstr *MI) const { - const uint64_t F = MI->getDesc().TSFlags; - return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); +bool HexagonInstrInfo::isNewValue(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::NewValuePos) & HexagonII::NewValueMask; } -bool HexagonInstrInfo::isPredicated(unsigned Opcode) const { + +bool HexagonInstrInfo::isNewValue(unsigned Opcode) const { const uint64_t F = get(Opcode).TSFlags; + return (F >> HexagonII::NewValuePos) & HexagonII::NewValueMask; +} + - return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); +bool HexagonInstrInfo::isNewValueInst(const MachineInstr *MI) const { + return isNewValueJump(MI) || isNewValueStore(MI); } -bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr *MI) const { - const uint64_t F = MI->getDesc().TSFlags; - assert(isPredicated(MI)); - return (!((F >> HexagonII::PredicatedFalsePos) & - HexagonII::PredicatedFalseMask)); +bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const { + return isNewValue(MI) && MI->isBranch(); } -bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const { + +bool HexagonInstrInfo::isNewValueJump(unsigned Opcode) const { + return isNewValue(Opcode) && get(Opcode).isBranch() && isPredicated(Opcode); +} + + +bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask; +} + + +bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const { const uint64_t F = get(Opcode).TSFlags; + return (F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask; +} - // Make sure that the instruction is predicated. - assert((F>> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); - return (!((F >> HexagonII::PredicatedFalsePos) & - HexagonII::PredicatedFalseMask)); + +// Returns true if a particular operand is extendable for an instruction. +bool HexagonInstrInfo::isOperandExtended(const MachineInstr *MI, + unsigned OperandNum) const { + const uint64_t F = MI->getDesc().TSFlags; + return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask) + == OperandNum; } + +bool HexagonInstrInfo::isPostIncrement(const MachineInstr* MI) const { + return getAddrMode(MI) == HexagonII::PostInc; +} + + bool HexagonInstrInfo::isPredicatedNew(const MachineInstr *MI) const { const uint64_t F = MI->getDesc().TSFlags; - assert(isPredicated(MI)); - return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask); + return (F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask; } + bool HexagonInstrInfo::isPredicatedNew(unsigned Opcode) const { const uint64_t F = get(Opcode).TSFlags; - assert(isPredicated(Opcode)); - return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask); + return (F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask; } -// Returns true, if a ST insn can be promoted to a new-value store. 
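The queries added above (isNewValue, isNewValueStore, isOperandExtended, isPredicatedNew) all follow the same pattern: read a fixed bit field out of the descriptor's TSFlags word with (F >> Pos) & Mask. A self-contained sketch of that packing scheme, with invented positions rather than the real HexagonII constants:

#include <cassert>
#include <cstdint>

// Illustrative TSFlags-style packing: each property occupies a fixed bit
// range of a 64-bit flags word and is read back with (F >> Pos) & Mask,
// exactly like the isNewValue/isPredicatedNew/getAddrMode queries in this
// patch.  The positions below are invented for the example.
namespace demo {
  enum : unsigned { NewValuePos = 3, NewValueMask = 1,
                    AddrModePos = 8, AddrModeMask = 0x7 };
}

static bool isNewValueFlag(uint64_t F) {
  return (F >> demo::NewValuePos) & demo::NewValueMask;
}
static unsigned addrModeField(uint64_t F) {
  return (F >> demo::AddrModePos) & demo::AddrModeMask;
}

int main() {
  uint64_t F = (uint64_t(1) << demo::NewValuePos) |
               (uint64_t(5) << demo::AddrModePos);
  assert(isNewValueFlag(F) && addrModeField(F) == 5);
  return 0;
}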
-bool HexagonInstrInfo::mayBeNewStore(const MachineInstr *MI) const { + +bool HexagonInstrInfo::isPredicatedTrue(const MachineInstr *MI) const { const uint64_t F = MI->getDesc().TSFlags; + return !((F >> HexagonII::PredicatedFalsePos) & + HexagonII::PredicatedFalseMask); +} + - return ((F >> HexagonII::mayNVStorePos) & - HexagonII::mayNVStoreMask); +bool HexagonInstrInfo::isPredicatedTrue(unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + // Make sure that the instruction is predicated. + assert((F>> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); + return !((F >> HexagonII::PredicatedFalsePos) & + HexagonII::PredicatedFalseMask); } -bool -HexagonInstrInfo::DefinesPredicate(MachineInstr *MI, - std::vector<MachineOperand> &Pred) const { - for (unsigned oper = 0; oper < MI->getNumOperands(); ++oper) { - MachineOperand MO = MI->getOperand(oper); - if (MO.isReg() && MO.isDef()) { - const TargetRegisterClass* RC = RI.getMinimalPhysRegClass(MO.getReg()); - if (RC == &Hexagon::PredRegsRegClass) { - Pred.push_back(MO); - return true; - } - } + +bool HexagonInstrInfo::isPredicated(unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + return (F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask; +} + + +bool HexagonInstrInfo::isPredicateLate(unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + return ~(F >> HexagonII::PredicateLatePos) & HexagonII::PredicateLateMask; +} + + +bool HexagonInstrInfo::isPredictedTaken(unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + assert(get(Opcode).isBranch() && + (isPredicatedNew(Opcode) || isNewValue(Opcode))); + return (F >> HexagonII::TakenPos) & HexagonII::TakenMask; +} + + +bool HexagonInstrInfo::isSaveCalleeSavedRegsCall(const MachineInstr *MI) const { + return MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4 || + MI->getOpcode() == Hexagon::SAVE_REGISTERS_CALL_V4_EXT; +} + + +bool HexagonInstrInfo::isSolo(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::SoloPos) & HexagonII::SoloMask; +} + + +bool HexagonInstrInfo::isSpillPredRegOp(const MachineInstr *MI) const { + switch (MI->getOpcode()) { + case Hexagon::STriw_pred : + case Hexagon::LDriw_pred : + return true; + default: + return false; } - return false; } -bool -HexagonInstrInfo:: -SubsumesPredicate(ArrayRef<MachineOperand> Pred1, - ArrayRef<MachineOperand> Pred2) const { - // TODO: Fix this - return false; +// Returns true when SU has a timing class TC1. +bool HexagonInstrInfo::isTC1(const MachineInstr *MI) const { + unsigned SchedClass = MI->getDesc().getSchedClass(); + switch (SchedClass) { + case Hexagon::Sched::ALU32_2op_tc_1_SLOT0123: + case Hexagon::Sched::ALU32_3op_tc_1_SLOT0123: + case Hexagon::Sched::ALU32_ADDI_tc_1_SLOT0123: + case Hexagon::Sched::ALU64_tc_1_SLOT23: + case Hexagon::Sched::EXTENDER_tc_1_SLOT0123: + //case Hexagon::Sched::M_tc_1_SLOT23: + case Hexagon::Sched::S_2op_tc_1_SLOT23: + case Hexagon::Sched::S_3op_tc_1_SLOT23: + return true; + + default: + return false; + } } -// -// We indicate that we want to reverse the branch by -// inserting the reversed branching opcode. 
-// -bool HexagonInstrInfo::ReverseBranchCondition( - SmallVectorImpl<MachineOperand> &Cond) const { - if (Cond.empty()) +bool HexagonInstrInfo::isTC2(const MachineInstr *MI) const { + unsigned SchedClass = MI->getDesc().getSchedClass(); + switch (SchedClass) { + case Hexagon::Sched::ALU32_3op_tc_2_SLOT0123: + case Hexagon::Sched::ALU64_tc_2_SLOT23: + case Hexagon::Sched::CR_tc_2_SLOT3: + case Hexagon::Sched::M_tc_2_SLOT23: + case Hexagon::Sched::S_2op_tc_2_SLOT23: + case Hexagon::Sched::S_3op_tc_2_SLOT23: return true; - assert(Cond[0].isImm() && "First entry in the cond vector not imm-val"); - Opcode_t opcode = Cond[0].getImm(); - //unsigned temp; - assert(get(opcode).isBranch() && "Should be a branching condition."); - if (isEndLoopN(opcode)) + + default: + return false; + } +} + + +bool HexagonInstrInfo::isTC2Early(const MachineInstr *MI) const { + unsigned SchedClass = MI->getDesc().getSchedClass(); + switch (SchedClass) { + case Hexagon::Sched::ALU32_2op_tc_2early_SLOT0123: + case Hexagon::Sched::ALU32_3op_tc_2early_SLOT0123: + case Hexagon::Sched::ALU64_tc_2early_SLOT23: + case Hexagon::Sched::CR_tc_2early_SLOT23: + case Hexagon::Sched::CR_tc_2early_SLOT3: + case Hexagon::Sched::J_tc_2early_SLOT0123: + case Hexagon::Sched::J_tc_2early_SLOT2: + case Hexagon::Sched::J_tc_2early_SLOT23: + case Hexagon::Sched::S_2op_tc_2early_SLOT23: + case Hexagon::Sched::S_3op_tc_2early_SLOT23: return true; - Opcode_t NewOpcode = getInvertedPredicatedOpcode(opcode); - Cond[0].setImm(NewOpcode); - return false; + + default: + return false; + } +} + + +bool HexagonInstrInfo::isTC4x(const MachineInstr *MI) const { + if (!MI) + return false; + + unsigned SchedClass = MI->getDesc().getSchedClass(); + return SchedClass == Hexagon::Sched::M_tc_3or4x_SLOT23; } -bool HexagonInstrInfo:: -isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumInstrs, - const BranchProbability &Probability) const { - return (NumInstrs <= 4); +bool HexagonInstrInfo::isV60VectorInstruction(const MachineInstr *MI) const { + if (!MI) + return false; + + const uint64_t V = getType(MI); + return HexagonII::TypeCVI_FIRST <= V && V <= HexagonII::TypeCVI_LAST; } -bool HexagonInstrInfo::isDeallocRet(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: return false; - case Hexagon::L4_return: - case Hexagon::L4_return_t: - case Hexagon::L4_return_f: - case Hexagon::L4_return_tnew_pnt: - case Hexagon::L4_return_fnew_pnt: - case Hexagon::L4_return_tnew_pt: - case Hexagon::L4_return_fnew_pt: - return true; + +// Check if the Offset is a valid auto-inc imm by Load/Store Type. 
+// +bool HexagonInstrInfo::isValidAutoIncImm(const EVT VT, const int Offset) const { + if (VT == MVT::v16i32 || VT == MVT::v8i64 || + VT == MVT::v32i16 || VT == MVT::v64i8) { + return (Offset >= Hexagon_MEMV_AUTOINC_MIN && + Offset <= Hexagon_MEMV_AUTOINC_MAX && + (Offset & 0x3f) == 0); + } + // 128B + if (VT == MVT::v32i32 || VT == MVT::v16i64 || + VT == MVT::v64i16 || VT == MVT::v128i8) { + return (Offset >= Hexagon_MEMV_AUTOINC_MIN_128B && + Offset <= Hexagon_MEMV_AUTOINC_MAX_128B && + (Offset & 0x7f) == 0); + } + if (VT == MVT::i64) { + return (Offset >= Hexagon_MEMD_AUTOINC_MIN && + Offset <= Hexagon_MEMD_AUTOINC_MAX && + (Offset & 0x7) == 0); + } + if (VT == MVT::i32) { + return (Offset >= Hexagon_MEMW_AUTOINC_MIN && + Offset <= Hexagon_MEMW_AUTOINC_MAX && + (Offset & 0x3) == 0); + } + if (VT == MVT::i16) { + return (Offset >= Hexagon_MEMH_AUTOINC_MIN && + Offset <= Hexagon_MEMH_AUTOINC_MAX && + (Offset & 0x1) == 0); + } + if (VT == MVT::i8) { + return (Offset >= Hexagon_MEMB_AUTOINC_MIN && + Offset <= Hexagon_MEMB_AUTOINC_MAX); } + llvm_unreachable("Not an auto-inc opc!"); } @@ -1222,6 +2329,40 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, // misaligns with respect to load size. switch (Opcode) { + case Hexagon::STriq_pred_V6: + case Hexagon::STriq_pred_vec_V6: + case Hexagon::STriv_pseudo_V6: + case Hexagon::STrivv_pseudo_V6: + case Hexagon::LDriq_pred_V6: + case Hexagon::LDriq_pred_vec_V6: + case Hexagon::LDriv_pseudo_V6: + case Hexagon::LDrivv_pseudo_V6: + case Hexagon::LDrivv_indexed: + case Hexagon::STrivv_indexed: + case Hexagon::V6_vL32b_ai: + case Hexagon::V6_vS32b_ai: + case Hexagon::V6_vL32Ub_ai: + case Hexagon::V6_vS32Ub_ai: + return (Offset >= Hexagon_MEMV_OFFSET_MIN) && + (Offset <= Hexagon_MEMV_OFFSET_MAX); + + case Hexagon::STriq_pred_V6_128B: + case Hexagon::STriq_pred_vec_V6_128B: + case Hexagon::STriv_pseudo_V6_128B: + case Hexagon::STrivv_pseudo_V6_128B: + case Hexagon::LDriq_pred_V6_128B: + case Hexagon::LDriq_pred_vec_V6_128B: + case Hexagon::LDriv_pseudo_V6_128B: + case Hexagon::LDrivv_pseudo_V6_128B: + case Hexagon::LDrivv_indexed_128B: + case Hexagon::STrivv_indexed_128B: + case Hexagon::V6_vL32b_ai_128B: + case Hexagon::V6_vS32b_ai_128B: + case Hexagon::V6_vL32Ub_ai_128B: + case Hexagon::V6_vS32Ub_ai_128B: + return (Offset >= Hexagon_MEMV_OFFSET_MIN_128B) && + (Offset <= Hexagon_MEMV_OFFSET_MAX_128B); + case Hexagon::J2_loop0i: case Hexagon::J2_loop1i: return isUInt<10>(Offset); @@ -1248,8 +2389,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, (Offset <= Hexagon_MEMH_OFFSET_MAX); case Hexagon::L2_loadrb_io: - case Hexagon::S2_storerb_io: case Hexagon::L2_loadrub_io: + case Hexagon::S2_storerb_io: return (Offset >= Hexagon_MEMB_OFFSET_MIN) && (Offset <= Hexagon_MEMB_OFFSET_MAX); @@ -1257,28 +2398,28 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, return (Offset >= Hexagon_ADDI_OFFSET_MIN) && (Offset <= Hexagon_ADDI_OFFSET_MAX); - case Hexagon::L4_iadd_memopw_io: - case Hexagon::L4_isub_memopw_io: - case Hexagon::L4_add_memopw_io: - case Hexagon::L4_sub_memopw_io: - case Hexagon::L4_and_memopw_io: - case Hexagon::L4_or_memopw_io: + case Hexagon::L4_iadd_memopw_io : + case Hexagon::L4_isub_memopw_io : + case Hexagon::L4_add_memopw_io : + case Hexagon::L4_sub_memopw_io : + case Hexagon::L4_and_memopw_io : + case Hexagon::L4_or_memopw_io : return (0 <= Offset && Offset <= 255); - case Hexagon::L4_iadd_memoph_io: - case Hexagon::L4_isub_memoph_io: - case Hexagon::L4_add_memoph_io: - case 
Hexagon::L4_sub_memoph_io: - case Hexagon::L4_and_memoph_io: - case Hexagon::L4_or_memoph_io: + case Hexagon::L4_iadd_memoph_io : + case Hexagon::L4_isub_memoph_io : + case Hexagon::L4_add_memoph_io : + case Hexagon::L4_sub_memoph_io : + case Hexagon::L4_and_memoph_io : + case Hexagon::L4_or_memoph_io : return (0 <= Offset && Offset <= 127); - case Hexagon::L4_iadd_memopb_io: - case Hexagon::L4_isub_memopb_io: - case Hexagon::L4_add_memopb_io: - case Hexagon::L4_sub_memopb_io: - case Hexagon::L4_and_memopb_io: - case Hexagon::L4_or_memopb_io: + case Hexagon::L4_iadd_memopb_io : + case Hexagon::L4_isub_memopb_io : + case Hexagon::L4_add_memopb_io : + case Hexagon::L4_sub_memopb_io : + case Hexagon::L4_and_memopb_io : + case Hexagon::L4_or_memopb_io : return (0 <= Offset && Offset <= 63); // LDri_pred and STriw_pred are pseudo operations, so it has to take offset of @@ -1291,223 +2432,556 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, case Hexagon::TFR_FIA: case Hexagon::INLINEASM: return true; - } + + case Hexagon::L2_ploadrbt_io: + case Hexagon::L2_ploadrbf_io: + case Hexagon::L2_ploadrubt_io: + case Hexagon::L2_ploadrubf_io: + case Hexagon::S2_pstorerbt_io: + case Hexagon::S2_pstorerbf_io: + case Hexagon::S4_storeirb_io: + case Hexagon::S4_storeirbt_io: + case Hexagon::S4_storeirbf_io: + return isUInt<6>(Offset); + + case Hexagon::L2_ploadrht_io: + case Hexagon::L2_ploadrhf_io: + case Hexagon::L2_ploadruht_io: + case Hexagon::L2_ploadruhf_io: + case Hexagon::S2_pstorerht_io: + case Hexagon::S2_pstorerhf_io: + case Hexagon::S4_storeirh_io: + case Hexagon::S4_storeirht_io: + case Hexagon::S4_storeirhf_io: + return isShiftedUInt<6,1>(Offset); + + case Hexagon::L2_ploadrit_io: + case Hexagon::L2_ploadrif_io: + case Hexagon::S2_pstorerit_io: + case Hexagon::S2_pstorerif_io: + case Hexagon::S4_storeiri_io: + case Hexagon::S4_storeirit_io: + case Hexagon::S4_storeirif_io: + return isShiftedUInt<6,2>(Offset); + + case Hexagon::L2_ploadrdt_io: + case Hexagon::L2_ploadrdf_io: + case Hexagon::S2_pstorerdt_io: + case Hexagon::S2_pstorerdf_io: + return isShiftedUInt<6,3>(Offset); + } // switch llvm_unreachable("No offset range is defined for this opcode. " "Please define it in the above switch statement!"); } -// -// Check if the Offset is a valid auto-inc imm by Load/Store Type. 
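The isShiftedUInt<6,S> checks added above encode a 6-bit unsigned offset field scaled by the access size, so the predicated byte/half/word/double forms accept 0..63, 0..126 (even), 0..252 (multiples of 4) and 0..504 (multiples of 8) respectively. A standalone sketch of the same arithmetic (not LLVM's template, just an equivalent check):

#include <cassert>
#include <cstdint>

// Equivalent to the isShiftedUInt<N, S> idea used above: the offset must be
// a multiple of 2^S and the scaled value must fit in an N-bit unsigned field.
static bool isShiftedUIntNS(int64_t Offset, unsigned N, unsigned S) {
  if (Offset < 0 || (Offset & ((int64_t(1) << S) - 1)) != 0)
    return false;
  return (Offset >> S) < (int64_t(1) << N);
}

int main() {
  assert(isShiftedUIntNS(252, 6, 2));   // word form: 6-bit field scaled by 4
  assert(!isShiftedUIntNS(254, 6, 2));  // rejected: not word aligned
  assert(!isShiftedUIntNS(256, 6, 2));  // rejected: 256/4 = 64 needs 7 bits
  return 0;
}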
-// -bool HexagonInstrInfo:: -isValidAutoIncImm(const EVT VT, const int Offset) const { +bool HexagonInstrInfo::isVecAcc(const MachineInstr *MI) const { + return MI && isV60VectorInstruction(MI) && isAccumulator(MI); +} - if (VT == MVT::i64) { - return (Offset >= Hexagon_MEMD_AUTOINC_MIN && - Offset <= Hexagon_MEMD_AUTOINC_MAX && - (Offset & 0x7) == 0); - } - if (VT == MVT::i32) { - return (Offset >= Hexagon_MEMW_AUTOINC_MIN && - Offset <= Hexagon_MEMW_AUTOINC_MAX && - (Offset & 0x3) == 0); - } - if (VT == MVT::i16) { - return (Offset >= Hexagon_MEMH_AUTOINC_MIN && - Offset <= Hexagon_MEMH_AUTOINC_MAX && - (Offset & 0x1) == 0); - } - if (VT == MVT::i8) { - return (Offset >= Hexagon_MEMB_AUTOINC_MIN && - Offset <= Hexagon_MEMB_AUTOINC_MAX); + +bool HexagonInstrInfo::isVecALU(const MachineInstr *MI) const { + if (!MI) + return false; + const uint64_t F = get(MI->getOpcode()).TSFlags; + const uint64_t V = ((F >> HexagonII::TypePos) & HexagonII::TypeMask); + return + V == HexagonII::TypeCVI_VA || + V == HexagonII::TypeCVI_VA_DV; +} + + +bool HexagonInstrInfo::isVecUsableNextPacket(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const { + if (EnableACCForwarding && isVecAcc(ProdMI) && isVecAcc(ConsMI)) + return true; + + if (EnableALUForwarding && (isVecALU(ConsMI) || isLateSourceInstr(ConsMI))) + return true; + + if (mayBeNewStore(ConsMI)) + return true; + + return false; +} + + +/// \brief Can these instructions execute at the same time in a bundle. +bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr *First, + const MachineInstr *Second) const { + if (DisableNVSchedule) + return false; + if (mayBeNewStore(Second)) { + // Make sure the definition of the first instruction is the value being + // stored. + const MachineOperand &Stored = + Second->getOperand(Second->getNumOperands() - 1); + if (!Stored.isReg()) + return false; + for (unsigned i = 0, e = First->getNumOperands(); i < e; ++i) { + const MachineOperand &Op = First->getOperand(i); + if (Op.isReg() && Op.isDef() && Op.getReg() == Stored.getReg()) + return true; + } } - llvm_unreachable("Not an auto-inc opc!"); + return false; +} + + +bool HexagonInstrInfo::hasEHLabel(const MachineBasicBlock *B) const { + for (auto &I : *B) + if (I.isEHLabel()) + return true; + return false; } -bool HexagonInstrInfo:: -isMemOp(const MachineInstr *MI) const { -// return MI->getDesc().mayLoad() && MI->getDesc().mayStore(); - - switch (MI->getOpcode()) - { - default: return false; - case Hexagon::L4_iadd_memopw_io: - case Hexagon::L4_isub_memopw_io: - case Hexagon::L4_add_memopw_io: - case Hexagon::L4_sub_memopw_io: - case Hexagon::L4_and_memopw_io: - case Hexagon::L4_or_memopw_io: - case Hexagon::L4_iadd_memoph_io: - case Hexagon::L4_isub_memoph_io: - case Hexagon::L4_add_memoph_io: - case Hexagon::L4_sub_memoph_io: - case Hexagon::L4_and_memoph_io: - case Hexagon::L4_or_memoph_io: - case Hexagon::L4_iadd_memopb_io: - case Hexagon::L4_isub_memopb_io: - case Hexagon::L4_add_memopb_io: - case Hexagon::L4_sub_memopb_io: - case Hexagon::L4_and_memopb_io: - case Hexagon::L4_or_memopb_io: - case Hexagon::L4_ior_memopb_io: - case Hexagon::L4_ior_memoph_io: - case Hexagon::L4_ior_memopw_io: - case Hexagon::L4_iand_memopb_io: - case Hexagon::L4_iand_memoph_io: - case Hexagon::L4_iand_memopw_io: +// Returns true if an instruction can be converted into a non-extended +// equivalent instruction. 
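canExecuteInBundle above only pairs a producer with a mayBeNewStore consumer when the producer defines the very register whose value the store writes, i.e. the memw(..)=Rt.new idiom. A toy, self-contained restatement of that check with an invented operand model (FakeInstr is not the MachineInstr API):

#include <cassert>
#include <vector>

// Simplified stand-in for the real operand lists: each instruction is just
// the set of registers it defines plus the register whose value it stores
// (0 if it is not a store).
struct FakeInstr {
  std::vector<unsigned> Defs;  // registers this instruction defines
  unsigned StoredReg;          // register whose value it stores, 0 if none
};

// Mirrors the core of canExecuteInBundle: the store may take the .new form
// in the same packet only if the first instruction defines the stored value.
static bool canBundleAsNewValueStore(const FakeInstr &First,
                                     const FakeInstr &Store) {
  if (Store.StoredReg == 0)
    return false;
  for (unsigned R : First.Defs)
    if (R == Store.StoredReg)
      return true;
  return false;
}

int main() {
  FakeInstr Add{{2}, 0};   // r2 = add(r3, r4)
  FakeInstr St{{}, 2};     // memw(r0+#0) = r2  -> r2.new candidate
  assert(canBundleAsNewValueStore(Add, St));
  return 0;
}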
+bool HexagonInstrInfo::hasNonExtEquivalent(const MachineInstr *MI) const { + short NonExtOpcode; + // Check if the instruction has a register form that uses register in place + // of the extended operand, if so return that as the non-extended form. + if (Hexagon::getRegForm(MI->getOpcode()) >= 0) + return true; + + if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) { + // Check addressing mode and retrieve non-ext equivalent instruction. + + switch (getAddrMode(MI)) { + case HexagonII::Absolute : + // Load/store with absolute addressing mode can be converted into + // base+offset mode. + NonExtOpcode = Hexagon::getBaseWithImmOffset(MI->getOpcode()); + break; + case HexagonII::BaseImmOffset : + // Load/store with base+offset addressing mode can be converted into + // base+register offset addressing mode. However left shift operand should + // be set to 0. + NonExtOpcode = Hexagon::getBaseWithRegOffset(MI->getOpcode()); + break; + case HexagonII::BaseLongOffset: + NonExtOpcode = Hexagon::getRegShlForm(MI->getOpcode()); + break; + default: + return false; + } + if (NonExtOpcode < 0) + return false; return true; } return false; } -bool HexagonInstrInfo:: -isSpillPredRegOp(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: return false; - case Hexagon::STriw_pred : - case Hexagon::LDriw_pred : +bool HexagonInstrInfo::hasPseudoInstrPair(const MachineInstr *MI) const { + return Hexagon::getRealHWInstr(MI->getOpcode(), + Hexagon::InstrType_Pseudo) >= 0; +} + + +bool HexagonInstrInfo::hasUncondBranch(const MachineBasicBlock *B) + const { + MachineBasicBlock::const_iterator I = B->getFirstTerminator(), E = B->end(); + while (I != E) { + if (I->isBarrier()) return true; + ++I; } + return false; } -bool HexagonInstrInfo::isNewValueJumpCandidate(const MachineInstr *MI) const { - switch (MI->getOpcode()) { - default: return false; - case Hexagon::C2_cmpeq: - case Hexagon::C2_cmpeqi: - case Hexagon::C2_cmpgt: - case Hexagon::C2_cmpgti: - case Hexagon::C2_cmpgtu: - case Hexagon::C2_cmpgtui: + +// Returns true, if a LD insn can be promoted to a cur load. +bool HexagonInstrInfo::mayBeCurLoad(const MachineInstr *MI) const { + auto &HST = MI->getParent()->getParent()->getSubtarget<HexagonSubtarget>(); + const uint64_t F = MI->getDesc().TSFlags; + return ((F >> HexagonII::mayCVLoadPos) & HexagonII::mayCVLoadMask) && + HST.hasV60TOps(); +} + + +// Returns true, if a ST insn can be promoted to a new-value store. +bool HexagonInstrInfo::mayBeNewStore(const MachineInstr *MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::mayNVStorePos) & HexagonII::mayNVStoreMask; +} + + +bool HexagonInstrInfo::producesStall(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const { + // There is no stall when ProdMI is not a V60 vector. + if (!isV60VectorInstruction(ProdMI)) + return false; + + // There is no stall when ProdMI and ConsMI are not dependent. + if (!isDependent(ProdMI, ConsMI)) + return false; + + // When Forward Scheduling is enabled, there is no stall if ProdMI and ConsMI + // are scheduled in consecutive packets. + if (isVecUsableNextPacket(ProdMI, ConsMI)) + return false; + + return true; +} + + +bool HexagonInstrInfo::producesStall(const MachineInstr *MI, + MachineBasicBlock::const_instr_iterator BII) const { + // There is no stall when I is not a V60 vector. 
+ if (!isV60VectorInstruction(MI)) + return false; + + MachineBasicBlock::const_instr_iterator MII = BII; + MachineBasicBlock::const_instr_iterator MIE = MII->getParent()->instr_end(); + + if (!(*MII).isBundle()) { + const MachineInstr *J = &*MII; + if (!isV60VectorInstruction(J)) + return false; + else if (isVecUsableNextPacket(J, MI)) + return false; + return true; + } + + for (++MII; MII != MIE && MII->isInsideBundle(); ++MII) { + const MachineInstr *J = &*MII; + if (producesStall(J, MI)) return true; } + return false; +} + + +bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr *MI, + unsigned PredReg) const { + for (unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) { + const MachineOperand &MO = MI->getOperand(opNum); + if (MO.isReg() && MO.isDef() && MO.isImplicit() && (MO.getReg() == PredReg)) + return false; // Predicate register must be explicitly defined. + } + + // Hexagon Programmer's Reference says that decbin, memw_locked, and + // memd_locked cannot be used as .new as well, + // but we don't seem to have these instructions defined. + return MI->getOpcode() != Hexagon::A4_tlbmatch; +} + + +bool HexagonInstrInfo::PredOpcodeHasJMP_c(unsigned Opcode) const { + return (Opcode == Hexagon::J2_jumpt) || + (Opcode == Hexagon::J2_jumpf) || + (Opcode == Hexagon::J2_jumptnew) || + (Opcode == Hexagon::J2_jumpfnew) || + (Opcode == Hexagon::J2_jumptnewpt) || + (Opcode == Hexagon::J2_jumpfnewpt); +} + + +bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const { + if (Cond.empty() || !isPredicated(Cond[0].getImm())) + return false; + return !isPredicatedTrue(Cond[0].getImm()); +} + + +unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask; +} + + +// Returns the base register in a memory access (load/store). The offset is +// returned in Offset and the access size is returned in AccessSize. +unsigned HexagonInstrInfo::getBaseAndOffset(const MachineInstr *MI, + int &Offset, unsigned &AccessSize) const { + // Return if it is not a base+offset type instruction or a MemOp. + if (getAddrMode(MI) != HexagonII::BaseImmOffset && + getAddrMode(MI) != HexagonII::BaseLongOffset && + !isMemOp(MI) && !isPostIncrement(MI)) + return 0; + + // Since it is a memory access instruction, getMemAccessSize() should never + // return 0. + assert (getMemAccessSize(MI) && + "BaseImmOffset or BaseLongOffset or MemOp without accessSize"); + + // Return Values of getMemAccessSize() are + // 0 - Checked in the assert above. + // 1, 2, 3, 4 & 7, 8 - The statement below is correct for all these. + // MemAccessSize is represented as 1+log2(N) where N is size in bits. + AccessSize = (1U << (getMemAccessSize(MI) - 1)); + + unsigned basePos = 0, offsetPos = 0; + if (!getBaseAndOffsetPosition(MI, basePos, offsetPos)) + return 0; + + // Post increment updates its EA after the mem access, + // so we need to treat its offset as zero. + if (isPostIncrement(MI)) + Offset = 0; + else { + Offset = MI->getOperand(offsetPos).getImm(); + } + + return MI->getOperand(basePos).getReg(); +} + + +/// Return the position of the base and offset operands for this instruction. +bool HexagonInstrInfo::getBaseAndOffsetPosition(const MachineInstr *MI, + unsigned &BasePos, unsigned &OffsetPos) const { + // Deal with memops first. 
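getBaseAndOffset above recovers the access size in bytes as 1 << (code - 1), assuming the usual encoding where byte, half, word and doubleword accesses are stored as 1..4 in the TSFlags size field. A quick standalone check of that mapping, illustrative only:

#include <cassert>

// Assumed encoding: the memory-access-size field stores 1 + log2(bytes) for
// the scalar sizes, so the byte size is recovered as 1 << (code - 1), which
// is what getBaseAndOffset does above.
static unsigned decodeAccessSize(unsigned Code) { return 1u << (Code - 1); }

int main() {
  assert(decodeAccessSize(1) == 1);  // byte
  assert(decodeAccessSize(2) == 2);  // halfword
  assert(decodeAccessSize(3) == 4);  // word
  assert(decodeAccessSize(4) == 8);  // doubleword
  return 0;
}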
+ if (isMemOp(MI)) { + assert (MI->getOperand(0).isReg() && MI->getOperand(1).isImm() && + "Bad Memop."); + BasePos = 0; + OffsetPos = 1; + } else if (MI->mayStore()) { + BasePos = 0; + OffsetPos = 1; + } else if (MI->mayLoad()) { + BasePos = 1; + OffsetPos = 2; + } else + return false; + + if (isPredicated(MI)) { + BasePos++; + OffsetPos++; + } + if (isPostIncrement(MI)) { + BasePos++; + OffsetPos++; + } + + if (!MI->getOperand(BasePos).isReg() || !MI->getOperand(OffsetPos).isImm()) + return false; + + return true; +} + + +// Inserts branching instructions in reverse order of their occurence. +// e.g. jump_t t1 (i1) +// jump t2 (i2) +// Jumpers = {i2, i1} +SmallVector<MachineInstr*, 2> HexagonInstrInfo::getBranchingInstrs( + MachineBasicBlock& MBB) const { + SmallVector<MachineInstr*, 2> Jumpers; + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::instr_iterator I = MBB.instr_end(); + if (I == MBB.instr_begin()) + return Jumpers; + + // A basic block may looks like this: + // + // [ insn + // EH_LABEL + // insn + // insn + // insn + // EH_LABEL + // insn ] + // + // It has two succs but does not have a terminator + // Don't know how to handle it. + do { + --I; + if (I->isEHLabel()) + return Jumpers; + } while (I != MBB.instr_begin()); + + I = MBB.instr_end(); + --I; + + while (I->isDebugValue()) { + if (I == MBB.instr_begin()) + return Jumpers; + --I; + } + if (!isUnpredicatedTerminator(&*I)) + return Jumpers; + + // Get the last instruction in the block. + MachineInstr *LastInst = &*I; + Jumpers.push_back(LastInst); + MachineInstr *SecondLastInst = nullptr; + // Find one more terminator if present. + do { + if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(&*I)) { + if (!SecondLastInst) { + SecondLastInst = &*I; + Jumpers.push_back(SecondLastInst); + } else // This is a third branch. + return Jumpers; + } + if (I == MBB.instr_begin()) + break; + --I; + } while (true); + return Jumpers; } -bool HexagonInstrInfo:: -isConditionalTransfer (const MachineInstr *MI) const { + +// Returns Operand Index for the constant extended instruction. +unsigned HexagonInstrInfo::getCExtOpNum(const MachineInstr *MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask; +} + +// See if instruction could potentially be a duplex candidate. +// If so, return its group. Zero otherwise. +HexagonII::CompoundGroup HexagonInstrInfo::getCompoundCandidateGroup( + const MachineInstr *MI) const { + unsigned DstReg, SrcReg, Src1Reg, Src2Reg; + switch (MI->getOpcode()) { - default: return false; - case Hexagon::A2_tfrt: - case Hexagon::A2_tfrf: - case Hexagon::C2_cmoveit: - case Hexagon::C2_cmoveif: - case Hexagon::A2_tfrtnew: - case Hexagon::A2_tfrfnew: - case Hexagon::C2_cmovenewit: - case Hexagon::C2_cmovenewif: - return true; + default: + return HexagonII::HCG_None; + // + // Compound pairs. 
+ // "p0=cmp.eq(Rs16,Rt16); if (p0.new) jump:nt #r9:2" + // "Rd16=#U6 ; jump #r9:2" + // "Rd16=Rs16 ; jump #r9:2" + // + case Hexagon::C2_cmpeq: + case Hexagon::C2_cmpgt: + case Hexagon::C2_cmpgtu: + DstReg = MI->getOperand(0).getReg(); + Src1Reg = MI->getOperand(1).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (Hexagon::PredRegsRegClass.contains(DstReg) && + (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && + isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg)) + return HexagonII::HCG_A; + break; + case Hexagon::C2_cmpeqi: + case Hexagon::C2_cmpgti: + case Hexagon::C2_cmpgtui: + // P0 = cmp.eq(Rs,#u2) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (Hexagon::PredRegsRegClass.contains(DstReg) && + (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && + isIntRegForSubInst(SrcReg) && MI->getOperand(2).isImm() && + ((isUInt<5>(MI->getOperand(2).getImm())) || + (MI->getOperand(2).getImm() == -1))) + return HexagonII::HCG_A; + break; + case Hexagon::A2_tfr: + // Rd = Rs + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg)) + return HexagonII::HCG_A; + break; + case Hexagon::A2_tfrsi: + // Rd = #u6 + // Do not test for #u6 size since the const is getting extended + // regardless and compound could be formed. + DstReg = MI->getOperand(0).getReg(); + if (isIntRegForSubInst(DstReg)) + return HexagonII::HCG_A; + break; + case Hexagon::S2_tstbit_i: + DstReg = MI->getOperand(0).getReg(); + Src1Reg = MI->getOperand(1).getReg(); + if (Hexagon::PredRegsRegClass.contains(DstReg) && + (Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && + MI->getOperand(2).isImm() && + isIntRegForSubInst(Src1Reg) && (MI->getOperand(2).getImm() == 0)) + return HexagonII::HCG_A; + break; + // The fact that .new form is used pretty much guarantees + // that predicate register will match. Nevertheless, + // there could be some false positives without additional + // checking. + case Hexagon::J2_jumptnew: + case Hexagon::J2_jumpfnew: + case Hexagon::J2_jumptnewpt: + case Hexagon::J2_jumpfnewpt: + Src1Reg = MI->getOperand(0).getReg(); + if (Hexagon::PredRegsRegClass.contains(Src1Reg) && + (Hexagon::P0 == Src1Reg || Hexagon::P1 == Src1Reg)) + return HexagonII::HCG_B; + break; + // Transfer and jump: + // Rd=#U6 ; jump #r9:2 + // Rd=Rs ; jump #r9:2 + // Do not test for jump range here. + case Hexagon::J2_jump: + case Hexagon::RESTORE_DEALLOC_RET_JMP_V4: + return HexagonII::HCG_C; + break; } + + return HexagonII::HCG_None; +} + + +// Returns -1 when there is no opcode found. 
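Taken together with getCompoundOpcode below, the grouping above expresses a simple pairing rule: an HCG_A compare that defines p0 or p1 may fuse with an HCG_B ".new" jump guarded by the same predicate. A toy model of that rule, with invented types (ToyInsn, canFormCompound) standing in for the real MachineInstr checks:

#include <cassert>
#include <string>

// Toy model of the compound-pair rule: an HCG_A compare that defines p0/p1
// can fuse with an HCG_B ".new" jump on the same predicate, giving the
// "cmp.eq(Rs,#U6); if (p0.new) jump:nt" compound.  The real logic lives in
// getCompoundCandidateGroup/getCompoundOpcode.
enum Group { HCG_None, HCG_A, HCG_B, HCG_C };

struct ToyInsn {
  Group G;
  std::string PredReg;   // "p0" or "p1" for the A/B groups
};

static bool canFormCompound(const ToyInsn &Cmp, const ToyInsn &Jump) {
  return Cmp.G == HCG_A && Jump.G == HCG_B && Cmp.PredReg == Jump.PredReg;
}

int main() {
  ToyInsn Cmp{HCG_A, "p0"};    // p0 = cmp.eq(r7, #12)
  ToyInsn Jmp{HCG_B, "p0"};    // if (p0.new) jump:nt target
  assert(canFormCompound(Cmp, Jmp));
  return 0;
}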
+unsigned HexagonInstrInfo::getCompoundOpcode(const MachineInstr *GA, + const MachineInstr *GB) const { + assert(getCompoundCandidateGroup(GA) == HexagonII::HCG_A); + assert(getCompoundCandidateGroup(GB) == HexagonII::HCG_B); + if ((GA->getOpcode() != Hexagon::C2_cmpeqi) || + (GB->getOpcode() != Hexagon::J2_jumptnew)) + return -1; + unsigned DestReg = GA->getOperand(0).getReg(); + if (!GB->readsRegister(DestReg)) + return -1; + if (DestReg == Hexagon::P0) + return Hexagon::J4_cmpeqi_tp0_jump_nt; + if (DestReg == Hexagon::P1) + return Hexagon::J4_cmpeqi_tp1_jump_nt; + return -1; } -bool HexagonInstrInfo::isConditionalALU32 (const MachineInstr* MI) const { - switch (MI->getOpcode()) - { - default: return false; - case Hexagon::A2_paddf: - case Hexagon::A2_paddfnew: - case Hexagon::A2_paddt: - case Hexagon::A2_paddtnew: - case Hexagon::A2_pandf: - case Hexagon::A2_pandfnew: - case Hexagon::A2_pandt: - case Hexagon::A2_pandtnew: - case Hexagon::A4_paslhf: - case Hexagon::A4_paslhfnew: - case Hexagon::A4_paslht: - case Hexagon::A4_paslhtnew: - case Hexagon::A4_pasrhf: - case Hexagon::A4_pasrhfnew: - case Hexagon::A4_pasrht: - case Hexagon::A4_pasrhtnew: - case Hexagon::A2_porf: - case Hexagon::A2_porfnew: - case Hexagon::A2_port: - case Hexagon::A2_portnew: - case Hexagon::A2_psubf: - case Hexagon::A2_psubfnew: - case Hexagon::A2_psubt: - case Hexagon::A2_psubtnew: - case Hexagon::A2_pxorf: - case Hexagon::A2_pxorfnew: - case Hexagon::A2_pxort: - case Hexagon::A2_pxortnew: - case Hexagon::A4_psxthf: - case Hexagon::A4_psxthfnew: - case Hexagon::A4_psxtht: - case Hexagon::A4_psxthtnew: - case Hexagon::A4_psxtbf: - case Hexagon::A4_psxtbfnew: - case Hexagon::A4_psxtbt: - case Hexagon::A4_psxtbtnew: - case Hexagon::A4_pzxtbf: - case Hexagon::A4_pzxtbfnew: - case Hexagon::A4_pzxtbt: - case Hexagon::A4_pzxtbtnew: - case Hexagon::A4_pzxthf: - case Hexagon::A4_pzxthfnew: - case Hexagon::A4_pzxtht: - case Hexagon::A4_pzxthtnew: - case Hexagon::A2_paddit: - case Hexagon::A2_paddif: - case Hexagon::C2_ccombinewt: - case Hexagon::C2_ccombinewf: - return true; + +int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const { + enum Hexagon::PredSense inPredSense; + inPredSense = invertPredicate ? Hexagon::PredSense_false : + Hexagon::PredSense_true; + int CondOpcode = Hexagon::getPredOpcode(Opc, inPredSense); + if (CondOpcode >= 0) // Valid Conditional opcode/instruction + return CondOpcode; + + // This switch case will be removed once all the instructions have been + // modified to use relation maps. + switch(Opc) { + case Hexagon::TFRI_f: + return !invertPredicate ? 
Hexagon::TFRI_cPt_f : + Hexagon::TFRI_cNotPt_f; } + + llvm_unreachable("Unexpected predicable instruction"); } -bool HexagonInstrInfo:: -isConditionalLoad (const MachineInstr* MI) const { - switch (MI->getOpcode()) - { - default: return false; - case Hexagon::L2_ploadrdt_io : - case Hexagon::L2_ploadrdf_io: - case Hexagon::L2_ploadrit_io: - case Hexagon::L2_ploadrif_io: - case Hexagon::L2_ploadrht_io: - case Hexagon::L2_ploadrhf_io: - case Hexagon::L2_ploadrbt_io: - case Hexagon::L2_ploadrbf_io: - case Hexagon::L2_ploadruht_io: - case Hexagon::L2_ploadruhf_io: - case Hexagon::L2_ploadrubt_io: - case Hexagon::L2_ploadrubf_io: - case Hexagon::L2_ploadrdt_pi: - case Hexagon::L2_ploadrdf_pi: - case Hexagon::L2_ploadrit_pi: - case Hexagon::L2_ploadrif_pi: - case Hexagon::L2_ploadrht_pi: - case Hexagon::L2_ploadrhf_pi: - case Hexagon::L2_ploadrbt_pi: - case Hexagon::L2_ploadrbf_pi: - case Hexagon::L2_ploadruht_pi: - case Hexagon::L2_ploadruhf_pi: - case Hexagon::L2_ploadrubt_pi: - case Hexagon::L2_ploadrubf_pi: - case Hexagon::L4_ploadrdt_rr: - case Hexagon::L4_ploadrdf_rr: - case Hexagon::L4_ploadrbt_rr: - case Hexagon::L4_ploadrbf_rr: - case Hexagon::L4_ploadrubt_rr: - case Hexagon::L4_ploadrubf_rr: - case Hexagon::L4_ploadrht_rr: - case Hexagon::L4_ploadrhf_rr: - case Hexagon::L4_ploadruht_rr: - case Hexagon::L4_ploadruhf_rr: - case Hexagon::L4_ploadrit_rr: - case Hexagon::L4_ploadrif_rr: - return true; + +// Return the cur value instruction for a given store. +int HexagonInstrInfo::getDotCurOp(const MachineInstr* MI) const { + switch (MI->getOpcode()) { + default: llvm_unreachable("Unknown .cur type"); + case Hexagon::V6_vL32b_pi: + return Hexagon::V6_vL32b_cur_pi; + case Hexagon::V6_vL32b_ai: + return Hexagon::V6_vL32b_cur_ai; + //128B + case Hexagon::V6_vL32b_pi_128B: + return Hexagon::V6_vL32b_cur_pi_128B; + case Hexagon::V6_vL32b_ai_128B: + return Hexagon::V6_vL32b_cur_ai_128B; } + return 0; } -// Returns true if an instruction is a conditional store. -// -// Note: It doesn't include conditional new-value stores as they can't be -// converted to .new predicate. + + +// The diagram below shows the steps involved in the conversion of a predicated +// store instruction to its .new predicated new-value form. // // p.new NV store [ if(p0.new)memw(R0+#0)=R2.new ] // ^ ^ @@ -1524,8 +2998,6 @@ isConditionalLoad (const MachineInstr* MI) const { // p.old store // [if (p0)memw(R0+#0)=R2] // -// The above diagram shows the steps involoved in the conversion of a predicated -// store instruction to its .new predicated new-value form. // // The following set of instructions further explains the scenario where // conditional new-value store becomes invalid when promoted to .new predicate @@ -1538,105 +3010,33 @@ isConditionalLoad (const MachineInstr* MI) const { // the first two instructions because in instr 1, r0 is conditional on old value // of p0 but its use in instr 3 is conditional on p0 modified by instr 2 which // is not valid for new-value stores. 
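The scenario described above can be restated as a predicate-consistency rule: the store may only take the .new form if the producer of the stored register is guarded by the same definition of the predicate as the store itself. A self-contained toy version of that rule (GuardedDef and its fields are invented for illustration):

#include <cassert>

// Toy restatement of the constraint: promoting
//   if (p0) memw(Rx+#0) = Rt   to   if (p0) memw(Rx+#0) = Rt.new
// requires that the producer of Rt is guarded by the same definition of p0
// as the store; if another instruction redefines p0 in between (instr 2 in
// the comment's scenario), the promotion is invalid.
struct GuardedDef {
  bool Predicated;     // is the instruction guarded by a predicate?
  unsigned PredDefId;  // which definition of the predicate guards it
};

static bool newValuePromotionLegal(const GuardedDef &Producer,
                                   const GuardedDef &Store) {
  if (!Producer.Predicated || !Store.Predicated)
    return true;                              // nothing to disagree about
  return Producer.PredDefId == Store.PredDefId;
}

int main() {
  GuardedDef Producer{true, 1};  // instr 1: if (p0) r0 = ...   (old p0)
  GuardedDef Store{true, 2};     // instr 3: if (p0) memw(..)=r0 (new p0)
  assert(!newValuePromotionLegal(Producer, Store));  // p0 redefined by instr 2
  return 0;
}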
-bool HexagonInstrInfo:: -isConditionalStore (const MachineInstr* MI) const { - switch (MI->getOpcode()) - { - default: return false; - case Hexagon::S4_storeirbt_io: - case Hexagon::S4_storeirbf_io: - case Hexagon::S4_pstorerbt_rr: - case Hexagon::S4_pstorerbf_rr: - case Hexagon::S2_pstorerbt_io: - case Hexagon::S2_pstorerbf_io: - case Hexagon::S2_pstorerbt_pi: - case Hexagon::S2_pstorerbf_pi: - case Hexagon::S2_pstorerdt_io: - case Hexagon::S2_pstorerdf_io: - case Hexagon::S4_pstorerdt_rr: - case Hexagon::S4_pstorerdf_rr: - case Hexagon::S2_pstorerdt_pi: - case Hexagon::S2_pstorerdf_pi: - case Hexagon::S2_pstorerht_io: - case Hexagon::S2_pstorerhf_io: - case Hexagon::S4_storeirht_io: - case Hexagon::S4_storeirhf_io: - case Hexagon::S4_pstorerht_rr: - case Hexagon::S4_pstorerhf_rr: - case Hexagon::S2_pstorerht_pi: - case Hexagon::S2_pstorerhf_pi: - case Hexagon::S2_pstorerit_io: - case Hexagon::S2_pstorerif_io: - case Hexagon::S4_storeirit_io: - case Hexagon::S4_storeirif_io: - case Hexagon::S4_pstorerit_rr: - case Hexagon::S4_pstorerif_rr: - case Hexagon::S2_pstorerit_pi: - case Hexagon::S2_pstorerif_pi: - - // V4 global address store before promoting to dot new. - case Hexagon::S4_pstorerdt_abs: - case Hexagon::S4_pstorerdf_abs: - case Hexagon::S4_pstorerbt_abs: - case Hexagon::S4_pstorerbf_abs: - case Hexagon::S4_pstorerht_abs: - case Hexagon::S4_pstorerhf_abs: - case Hexagon::S4_pstorerit_abs: - case Hexagon::S4_pstorerif_abs: - return true; - - // Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded - // from the "Conditional Store" list. Because a predicated new value store - // would NOT be promoted to a double dot new store. See diagram below: - // This function returns yes for those stores that are predicated but not - // yet promoted to predicate dot new instructions. - // - // +---------------------+ - // /-----| if (p0) memw(..)=r0 |---------\~ - // || +---------------------+ || - // promote || /\ /\ || promote - // || /||\ /||\ || - // \||/ demote || \||/ - // \/ || || \/ - // +-------------------------+ || +-------------------------+ - // | if (p0.new) memw(..)=r0 | || | if (p0) memw(..)=r0.new | - // +-------------------------+ || +-------------------------+ - // || || || - // || demote \||/ - // promote || \/ NOT possible - // || || /\~ - // \||/ || /||\~ - // \/ || || - // +-----------------------------+ - // | if (p0.new) memw(..)=r0.new | - // +-----------------------------+ - // Double Dot New Store - // - } -} - - -bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const { - if (isNewValue(MI) && isBranch(MI)) - return true; - return false; -} - -bool HexagonInstrInfo::isNewValueJump(Opcode_t Opcode) const { - return isNewValue(Opcode) && get(Opcode).isBranch() && isPredicated(Opcode); -} - -bool HexagonInstrInfo::isPostIncrement (const MachineInstr* MI) const { - return (getAddrMode(MI) == HexagonII::PostInc); -} - -// Returns true, if any one of the operands is a dot new -// insn, whether it is predicated dot new or register dot new. -bool HexagonInstrInfo::isDotNewInst (const MachineInstr* MI) const { - return (isNewValueInst(MI) || - (isPredicated(MI) && isPredicatedNew(MI))); -} - +// Predicated new value stores (i.e. if (p0) memw(..)=r0.new) are excluded +// from the "Conditional Store" list. Because a predicated new value store +// would NOT be promoted to a double dot new store. See diagram below: +// This function returns yes for those stores that are predicated but not +// yet promoted to predicate dot new instructions. 
+// +// +---------------------+ +// /-----| if (p0) memw(..)=r0 |---------\~ +// || +---------------------+ || +// promote || /\ /\ || promote +// || /||\ /||\ || +// \||/ demote || \||/ +// \/ || || \/ +// +-------------------------+ || +-------------------------+ +// | if (p0.new) memw(..)=r0 | || | if (p0) memw(..)=r0.new | +// +-------------------------+ || +-------------------------+ +// || || || +// || demote \||/ +// promote || \/ NOT possible +// || || /\~ +// \||/ || /||\~ +// \/ || || +// +-----------------------------+ +// | if (p0.new) memw(..)=r0.new | +// +-----------------------------+ +// Double Dot New Store +// // Returns the most basic instruction for the .new predicated instructions and // new-value stores. // For example, all of the following instructions will be converted back to the @@ -1645,24 +3045,23 @@ bool HexagonInstrInfo::isDotNewInst (const MachineInstr* MI) const { // 2) if (p0) memw(R0+#0)= R1.new -------> if (p0) memw(R0+#0) = R1 // 3) if (p0.new) memw(R0+#0) = R1 ---> // +// To understand the translation of instruction 1 to its original form, consider +// a packet with 3 instructions. +// { p0 = cmp.eq(R0,R1) +// if (p0.new) R2 = add(R3, R4) +// R5 = add (R3, R1) +// } +// if (p0) memw(R5+#0) = R2 <--- trying to include it in the previous packet +// +// This instruction can be part of the previous packet only if both p0 and R2 +// are promoted to .new values. This promotion happens in steps, first +// predicate register is promoted to .new and in the next iteration R2 is +// promoted. Therefore, in case of dependence check failure (due to R5) during +// next iteration, it should be converted back to its most basic form. -int HexagonInstrInfo::GetDotOldOp(const int opc) const { - int NewOp = opc; - if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form - NewOp = Hexagon::getPredOldOpcode(NewOp); - assert(NewOp >= 0 && - "Couldn't change predicate new instruction to its old form."); - } - - if (isNewValueStore(NewOp)) { // Convert into non-new-value format - NewOp = Hexagon::getNonNVStore(NewOp); - assert(NewOp >= 0 && "Couldn't change new-value store to its old form."); - } - return NewOp; -} // Return the new value instruction for a given store. -int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const { +int HexagonInstrInfo::getDotNewOp(const MachineInstr* MI) const { int NVOpcode = Hexagon::getNewValueOpcode(MI->getOpcode()); if (NVOpcode >= 0) // Valid new-value store instruction. return NVOpcode; @@ -1672,12 +3071,6 @@ int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const { case Hexagon::S4_storerb_ur: return Hexagon::S4_storerbnew_ur; - case Hexagon::S4_storerh_ur: - return Hexagon::S4_storerhnew_ur; - - case Hexagon::S4_storeri_ur: - return Hexagon::S4_storerinew_ur; - case Hexagon::S2_storerb_pci: return Hexagon::S2_storerb_pci; @@ -1692,203 +3085,496 @@ int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const { case Hexagon::S2_storerf_pci: return Hexagon::S2_storerf_pci; + + case Hexagon::V6_vS32b_ai: + return Hexagon::V6_vS32b_new_ai; + + case Hexagon::V6_vS32b_pi: + return Hexagon::V6_vS32b_new_pi; + + // 128B + case Hexagon::V6_vS32b_ai_128B: + return Hexagon::V6_vS32b_new_ai_128B; + + case Hexagon::V6_vS32b_pi_128B: + return Hexagon::V6_vS32b_new_pi_128B; } return 0; } -// Return .new predicate version for an instruction. 
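The conversion described above only works because promotion and demotion are inverse opcode maps: every .new store produced by getDotNewOp must be taken back to exactly one base opcode by getDotOldOp when packetization fails. A toy consistency check of that round trip, using two opcode pairs named in the patch but plain strings instead of opcode enums:

#include <cassert>
#include <map>
#include <string>

// Toy illustration of the promote/demote pairing that getDotNewOp and
// getDotOldOp rely on: a failed packetization attempt can always be undone
// because demote(promote(x)) == x for every store opcode.
static const std::map<std::string, std::string> NewToOld = {
  {"S4_storerbnew_ur", "S4_storerb_ur"},
  {"V6_vS32b_new_ai",  "V6_vS32b_ai"},
};
static const std::map<std::string, std::string> OldToNew = {
  {"S4_storerb_ur", "S4_storerbnew_ur"},
  {"V6_vS32b_ai",   "V6_vS32b_new_ai"},
};

int main() {
  for (const auto &P : OldToNew)
    assert(NewToOld.at(P.second) == P.first);   // round trip is lossless
  return 0;
}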
-int HexagonInstrInfo::GetDotNewPredOp(MachineInstr *MI, - const MachineBranchProbabilityInfo - *MBPI) const { +// Returns the opcode to use when converting MI, which is a conditional jump, +// into a conditional instruction which uses the .new value of the predicate. +// We also use branch probabilities to add a hint to the jump. +int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr *MI, + const MachineBranchProbabilityInfo *MBPI) const { + // We assume that block can have at most two successors. + bool taken = false; + const MachineBasicBlock *Src = MI->getParent(); + const MachineOperand *BrTarget = &MI->getOperand(1); + const MachineBasicBlock *Dst = BrTarget->getMBB(); + const BranchProbability Prediction = MBPI->getEdgeProbability(Src, Dst); + if (Prediction >= BranchProbability(1,2)) + taken = true; + + switch (MI->getOpcode()) { + case Hexagon::J2_jumpt: + return taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew; + case Hexagon::J2_jumpf: + return taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew; + + default: + llvm_unreachable("Unexpected jump instruction."); + } +} + + +// Return .new predicate version for an instruction. +int HexagonInstrInfo::getDotNewPredOp(const MachineInstr *MI, + const MachineBranchProbabilityInfo *MBPI) const { int NewOpcode = Hexagon::getPredNewOpcode(MI->getOpcode()); if (NewOpcode >= 0) // Valid predicate new instruction return NewOpcode; switch (MI->getOpcode()) { - default: llvm_unreachable("Unknown .new type"); // Condtional Jumps case Hexagon::J2_jumpt: case Hexagon::J2_jumpf: return getDotNewPredJumpOp(MI, MBPI); - case Hexagon::J2_jumprt: - return Hexagon::J2_jumptnewpt; - - case Hexagon::J2_jumprf: - return Hexagon::J2_jumprfnewpt; - - case Hexagon::JMPrett: - return Hexagon::J2_jumprtnewpt; - - case Hexagon::JMPretf: - return Hexagon::J2_jumprfnewpt; - - - // Conditional combine - case Hexagon::C2_ccombinewt: - return Hexagon::C2_ccombinewnewt; - case Hexagon::C2_ccombinewf: - return Hexagon::C2_ccombinewnewf; + default: + assert(0 && "Unknown .new type"); } + return 0; } -unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const { - const uint64_t F = MI->getDesc().TSFlags; +int HexagonInstrInfo::getDotOldOp(const int opc) const { + int NewOp = opc; + if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form + NewOp = Hexagon::getPredOldOpcode(NewOp); + assert(NewOp >= 0 && + "Couldn't change predicate new instruction to its old form."); + } - return((F >> HexagonII::AddrModePos) & HexagonII::AddrModeMask); + if (isNewValueStore(NewOp)) { // Convert into non-new-value format + NewOp = Hexagon::getNonNVStore(NewOp); + assert(NewOp >= 0 && "Couldn't change new-value store to its old form."); + } + return NewOp; } -/// immediateExtend - Changes the instruction in place to one using an immediate -/// extender. -void HexagonInstrInfo::immediateExtend(MachineInstr *MI) const { - assert((isExtendable(MI)||isConstExtended(MI)) && - "Instruction must be extendable"); - // Find which operand is extendable. - short ExtOpNum = getCExtOpNum(MI); - MachineOperand &MO = MI->getOperand(ExtOpNum); - // This needs to be something we understand. - assert((MO.isMBB() || MO.isImm()) && - "Branch with unknown extendable field type"); - // Mark given operand as extended. 
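getDotNewPredJumpOp above picks the :t (taken) form whenever the edge probability to the branch target is at least one half, and the plain .new form otherwise. A standalone sketch of that threshold with a plain numerator/denominator pair instead of BranchProbability (useTakenHint is an invented name):

#include <cassert>

// Toy version of the hint selection: probability >= 1/2 selects the ":t"
// (taken) jump form, anything lower keeps the default form.
static bool useTakenHint(unsigned Num, unsigned Den) {
  return 2 * Num >= Den;              // Prob >= 1/2
}

int main() {
  assert(useTakenHint(3, 4));         // 75%  -> e.g. J2_jumptnewpt
  assert(!useTakenHint(1, 4));        // 25%  -> e.g. J2_jumptnew
  return 0;
}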
- MO.addTargetFlag(HexagonII::HMOTF_ConstExtended); -} -DFAPacketizer *HexagonInstrInfo::CreateTargetScheduleState( - const TargetSubtargetInfo &STI) const { - const InstrItineraryData *II = STI.getInstrItineraryData(); - return static_cast<const HexagonSubtarget &>(STI).createDFAPacketizer(II); -} - -bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr *MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const { - // Debug info is never a scheduling boundary. It's necessary to be explicit - // due to the special treatment of IT instructions below, otherwise a - // dbg_value followed by an IT will result in the IT instruction being - // considered a scheduling hazard, which is wrong. It should be the actual - // instruction preceding the dbg_value instruction(s), just like it is - // when debug info is not present. - if (MI->isDebugValue()) - return false; +// See if instruction could potentially be a duplex candidate. +// If so, return its group. Zero otherwise. +HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup( + const MachineInstr *MI) const { + unsigned DstReg, SrcReg, Src1Reg, Src2Reg; + auto &HRI = getRegisterInfo(); - // Terminators and labels can't be scheduled around. - if (MI->getDesc().isTerminator() || MI->isPosition() || MI->isInlineAsm()) - return true; + switch (MI->getOpcode()) { + default: + return HexagonII::HSIG_None; + // + // Group L1: + // + // Rd = memw(Rs+#u4:2) + // Rd = memub(Rs+#u4:0) + case Hexagon::L2_loadri_io: + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + // Special case this one from Group L2. + // Rd = memw(r29+#u5:2) + if (isIntRegForSubInst(DstReg)) { + if (Hexagon::IntRegsRegClass.contains(SrcReg) && + HRI.getStackRegister() == SrcReg && + MI->getOperand(2).isImm() && + isShiftedUInt<5,2>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L2; + // Rd = memw(Rs+#u4:2) + if (isIntRegForSubInst(SrcReg) && + (MI->getOperand(2).isImm() && + isShiftedUInt<4,2>(MI->getOperand(2).getImm()))) + return HexagonII::HSIG_L1; + } + break; + case Hexagon::L2_loadrub_io: + // Rd = memub(Rs+#u4:0) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && isUInt<4>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L1; + break; + // + // Group L2: + // + // Rd = memh/memuh(Rs+#u3:1) + // Rd = memb(Rs+#u3:0) + // Rd = memw(r29+#u5:2) - Handled above. 
+ // Rdd = memd(r29+#u5:3) + // deallocframe + // [if ([!]p0[.new])] dealloc_return + // [if ([!]p0[.new])] jumpr r31 + case Hexagon::L2_loadrh_io: + case Hexagon::L2_loadruh_io: + // Rd = memh/memuh(Rs+#u3:1) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && + isShiftedUInt<3,1>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L2; + break; + case Hexagon::L2_loadrb_io: + // Rd = memb(Rs+#u3:0) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && + isUInt<3>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L2; + break; + case Hexagon::L2_loadrd_io: + // Rdd = memd(r29+#u5:3) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isDblRegForSubInst(DstReg, HRI) && + Hexagon::IntRegsRegClass.contains(SrcReg) && + HRI.getStackRegister() == SrcReg && + MI->getOperand(2).isImm() && + isShiftedUInt<5,3>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_L2; + break; + // dealloc_return is not documented in Hexagon Manual, but marked + // with A_SUBINSN attribute in iset_v4classic.py. + case Hexagon::RESTORE_DEALLOC_RET_JMP_V4: + case Hexagon::L4_return: + case Hexagon::L2_deallocframe: + return HexagonII::HSIG_L2; + case Hexagon::EH_RETURN_JMPR: + case Hexagon::JMPret : + // jumpr r31 + // Actual form JMPR %PC<imp-def>, %R31<imp-use>, %R0<imp-use,internal>. + DstReg = MI->getOperand(0).getReg(); + if (Hexagon::IntRegsRegClass.contains(DstReg) && (Hexagon::R31 == DstReg)) + return HexagonII::HSIG_L2; + break; + case Hexagon::JMPrett: + case Hexagon::JMPretf: + case Hexagon::JMPrettnewpt: + case Hexagon::JMPretfnewpt : + case Hexagon::JMPrettnew : + case Hexagon::JMPretfnew : + DstReg = MI->getOperand(1).getReg(); + SrcReg = MI->getOperand(0).getReg(); + // [if ([!]p0[.new])] jumpr r31 + if ((Hexagon::PredRegsRegClass.contains(SrcReg) && + (Hexagon::P0 == SrcReg)) && + (Hexagon::IntRegsRegClass.contains(DstReg) && (Hexagon::R31 == DstReg))) + return HexagonII::HSIG_L2; + break; + case Hexagon::L4_return_t : + case Hexagon::L4_return_f : + case Hexagon::L4_return_tnew_pnt : + case Hexagon::L4_return_fnew_pnt : + case Hexagon::L4_return_tnew_pt : + case Hexagon::L4_return_fnew_pt : + // [if ([!]p0[.new])] dealloc_return + SrcReg = MI->getOperand(0).getReg(); + if (Hexagon::PredRegsRegClass.contains(SrcReg) && (Hexagon::P0 == SrcReg)) + return HexagonII::HSIG_L2; + break; + // + // Group S1: + // + // memw(Rs+#u4:2) = Rt + // memb(Rs+#u4:0) = Rt + case Hexagon::S2_storeri_io: + // Special case this one from Group S2. 
+ // memw(r29+#u5:2) = Rt + Src1Reg = MI->getOperand(0).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (Hexagon::IntRegsRegClass.contains(Src1Reg) && + isIntRegForSubInst(Src2Reg) && + HRI.getStackRegister() == Src1Reg && MI->getOperand(1).isImm() && + isShiftedUInt<5,2>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S2; + // memw(Rs+#u4:2) = Rt + if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) && + MI->getOperand(1).isImm() && + isShiftedUInt<4,2>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S1; + break; + case Hexagon::S2_storerb_io: + // memb(Rs+#u4:0) = Rt + Src1Reg = MI->getOperand(0).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) && + MI->getOperand(1).isImm() && isUInt<4>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S1; + break; + // + // Group S2: + // + // memh(Rs+#u3:1) = Rt + // memw(r29+#u5:2) = Rt + // memd(r29+#s6:3) = Rtt + // memw(Rs+#u4:2) = #U1 + // memb(Rs+#u4) = #U1 + // allocframe(#u5:3) + case Hexagon::S2_storerh_io: + // memh(Rs+#u3:1) = Rt + Src1Reg = MI->getOperand(0).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (isIntRegForSubInst(Src1Reg) && isIntRegForSubInst(Src2Reg) && + MI->getOperand(1).isImm() && + isShiftedUInt<3,1>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S1; + break; + case Hexagon::S2_storerd_io: + // memd(r29+#s6:3) = Rtt + Src1Reg = MI->getOperand(0).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (isDblRegForSubInst(Src2Reg, HRI) && + Hexagon::IntRegsRegClass.contains(Src1Reg) && + HRI.getStackRegister() == Src1Reg && MI->getOperand(1).isImm() && + isShiftedInt<6,3>(MI->getOperand(1).getImm())) + return HexagonII::HSIG_S2; + break; + case Hexagon::S4_storeiri_io: + // memw(Rs+#u4:2) = #U1 + Src1Reg = MI->getOperand(0).getReg(); + if (isIntRegForSubInst(Src1Reg) && MI->getOperand(1).isImm() && + isShiftedUInt<4,2>(MI->getOperand(1).getImm()) && + MI->getOperand(2).isImm() && isUInt<1>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_S2; + break; + case Hexagon::S4_storeirb_io: + // memb(Rs+#u4) = #U1 + Src1Reg = MI->getOperand(0).getReg(); + if (isIntRegForSubInst(Src1Reg) && MI->getOperand(1).isImm() && + isUInt<4>(MI->getOperand(1).getImm()) && MI->getOperand(2).isImm() && + MI->getOperand(2).isImm() && isUInt<1>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_S2; + break; + case Hexagon::S2_allocframe: + if (MI->getOperand(0).isImm() && + isShiftedUInt<5,3>(MI->getOperand(0).getImm())) + return HexagonII::HSIG_S1; + break; + // + // Group A: + // + // Rx = add(Rx,#s7) + // Rd = Rs + // Rd = #u6 + // Rd = #-1 + // if ([!]P0[.new]) Rd = #0 + // Rd = add(r29,#u6:2) + // Rx = add(Rx,Rs) + // P0 = cmp.eq(Rs,#u2) + // Rdd = combine(#0,Rs) + // Rdd = combine(Rs,#0) + // Rdd = combine(#u2,#U2) + // Rd = add(Rs,#1) + // Rd = add(Rs,#-1) + // Rd = sxth/sxtb/zxtb/zxth(Rs) + // Rd = and(Rs,#1) + case Hexagon::A2_addi: + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg)) { + // Rd = add(r29,#u6:2) + if (Hexagon::IntRegsRegClass.contains(SrcReg) && + HRI.getStackRegister() == SrcReg && MI->getOperand(2).isImm() && + isShiftedUInt<6,2>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_A; + // Rx = add(Rx,#s7) + if ((DstReg == SrcReg) && MI->getOperand(2).isImm() && + isInt<7>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_A; + // Rd = add(Rs,#1) + // Rd = add(Rs,#-1) + if (isIntRegForSubInst(SrcReg) && MI->getOperand(2).isImm() && + 
((MI->getOperand(2).getImm() == 1) || + (MI->getOperand(2).getImm() == -1))) + return HexagonII::HSIG_A; + } + break; + case Hexagon::A2_add: + // Rx = add(Rx,Rs) + DstReg = MI->getOperand(0).getReg(); + Src1Reg = MI->getOperand(1).getReg(); + Src2Reg = MI->getOperand(2).getReg(); + if (isIntRegForSubInst(DstReg) && (DstReg == Src1Reg) && + isIntRegForSubInst(Src2Reg)) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_andir: + // Same as zxtb. + // Rd16=and(Rs16,#255) + // Rd16=and(Rs16,#1) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && + ((MI->getOperand(2).getImm() == 1) || + (MI->getOperand(2).getImm() == 255))) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_tfr: + // Rd = Rs + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg)) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_tfrsi: + // Rd = #u6 + // Do not test for #u6 size since the const is getting extended + // regardless and compound could be formed. + // Rd = #-1 + DstReg = MI->getOperand(0).getReg(); + if (isIntRegForSubInst(DstReg)) + return HexagonII::HSIG_A; + break; + case Hexagon::C2_cmoveit: + case Hexagon::C2_cmovenewit: + case Hexagon::C2_cmoveif: + case Hexagon::C2_cmovenewif: + // if ([!]P0[.new]) Rd = #0 + // Actual form: + // %R16<def> = C2_cmovenewit %P0<internal>, 0, %R16<imp-use,undef>; + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && + Hexagon::PredRegsRegClass.contains(SrcReg) && Hexagon::P0 == SrcReg && + MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) + return HexagonII::HSIG_A; + break; + case Hexagon::C2_cmpeqi: + // P0 = cmp.eq(Rs,#u2) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (Hexagon::PredRegsRegClass.contains(DstReg) && + Hexagon::P0 == DstReg && isIntRegForSubInst(SrcReg) && + MI->getOperand(2).isImm() && isUInt<2>(MI->getOperand(2).getImm())) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_combineii: + case Hexagon::A4_combineii: + // Rdd = combine(#u2,#U2) + DstReg = MI->getOperand(0).getReg(); + if (isDblRegForSubInst(DstReg, HRI) && + ((MI->getOperand(1).isImm() && isUInt<2>(MI->getOperand(1).getImm())) || + (MI->getOperand(1).isGlobal() && + isUInt<2>(MI->getOperand(1).getOffset()))) && + ((MI->getOperand(2).isImm() && isUInt<2>(MI->getOperand(2).getImm())) || + (MI->getOperand(2).isGlobal() && + isUInt<2>(MI->getOperand(2).getOffset())))) + return HexagonII::HSIG_A; + break; + case Hexagon::A4_combineri: + // Rdd = combine(Rs,#0) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isDblRegForSubInst(DstReg, HRI) && isIntRegForSubInst(SrcReg) && + ((MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) || + (MI->getOperand(2).isGlobal() && MI->getOperand(2).getOffset() == 0))) + return HexagonII::HSIG_A; + break; + case Hexagon::A4_combineir: + // Rdd = combine(#0,Rs) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(2).getReg(); + if (isDblRegForSubInst(DstReg, HRI) && isIntRegForSubInst(SrcReg) && + ((MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) || + (MI->getOperand(1).isGlobal() && MI->getOperand(1).getOffset() == 0))) + return HexagonII::HSIG_A; + break; + case Hexagon::A2_sxtb: + case Hexagon::A2_sxth: + case Hexagon::A2_zxtb: + case Hexagon::A2_zxth: + // Rd = 
sxth/sxtb/zxtb/zxth(Rs) + DstReg = MI->getOperand(0).getReg(); + SrcReg = MI->getOperand(1).getReg(); + if (isIntRegForSubInst(DstReg) && isIntRegForSubInst(SrcReg)) + return HexagonII::HSIG_A; + break; + } - return false; + return HexagonII::HSIG_None; } -bool HexagonInstrInfo::isConstExtended(const MachineInstr *MI) const { - const uint64_t F = MI->getDesc().TSFlags; - unsigned isExtended = (F >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask; - if (isExtended) // Instruction must be extended. - return true; - unsigned isExtendable = - (F >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask; - if (!isExtendable) - return false; - - short ExtOpNum = getCExtOpNum(MI); - const MachineOperand &MO = MI->getOperand(ExtOpNum); - // Use MO operand flags to determine if MO - // has the HMOTF_ConstExtended flag set. - if (MO.getTargetFlags() && HexagonII::HMOTF_ConstExtended) - return true; - // If this is a Machine BB address we are talking about, and it is - // not marked as extended, say so. - if (MO.isMBB()) - return false; - - // We could be using an instruction with an extendable immediate and shoehorn - // a global address into it. If it is a global address it will be constant - // extended. We do this for COMBINE. - // We currently only handle isGlobal() because it is the only kind of - // object we are going to end up with here for now. - // In the future we probably should add isSymbol(), etc. - if (MO.isGlobal() || MO.isSymbol() || MO.isBlockAddress() || - MO.isJTI() || MO.isCPI()) - return true; - - // If the extendable operand is not 'Immediate' type, the instruction should - // have 'isExtended' flag set. - assert(MO.isImm() && "Extendable operand must be Immediate type"); +short HexagonInstrInfo::getEquivalentHWInstr(const MachineInstr *MI) const { + return Hexagon::getRealHWInstr(MI->getOpcode(), Hexagon::InstrType_Real); +} - int MinValue = getMinValue(MI); - int MaxValue = getMaxValue(MI); - int ImmValue = MO.getImm(); - return (ImmValue < MinValue || ImmValue > MaxValue); +// Return first non-debug instruction in the basic block. +MachineInstr *HexagonInstrInfo::getFirstNonDbgInst(MachineBasicBlock *BB) + const { + for (auto MII = BB->instr_begin(), End = BB->instr_end(); MII != End; MII++) { + MachineInstr *MI = &*MII; + if (MI->isDebugValue()) + continue; + return MI; + } + return nullptr; } -// Return the number of bytes required to encode the instruction. -// Hexagon instructions are fixed length, 4 bytes, unless they -// use a constant extender, which requires another 4 bytes. -// For debug instructions and prolog labels, return 0. -unsigned HexagonInstrInfo::getSize(const MachineInstr *MI) const { - if (MI->isDebugValue() || MI->isPosition()) - return 0; +unsigned HexagonInstrInfo::getInstrTimingClassLatency( + const InstrItineraryData *ItinData, const MachineInstr *MI) const { + // Default to one cycle for no itinerary. However, an "empty" itinerary may + // still have a MinLatency property, which getStageLatency checks. + if (!ItinData) + return getInstrLatency(ItinData, MI); - unsigned Size = MI->getDesc().getSize(); - if (!Size) - // Assume the default insn size in case it cannot be determined - // for whatever reason. - Size = HEXAGON_INSTR_SIZE; - - if (isConstExtended(MI) || isExtended(MI)) - Size += HEXAGON_INSTR_SIZE; - - return Size; + // Get the latency embedded in the itinerary. If we're not using timing class + // latencies or if we using BSB scheduling, then restrict the maximum latency + // to 1 (that is, either 0 or 1). 
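Aside (editor's note, not part of the patch): the duplex classification in getDuplexCandidateGroup above relies entirely on LLVM's MathExtras range predicates (isUInt<N>, isInt<N>, isShiftedUInt<N,S>, isShiftedInt<N,S>) to decide whether an offset fits a sub-instruction encoding. A minimal standalone sketch of what those checks accept, assuming only llvm/Support/MathExtras.h:

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void duplexOffsetChecks() {
  // Rd = memh(Rs+#u3:1): a 3-bit unsigned field scaled by 2 -> even offsets 0..14.
  assert(llvm::isShiftedUInt<3, 1>(14));   // in range and 2-byte aligned
  assert(!llvm::isShiftedUInt<3, 1>(16));  // out of range
  assert(!llvm::isShiftedUInt<3, 1>(7));   // not a multiple of 2
  // Rdd = memd(r29+#u5:3): a 5-bit unsigned field scaled by 8 -> offsets 0..248.
  assert(llvm::isShiftedUInt<5, 3>(248));
  // Rx = add(Rx,#s7): a plain signed 7-bit immediate, -64..63.
  assert(llvm::isInt<7>(-64) && !llvm::isInt<7>(64));
}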
+ if (MI->isTransient()) + return 0; + unsigned Latency = ItinData->getStageLatency(MI->getDesc().getSchedClass()); + if (!EnableTimingClassLatency || + MI->getParent()->getParent()->getSubtarget<HexagonSubtarget>(). + useBSBScheduling()) + if (Latency > 1) + Latency = 1; + return Latency; } -// Returns the opcode to use when converting MI, which is a conditional jump, -// into a conditional instruction which uses the .new value of the predicate. -// We also use branch probabilities to add a hint to the jump. -int -HexagonInstrInfo::getDotNewPredJumpOp(MachineInstr *MI, - const - MachineBranchProbabilityInfo *MBPI) const { - - // We assume that block can have at most two successors. - bool taken = false; - MachineBasicBlock *Src = MI->getParent(); - MachineOperand *BrTarget = &MI->getOperand(1); - MachineBasicBlock *Dst = BrTarget->getMBB(); - const BranchProbability Prediction = MBPI->getEdgeProbability(Src, Dst); - if (Prediction >= BranchProbability(1,2)) - taken = true; +// inverts the predication logic. +// p -> NotP +// NotP -> P +bool HexagonInstrInfo::getInvertedPredSense( + SmallVectorImpl<MachineOperand> &Cond) const { + if (Cond.empty()) + return false; + unsigned Opc = getInvertedPredicatedOpcode(Cond[0].getImm()); + Cond[0].setImm(Opc); + return true; +} - switch (MI->getOpcode()) { - case Hexagon::J2_jumpt: - return taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew; - case Hexagon::J2_jumpf: - return taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew; - default: - llvm_unreachable("Unexpected jump instruction."); - } -} -// Returns true if a particular operand is extendable for an instruction. -bool HexagonInstrInfo::isOperandExtended(const MachineInstr *MI, - unsigned short OperandNum) const { - const uint64_t F = MI->getDesc().TSFlags; +unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const { + int InvPredOpcode; + InvPredOpcode = isPredicatedTrue(Opc) ? Hexagon::getFalsePredOpcode(Opc) + : Hexagon::getTruePredOpcode(Opc); + if (InvPredOpcode >= 0) // Valid instruction with the inverted predicate. + return InvPredOpcode; - return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask) - == OperandNum; + llvm_unreachable("Unexpected predicated instruction"); } -// Returns Operand Index for the constant extended instruction. -unsigned short HexagonInstrInfo::getCExtOpNum(const MachineInstr *MI) const { - const uint64_t F = MI->getDesc().TSFlags; - return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask); -} -// Returns the min value that doesn't need to be extended. -int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const { +// Returns the max value that doesn't need to be extended. +int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const { const uint64_t F = MI->getDesc().TSFlags; unsigned isSigned = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask; @@ -1896,13 +3582,20 @@ int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const { & HexagonII::ExtentBitsMask; if (isSigned) // if value is signed - return -1U << (bits - 1); + return ~(-1U << (bits - 1)); else - return 0; + return ~(-1U << bits); } -// Returns the max value that doesn't need to be extended. -int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const { + +unsigned HexagonInstrInfo::getMemAccessSize(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::MemAccessSizePos) & HexagonII::MemAccesSizeMask; +} + + +// Returns the min value that doesn't need to be extended. 
+int HexagonInstrInfo::getMinValue(const MachineInstr *MI) const { const uint64_t F = MI->getDesc().TSFlags; unsigned isSigned = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask; @@ -1910,49 +3603,14 @@ int HexagonInstrInfo::getMaxValue(const MachineInstr *MI) const { & HexagonII::ExtentBitsMask; if (isSigned) // if value is signed - return ~(-1U << (bits - 1)); + return -1U << (bits - 1); else - return ~(-1U << bits); + return 0; } -// Returns true if an instruction can be converted into a non-extended -// equivalent instruction. -bool HexagonInstrInfo::NonExtEquivalentExists (const MachineInstr *MI) const { - - short NonExtOpcode; - // Check if the instruction has a register form that uses register in place - // of the extended operand, if so return that as the non-extended form. - if (Hexagon::getRegForm(MI->getOpcode()) >= 0) - return true; - - if (MI->getDesc().mayLoad() || MI->getDesc().mayStore()) { - // Check addressing mode and retrieve non-ext equivalent instruction. - - switch (getAddrMode(MI)) { - case HexagonII::Absolute : - // Load/store with absolute addressing mode can be converted into - // base+offset mode. - NonExtOpcode = Hexagon::getBasedWithImmOffset(MI->getOpcode()); - break; - case HexagonII::BaseImmOffset : - // Load/store with base+offset addressing mode can be converted into - // base+register offset addressing mode. However left shift operand should - // be set to 0. - NonExtOpcode = Hexagon::getBaseWithRegOffset(MI->getOpcode()); - break; - default: - return false; - } - if (NonExtOpcode < 0) - return false; - return true; - } - return false; -} // Returns opcode of the non-extended equivalent instruction. -short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const { - +short HexagonInstrInfo::getNonExtOpcode(const MachineInstr *MI) const { // Check if the instruction has a register form that uses register in place // of the extended operand, if so return that as the non-extended form. short NonExtOpcode = Hexagon::getRegForm(MI->getOpcode()); @@ -1963,9 +3621,12 @@ short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const { // Check addressing mode and retrieve non-ext equivalent instruction. 
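Aside (editor's note, not part of the patch): getMinValue and getMaxValue above derive the representable range of an extendable operand from the ExtentSigned and ExtentBits fields packed into TSFlags. The bit tricks are compact, so here is the same arithmetic spelled out for a signed 8-bit and an unsigned 6-bit extent; the values match what those functions return:

#include <cstdio>

int main() {
  unsigned bits = 8;                 // a signed 8-bit extent (e.g. s8Ext)
  int SMin = -1U << (bits - 1);      // 0xFFFFFF80 -> -128
  int SMax = ~(-1U << (bits - 1));   // 0x0000007F ->  127
  bits = 6;                          // an unsigned 6-bit extent (e.g. u6Ext)
  int UMax = ~(-1U << bits);         // 0x0000003F ->   63
  std::printf("s8: [%d, %d], u6: [0, %d]\n", SMin, SMax, UMax);
  return 0;
}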
switch (getAddrMode(MI)) { case HexagonII::Absolute : - return Hexagon::getBasedWithImmOffset(MI->getOpcode()); + return Hexagon::getBaseWithImmOffset(MI->getOpcode()); case HexagonII::BaseImmOffset : return Hexagon::getBaseWithRegOffset(MI->getOpcode()); + case HexagonII::BaseLongOffset: + return Hexagon::getRegShlForm(MI->getOpcode()); + default: return -1; } @@ -1973,29 +3634,9 @@ short HexagonInstrInfo::getNonExtOpcode (const MachineInstr *MI) const { return -1; } -bool HexagonInstrInfo::PredOpcodeHasJMP_c(Opcode_t Opcode) const { - return (Opcode == Hexagon::J2_jumpt) || - (Opcode == Hexagon::J2_jumpf) || - (Opcode == Hexagon::J2_jumptnewpt) || - (Opcode == Hexagon::J2_jumpfnewpt) || - (Opcode == Hexagon::J2_jumpt) || - (Opcode == Hexagon::J2_jumpf); -} - -bool HexagonInstrInfo::predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const { - if (Cond.empty() || !isPredicated(Cond[0].getImm())) - return false; - return !isPredicatedTrue(Cond[0].getImm()); -} - -bool HexagonInstrInfo::isEndLoopN(Opcode_t Opcode) const { - return (Opcode == Hexagon::ENDLOOP0 || - Opcode == Hexagon::ENDLOOP1); -} bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond, - unsigned &PredReg, unsigned &PredRegPos, - unsigned &PredRegFlags) const { + unsigned &PredReg, unsigned &PredRegPos, unsigned &PredRegFlags) const { if (Cond.empty()) return false; assert(Cond.size() == 2); @@ -2014,3 +3655,174 @@ bool HexagonInstrInfo::getPredReg(ArrayRef<MachineOperand> Cond, return true; } + +short HexagonInstrInfo::getPseudoInstrPair(const MachineInstr *MI) const { + return Hexagon::getRealHWInstr(MI->getOpcode(), Hexagon::InstrType_Pseudo); +} + + +short HexagonInstrInfo::getRegForm(const MachineInstr *MI) const { + return Hexagon::getRegForm(MI->getOpcode()); +} + + +// Return the number of bytes required to encode the instruction. +// Hexagon instructions are fixed length, 4 bytes, unless they +// use a constant extender, which requires another 4 bytes. +// For debug instructions and prolog labels, return 0. +unsigned HexagonInstrInfo::getSize(const MachineInstr *MI) const { + if (MI->isDebugValue() || MI->isPosition()) + return 0; + + unsigned Size = MI->getDesc().getSize(); + if (!Size) + // Assume the default insn size in case it cannot be determined + // for whatever reason. + Size = HEXAGON_INSTR_SIZE; + + if (isConstExtended(MI) || isExtended(MI)) + Size += HEXAGON_INSTR_SIZE; + + // Try and compute number of instructions in asm. + if (BranchRelaxAsmLarge && MI->getOpcode() == Hexagon::INLINEASM) { + const MachineBasicBlock &MBB = *MI->getParent(); + const MachineFunction *MF = MBB.getParent(); + const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); + + // Count the number of register definitions to find the asm string. + unsigned NumDefs = 0; + for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef(); + ++NumDefs) + assert(NumDefs != MI->getNumOperands()-2 && "No asm string?"); + + assert(MI->getOperand(NumDefs).isSymbol() && "No asm string?"); + // Disassemble the AsmStr and approximate number of instructions. 
+ const char *AsmStr = MI->getOperand(NumDefs).getSymbolName(); + Size = getInlineAsmLength(AsmStr, *MAI); + } + + return Size; +} + + +uint64_t HexagonInstrInfo::getType(const MachineInstr* MI) const { + const uint64_t F = MI->getDesc().TSFlags; + return (F >> HexagonII::TypePos) & HexagonII::TypeMask; +} + + +unsigned HexagonInstrInfo::getUnits(const MachineInstr* MI) const { + const TargetSubtargetInfo &ST = MI->getParent()->getParent()->getSubtarget(); + const InstrItineraryData &II = *ST.getInstrItineraryData(); + const InstrStage &IS = *II.beginStage(MI->getDesc().getSchedClass()); + + return IS.getUnits(); +} + + +unsigned HexagonInstrInfo::getValidSubTargets(const unsigned Opcode) const { + const uint64_t F = get(Opcode).TSFlags; + return (F >> HexagonII::validSubTargetPos) & HexagonII::validSubTargetMask; +} + + +// Calculate size of the basic block without debug instructions. +unsigned HexagonInstrInfo::nonDbgBBSize(const MachineBasicBlock *BB) const { + return nonDbgMICount(BB->instr_begin(), BB->instr_end()); +} + + +unsigned HexagonInstrInfo::nonDbgBundleSize( + MachineBasicBlock::const_iterator BundleHead) const { + assert(BundleHead->isBundle() && "Not a bundle header"); + auto MII = BundleHead.getInstrIterator(); + // Skip the bundle header. + return nonDbgMICount(++MII, getBundleEnd(BundleHead)); +} + + +/// immediateExtend - Changes the instruction in place to one using an immediate +/// extender. +void HexagonInstrInfo::immediateExtend(MachineInstr *MI) const { + assert((isExtendable(MI)||isConstExtended(MI)) && + "Instruction must be extendable"); + // Find which operand is extendable. + short ExtOpNum = getCExtOpNum(MI); + MachineOperand &MO = MI->getOperand(ExtOpNum); + // This needs to be something we understand. + assert((MO.isMBB() || MO.isImm()) && + "Branch with unknown extendable field type"); + // Mark given operand as extended. + MO.addTargetFlag(HexagonII::HMOTF_ConstExtended); +} + + +bool HexagonInstrInfo::invertAndChangeJumpTarget( + MachineInstr* MI, MachineBasicBlock* NewTarget) const { + DEBUG(dbgs() << "\n[invertAndChangeJumpTarget] to BB#" + << NewTarget->getNumber(); MI->dump();); + assert(MI->isBranch()); + unsigned NewOpcode = getInvertedPredicatedOpcode(MI->getOpcode()); + int TargetPos = MI->getNumOperands() - 1; + // In general branch target is the last operand, + // but some implicit defs added at the end might change it. + while ((TargetPos > -1) && !MI->getOperand(TargetPos).isMBB()) + --TargetPos; + assert((TargetPos >= 0) && MI->getOperand(TargetPos).isMBB()); + MI->getOperand(TargetPos).setMBB(NewTarget); + if (EnableBranchPrediction && isPredicatedNew(MI)) { + NewOpcode = reversePrediction(NewOpcode); + } + MI->setDesc(get(NewOpcode)); + return true; +} + + +void HexagonInstrInfo::genAllInsnTimingClasses(MachineFunction &MF) const { + /* +++ The code below is used to generate complete set of Hexagon Insn +++ */ + MachineFunction::iterator A = MF.begin(); + MachineBasicBlock &B = *A; + MachineBasicBlock::iterator I = B.begin(); + MachineInstr *MI = &*I; + DebugLoc DL = MI->getDebugLoc(); + MachineInstr *NewMI; + + for (unsigned insn = TargetOpcode::GENERIC_OP_END+1; + insn < Hexagon::INSTRUCTION_LIST_END; ++insn) { + NewMI = BuildMI(B, MI, DL, get(insn)); + DEBUG(dbgs() << "\n" << getName(NewMI->getOpcode()) << + " Class: " << NewMI->getDesc().getSchedClass()); + NewMI->eraseFromParent(); + } + /* --- The code above is used to generate complete set of Hexagon Insn --- */ +} + + +// inverts the predication logic. 
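Aside (editor's note, not part of the patch): once the inline-asm special case is set aside, the size rule implemented by getSize() above reduces to a fixed-width encoding: every Hexagon instruction occupies one 32-bit word (HEXAGON_INSTR_SIZE is 4 bytes), plus one extra word when a constant extender (immext) is required, while debug values and labels contribute nothing. A hedged sketch of just that rule:

// Illustrative only; mirrors the arithmetic in HexagonInstrInfo::getSize().
static unsigned approxInsnBytes(bool IsDebugOrLabel, bool NeedsConstExtender) {
  if (IsDebugOrLabel)
    return 0;                 // no encoding space
  unsigned Size = 4;          // one instruction word
  if (NeedsConstExtender)
    Size += 4;                // immext prefix word carrying the extended immediate
  return Size;
}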
+// p -> NotP +// NotP -> P +bool HexagonInstrInfo::reversePredSense(MachineInstr* MI) const { + DEBUG(dbgs() << "\nTrying to reverse pred. sense of:"; MI->dump()); + MI->setDesc(get(getInvertedPredicatedOpcode(MI->getOpcode()))); + return true; +} + + +// Reverse the branch prediction. +unsigned HexagonInstrInfo::reversePrediction(unsigned Opcode) const { + int PredRevOpcode = -1; + if (isPredictedTaken(Opcode)) + PredRevOpcode = Hexagon::notTakenBranchPrediction(Opcode); + else + PredRevOpcode = Hexagon::takenBranchPrediction(Opcode); + assert(PredRevOpcode > 0); + return PredRevOpcode; +} + + +// TODO: Add more rigorous validation. +bool HexagonInstrInfo::validateBranchCond(const ArrayRef<MachineOperand> &Cond) + const { + return Cond.empty() || (Cond[0].isImm() && (Cond.size() != 1)); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h index d0b8a46..9530d9f 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.h @@ -1,4 +1,3 @@ - //===- HexagonInstrInfo.h - Hexagon Instruction Information -----*- C++ -*-===// // // The LLVM Compiler Infrastructure @@ -28,23 +27,18 @@ namespace llvm { struct EVT; class HexagonSubtarget; + class HexagonInstrInfo : public HexagonGenInstrInfo { virtual void anchor(); const HexagonRegisterInfo RI; - const HexagonSubtarget &Subtarget; public: - typedef unsigned Opcode_t; - explicit HexagonInstrInfo(HexagonSubtarget &ST); - /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As - /// such, whenever a client has an instance of instruction info, it should - /// always be able to get register info as well (through this method). + /// TargetInstrInfo overrides. /// - const HexagonRegisterInfo &getRegisterInfo() const { return RI; } - /// isLoadFromStackSlot - If the specified machine instruction is a direct + /// If the specified machine instruction is a direct /// load from a stack slot, return the virtual or physical register number of /// the destination along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has @@ -52,7 +46,7 @@ public: unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const override; - /// isStoreToStackSlot - If the specified machine instruction is a direct + /// If the specified machine instruction is a direct /// store to a stack slot, return the virtual or physical register number of /// the source reg along with the FrameIndex of the loaded stack slot. If /// not, return 0. This predicate must return 0 if the instruction has @@ -60,50 +54,118 @@ public: unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const override; - + /// Analyze the branching code at the end of MBB, returning + /// true if it cannot be understood (e.g. it's a switch dispatch or isn't + /// implemented for a target). Upon success, this returns false and returns + /// with the following information in various cases: + /// + /// 1. If this block ends with no branches (it just falls through to its succ) + /// just return false, leaving TBB/FBB null. + /// 2. If this block ends with only an unconditional branch, it sets TBB to be + /// the destination block. + /// 3. If this block ends with a conditional branch and it falls through to a + /// successor block, it sets TBB to be the branch destination block and a + /// list of operands that evaluate the condition. 
These operands can be + /// passed to other TargetInstrInfo methods to create new branches. + /// 4. If this block ends with a conditional branch followed by an + /// unconditional branch, it returns the 'true' destination in TBB, the + /// 'false' destination in FBB, and a list of operands that evaluate the + /// condition. These operands can be passed to other TargetInstrInfo + /// methods to create new branches. + /// + /// Note that RemoveBranch and InsertBranch must be implemented to support + /// cases where this method returns success. + /// + /// If AllowModify is true, then this routine is allowed to modify the basic + /// block (e.g. delete instructions after the unconditional branch). + /// bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override; + /// Remove the branching code at the end of the specified MBB. + /// This is only invoked in cases where AnalyzeBranch returns success. It + /// returns the number of instructions that were removed. unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + /// Insert branch code into the end of the specified MachineBasicBlock. + /// The operands to this method are the same as those + /// returned by AnalyzeBranch. This is only invoked in cases where + /// AnalyzeBranch returns success. It returns the number of instructions + /// inserted. + /// + /// It is also invoked by tail merging to add unconditional branches in + /// cases where AnalyzeBranch doesn't apply because there was no original + /// branch to analyze. At least this much must be implemented, else tail + /// merging needs to be disabled. unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, DebugLoc DL) const override; - bool analyzeCompare(const MachineInstr *MI, - unsigned &SrcReg, unsigned &SrcReg2, - int &Mask, int &Value) const override; + /// Return true if it's profitable to predicate + /// instructions with accumulated instruction latency of "NumCycles" + /// of the specified basic block, where the probability of the instructions + /// being executed is given by Probability, and Confidence is a measure + /// of our confidence that it will be properly predicted. + bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, + unsigned ExtraPredCycles, + BranchProbability Probability) const override; + + /// Second variant of isProfitableToIfCvt. This one + /// checks for the case where two basic blocks from the true and false paths + /// of an if-then-else (diamond) are predicated on mutually exclusive + /// predicates, where the probability of the true path being taken is given + /// by Probability, and Confidence is a measure of our confidence that it + /// will be properly predicted. + bool isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, unsigned ExtraFCycles, + BranchProbability Probability) const override; + + /// Return true if it's profitable for if-converter to duplicate instructions + /// of specified accumulated instruction latencies in the specified MBB to + /// enable if-conversion. + /// The probability of the instructions being executed is given by + /// Probability, and Confidence is a measure of our confidence that it + /// will be properly predicted.
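Aside (editor's note, not part of the patch): the AnalyzeBranch contract documented above is easiest to see from the caller's side. A minimal sketch, with the signature taken from the declaration above and header paths assumed for an LLVM of this vintage:

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;

// True when MBB ends in a conditional branch the target could analyze
// (cases 3 and 4 in the comment above).
static bool endsInAnalyzableCondBranch(const TargetInstrInfo &TII,
                                       MachineBasicBlock &MBB) {
  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  // AllowModify=false: inspect only, never delete or rewrite terminators.
  if (TII.AnalyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false))
    return false;             // branch code could not be understood
  return !Cond.empty() && TBB != nullptr;
}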
+ bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, + BranchProbability Probability) const override; + /// Emit instructions to copy a pair of physical registers. + /// + /// This function should support copies within any legal register class as + /// well as any cross-class copies created during instruction selection. + /// + /// The source and destination registers may overlap, which may require a + /// careful implementation when multiple copy instructions are required for + /// large registers. See for example the ARM target. void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; + /// Store the specified register of the given register class to the specified + /// stack frame index. The store instruction is to be added to the given + /// machine basic block before the specified machine instruction. If isKill + /// is true, the register operand is the last use and must be marked kill. void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const; - + /// Load the specified register of the given register class from the specified + /// stack frame index. The load instruction is to be added to the given + /// machine basic block before the specified machine instruction. void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - void loadRegFromAddr(MachineFunction &MF, unsigned DestReg, - SmallVectorImpl<MachineOperand> &Addr, - const TargetRegisterClass *RC, - SmallVectorImpl<MachineInstr*> &NewMIs) const; - - /// expandPostRAPseudo - This function is called for all pseudo instructions + /// This function is called for all pseudo instructions /// that remain after register allocation. Many pseudo instructions are /// created to help register allocation. This is the place to convert them /// into real instructions. The target can edit MI in place, or it can insert @@ -111,122 +173,228 @@ public: /// anything was changed. bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, - int FrameIndex) const override; + /// Reverses the branch condition of the specified condition list, + /// returning false on success and true if it cannot be reversed. + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) + const override; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, - MachineInstr *LoadMI) const override { - return nullptr; - } + /// Insert a noop into the instruction stream at the specified point. + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const override; - unsigned createVR(MachineFunction* MF, MVT VT) const; + /// Returns true if the instruction is already predicated. 
+ bool isPredicated(const MachineInstr *MI) const override; - bool isBranch(const MachineInstr *MI) const; - bool isPredicable(MachineInstr *MI) const override; + /// Convert the instruction into a predicated instruction. + /// It returns true if the operation was successful. bool PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Cond) const override; - bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, - unsigned ExtraPredCycles, - const BranchProbability &Probability) const override; - - bool isProfitableToIfCvt(MachineBasicBlock &TMBB, - unsigned NumTCycles, unsigned ExtraTCycles, - MachineBasicBlock &FMBB, - unsigned NumFCycles, unsigned ExtraFCycles, - const BranchProbability &Probability) const override; + /// Returns true if the first specified predicate + /// subsumes the second, e.g. GE subsumes GT. + bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, + ArrayRef<MachineOperand> Pred2) const override; - bool isPredicated(const MachineInstr *MI) const override; - bool isPredicated(unsigned Opcode) const; - bool isPredicatedTrue(const MachineInstr *MI) const; - bool isPredicatedTrue(unsigned Opcode) const; - bool isPredicatedNew(const MachineInstr *MI) const; - bool isPredicatedNew(unsigned Opcode) const; + /// If the specified instruction defines any predicate + /// or condition code register(s) used for predication, returns true as well + /// as the definition predicate(s) by reference. bool DefinesPredicate(MachineInstr *MI, std::vector<MachineOperand> &Pred) const override; - bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1, - ArrayRef<MachineOperand> Pred2) const override; - bool - ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + /// Return true if the specified instruction can be predicated. + /// By default, this returns true for every instruction with a + /// PredicateOperand. + bool isPredicable(MachineInstr *MI) const override; - bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, - const BranchProbability &Probability) const override; + /// Test if the given instruction should be considered a scheduling boundary. + /// This primarily includes labels and terminators. + bool isSchedulingBoundary(const MachineInstr *MI, + const MachineBasicBlock *MBB, + const MachineFunction &MF) const override; + /// Measure the specified inline asm to determine an approximation of its + /// length. + unsigned getInlineAsmLength(const char *Str, + const MCAsmInfo &MAI) const override; + + /// Allocate and return a hazard recognizer to use for this target when + /// scheduling the machine instructions after register allocation. + ScheduleHazardRecognizer* + CreateTargetPostRAHazardRecognizer(const InstrItineraryData*, + const ScheduleDAG *DAG) const override; + + /// For a comparison instruction, return the source registers + /// in SrcReg and SrcReg2 if having two register operands, and the value it + /// compares against in CmpValue. Return true if the comparison instruction + /// can be analyzed. + bool analyzeCompare(const MachineInstr *MI, + unsigned &SrcReg, unsigned &SrcReg2, + int &Mask, int &Value) const override; + + /// Compute the instruction latency of a given instruction. + /// If the instruction has higher cost when predicated, it's returned via + /// PredCost. + unsigned getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost = 0) const override; + + /// Create machine specific model for scheduling. 
DFAPacketizer * CreateTargetScheduleState(const TargetSubtargetInfo &STI) const override; - bool isSchedulingBoundary(const MachineInstr *MI, - const MachineBasicBlock *MBB, - const MachineFunction &MF) const override; - bool isValidOffset(unsigned Opcode, int Offset, bool Extend = true) const; - bool isValidAutoIncImm(const EVT VT, const int Offset) const; - bool isMemOp(const MachineInstr *MI) const; - bool isSpillPredRegOp(const MachineInstr *MI) const; - bool isU6_3Immediate(const int value) const; - bool isU6_2Immediate(const int value) const; - bool isU6_1Immediate(const int value) const; - bool isU6_0Immediate(const int value) const; - bool isS4_3Immediate(const int value) const; - bool isS4_2Immediate(const int value) const; - bool isS4_1Immediate(const int value) const; - bool isS4_0Immediate(const int value) const; - bool isS12_Immediate(const int value) const; - bool isU6_Immediate(const int value) const; - bool isS8_Immediate(const int value) const; - bool isS6_Immediate(const int value) const; - - bool isSaveCalleeSavedRegsCall(const MachineInstr* MI) const; - bool isConditionalTransfer(const MachineInstr* MI) const; + // Sometimes, it is possible for the target + // to tell, even without aliasing information, that two MIs access different + // memory addresses. This function returns true if two MIs access different + // memory addresses and false otherwise. + bool areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, + AliasAnalysis *AA = nullptr) + const override; + + + /// HexagonInstrInfo specifics. + /// + + const HexagonRegisterInfo &getRegisterInfo() const { return RI; } + + unsigned createVR(MachineFunction* MF, MVT VT) const; + + bool isAbsoluteSet(const MachineInstr* MI) const; + bool isAccumulator(const MachineInstr *MI) const; + bool isComplex(const MachineInstr *MI) const; + bool isCompoundBranchInstr(const MachineInstr *MI) const; + bool isCondInst(const MachineInstr *MI) const; bool isConditionalALU32 (const MachineInstr* MI) const; - bool isConditionalLoad (const MachineInstr* MI) const; + bool isConditionalLoad(const MachineInstr* MI) const; bool isConditionalStore(const MachineInstr* MI) const; - bool isNewValueInst(const MachineInstr* MI) const; - bool isNewValue(const MachineInstr* MI) const; - bool isNewValue(Opcode_t Opcode) const; - bool isDotNewInst(const MachineInstr* MI) const; - int GetDotOldOp(const int opc) const; - int GetDotNewOp(const MachineInstr* MI) const; - int GetDotNewPredOp(MachineInstr *MI, - const MachineBranchProbabilityInfo - *MBPI) const; - bool mayBeNewStore(const MachineInstr* MI) const; + bool isConditionalTransfer(const MachineInstr* MI) const; + bool isConstExtended(const MachineInstr *MI) const; bool isDeallocRet(const MachineInstr *MI) const; - unsigned getInvertedPredicatedOpcode(const int Opc) const; + bool isDependent(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const; + bool isDotCurInst(const MachineInstr* MI) const; + bool isDotNewInst(const MachineInstr* MI) const; + bool isDuplexPair(const MachineInstr *MIa, const MachineInstr *MIb) const; + bool isEarlySourceInstr(const MachineInstr *MI) const; + bool isEndLoopN(unsigned Opcode) const; + bool isExpr(unsigned OpType) const; bool isExtendable(const MachineInstr* MI) const; bool isExtended(const MachineInstr* MI) const; - bool isPostIncrement(const MachineInstr* MI) const; + bool isFloat(const MachineInstr *MI) const; + bool isHVXMemWithAIndirect(const MachineInstr *I, + const MachineInstr *J) const; + bool isIndirectCall(const MachineInstr 
*MI) const; + bool isIndirectL4Return(const MachineInstr *MI) const; + bool isJumpR(const MachineInstr *MI) const; + bool isJumpWithinBranchRange(const MachineInstr *MI, unsigned offset) const; + bool isLateInstrFeedsEarlyInstr(const MachineInstr *LRMI, + const MachineInstr *ESMI) const; + bool isLateResultInstr(const MachineInstr *MI) const; + bool isLateSourceInstr(const MachineInstr *MI) const; + bool isLoopN(const MachineInstr *MI) const; + bool isMemOp(const MachineInstr *MI) const; + bool isNewValue(const MachineInstr* MI) const; + bool isNewValue(unsigned Opcode) const; + bool isNewValueInst(const MachineInstr* MI) const; + bool isNewValueJump(const MachineInstr* MI) const; + bool isNewValueJump(unsigned Opcode) const; bool isNewValueStore(const MachineInstr* MI) const; bool isNewValueStore(unsigned Opcode) const; - bool isNewValueJump(const MachineInstr* MI) const; - bool isNewValueJump(Opcode_t Opcode) const; - bool isNewValueJumpCandidate(const MachineInstr *MI) const; + bool isOperandExtended(const MachineInstr *MI, unsigned OperandNum) const; + bool isPostIncrement(const MachineInstr* MI) const; + bool isPredicatedNew(const MachineInstr *MI) const; + bool isPredicatedNew(unsigned Opcode) const; + bool isPredicatedTrue(const MachineInstr *MI) const; + bool isPredicatedTrue(unsigned Opcode) const; + bool isPredicated(unsigned Opcode) const; + bool isPredicateLate(unsigned Opcode) const; + bool isPredictedTaken(unsigned Opcode) const; + bool isSaveCalleeSavedRegsCall(const MachineInstr *MI) const; + bool isSolo(const MachineInstr* MI) const; + bool isSpillPredRegOp(const MachineInstr *MI) const; + bool isTC1(const MachineInstr *MI) const; + bool isTC2(const MachineInstr *MI) const; + bool isTC2Early(const MachineInstr *MI) const; + bool isTC4x(const MachineInstr *MI) const; + bool isV60VectorInstruction(const MachineInstr *MI) const; + bool isValidAutoIncImm(const EVT VT, const int Offset) const; + bool isValidOffset(unsigned Opcode, int Offset, bool Extend = true) const; + bool isVecAcc(const MachineInstr *MI) const; + bool isVecALU(const MachineInstr *MI) const; + bool isVecUsableNextPacket(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const; + + + bool canExecuteInBundle(const MachineInstr *First, + const MachineInstr *Second) const; + bool hasEHLabel(const MachineBasicBlock *B) const; + bool hasNonExtEquivalent(const MachineInstr *MI) const; + bool hasPseudoInstrPair(const MachineInstr *MI) const; + bool hasUncondBranch(const MachineBasicBlock *B) const; + bool mayBeCurLoad(const MachineInstr* MI) const; + bool mayBeNewStore(const MachineInstr* MI) const; + bool producesStall(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) const; + bool producesStall(const MachineInstr *MI, + MachineBasicBlock::const_instr_iterator MII) const; + bool predCanBeUsedAsDotNew(const MachineInstr *MI, unsigned PredReg) const; + bool PredOpcodeHasJMP_c(unsigned Opcode) const; + bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const; - void immediateExtend(MachineInstr *MI) const; - bool isConstExtended(const MachineInstr *MI) const; - unsigned getSize(const MachineInstr *MI) const; - int getDotNewPredJumpOp(MachineInstr *MI, - const MachineBranchProbabilityInfo *MBPI) const; unsigned getAddrMode(const MachineInstr* MI) const; - bool isOperandExtended(const MachineInstr *MI, - unsigned short OperandNum) const; - unsigned short getCExtOpNum(const MachineInstr *MI) const; - int getMinValue(const MachineInstr *MI) const; + unsigned getBaseAndOffset(const MachineInstr *MI, int 
&Offset, + unsigned &AccessSize) const; + bool getBaseAndOffsetPosition(const MachineInstr *MI, unsigned &BasePos, + unsigned &OffsetPos) const; + SmallVector<MachineInstr*,2> getBranchingInstrs(MachineBasicBlock& MBB) const; + unsigned getCExtOpNum(const MachineInstr *MI) const; + HexagonII::CompoundGroup + getCompoundCandidateGroup(const MachineInstr *MI) const; + unsigned getCompoundOpcode(const MachineInstr *GA, + const MachineInstr *GB) const; + int getCondOpcode(int Opc, bool sense) const; + int getDotCurOp(const MachineInstr* MI) const; + int getDotNewOp(const MachineInstr* MI) const; + int getDotNewPredJumpOp(const MachineInstr *MI, + const MachineBranchProbabilityInfo *MBPI) const; + int getDotNewPredOp(const MachineInstr *MI, + const MachineBranchProbabilityInfo *MBPI) const; + int getDotOldOp(const int opc) const; + HexagonII::SubInstructionGroup getDuplexCandidateGroup(const MachineInstr *MI) + const; + short getEquivalentHWInstr(const MachineInstr *MI) const; + MachineInstr *getFirstNonDbgInst(MachineBasicBlock *BB) const; + unsigned getInstrTimingClassLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI) const; + bool getInvertedPredSense(SmallVectorImpl<MachineOperand> &Cond) const; + unsigned getInvertedPredicatedOpcode(const int Opc) const; int getMaxValue(const MachineInstr *MI) const; - bool NonExtEquivalentExists (const MachineInstr *MI) const; + unsigned getMemAccessSize(const MachineInstr* MI) const; + int getMinValue(const MachineInstr *MI) const; short getNonExtOpcode(const MachineInstr *MI) const; - bool PredOpcodeHasJMP_c(Opcode_t Opcode) const; - bool predOpcodeHasNot(ArrayRef<MachineOperand> Cond) const; - bool isEndLoopN(Opcode_t Opcode) const; bool getPredReg(ArrayRef<MachineOperand> Cond, unsigned &PredReg, unsigned &PredRegPos, unsigned &PredRegFlags) const; - int getCondOpcode(int Opc, bool sense) const; + short getPseudoInstrPair(const MachineInstr *MI) const; + short getRegForm(const MachineInstr *MI) const; + unsigned getSize(const MachineInstr *MI) const; + uint64_t getType(const MachineInstr* MI) const; + unsigned getUnits(const MachineInstr* MI) const; + unsigned getValidSubTargets(const unsigned Opcode) const; + + /// getInstrTimingClassLatency - Compute the instruction latency of a given + /// instruction using Timing Class information, if available. + unsigned nonDbgBBSize(const MachineBasicBlock *BB) const; + unsigned nonDbgBundleSize(MachineBasicBlock::const_iterator BundleHead) const; + + + void immediateExtend(MachineInstr *MI) const; + bool invertAndChangeJumpTarget(MachineInstr* MI, + MachineBasicBlock* NewTarget) const; + void genAllInsnTimingClasses(MachineFunction &MF) const; + bool reversePredSense(MachineInstr* MI) const; + unsigned reversePrediction(unsigned Opcode) const; + bool validateBranchCond(const ArrayRef<MachineOperand> &Cond) const; }; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td index 3b32c10..5cfeba7 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfo.td @@ -13,7 +13,7 @@ include "HexagonInstrFormats.td" include "HexagonOperands.td" - +include "HexagonInstrEnc.td" // Pattern fragment that combines the value type and the register class // into a single parameter. 
// The pat frags in the definitions below need to have a named register, @@ -1426,9 +1426,6 @@ def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def eh_return: SDNode<"HexagonISD::EH_RETURN", SDTNone, [SDNPHasChain]>; -def SDHexagonBR_JT: SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>; -def HexagonBR_JT: SDNode<"HexagonISD::BR_JT", SDHexagonBR_JT, [SDNPHasChain]>; - class CondStr<string CReg, bit True, bit New> { string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") "; } @@ -1606,8 +1603,6 @@ def EH_RETURN_JMPR : T_JMPr; def: Pat<(eh_return), (EH_RETURN_JMPR (i32 R31))>; -def: Pat<(HexagonBR_JT (i32 IntRegs:$dst)), - (J2_jumpr IntRegs:$dst)>; def: Pat<(brind (i32 IntRegs:$dst)), (J2_jumpr IntRegs:$dst)>; @@ -2825,7 +2820,7 @@ let CextOpcode = "ADD_acc" in { let isExtentSigned = 1 in def M2_accii : T_MType_acc_ri <"+= add", 0b100, s8Ext, [(set (i32 IntRegs:$dst), - (add (add (i32 IntRegs:$src2), s16_16ImmPred:$src3), + (add (add (i32 IntRegs:$src2), s32ImmPred:$src3), (i32 IntRegs:$src1)))]>, ImmRegRel; def M2_acci : T_MType_acc_rr <"+= add", 0b000, 0b001, 0, @@ -2859,7 +2854,7 @@ class T_MType_acc_pat2 <InstHexagon MI, SDNode firstOp, SDNode secOp> def : T_MType_acc_pat2 <M2_xor_xacc, xor, xor>; def : T_MType_acc_pat1 <M2_macsin, mul, sub, u32ImmPred>; -def : T_MType_acc_pat1 <M2_naccii, add, sub, s16_16ImmPred>; +def : T_MType_acc_pat1 <M2_naccii, add, sub, s32ImmPred>; def : T_MType_acc_pat2 <M2_nacci, add, sub>; //===----------------------------------------------------------------------===// @@ -3303,7 +3298,8 @@ class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2}, !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1}, /* s4_0Imm */ offset{3-0}))); - let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1); + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1)); let IClass = 0b1010; @@ -3322,7 +3318,7 @@ class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp, //===----------------------------------------------------------------------===// let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp, - bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew > + bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew> : STInst <(outs IntRegs:$_dst_), (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3), !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ", @@ -3341,7 +3337,8 @@ class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1}, /* s4_0Imm */ offset{3-0}))); - let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1); + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1)); let isPredicatedNew = isPredNew; let isPredicatedFalse = isPredNot; @@ -3404,7 +3401,6 @@ def: Storepi_pat<post_store, I64, s4_3ImmPred, S2_storerd_pi>; //===----------------------------------------------------------------------===// // Template class for post increment stores with register offset. 
//===----------------------------------------------------------------------===// -let isNVStorable = 1 in class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0> : STInst <(outs IntRegs:$_dst_), @@ -3416,6 +3412,9 @@ class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp, bits<5> src3; let accessSize = AccessSz; + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if(!eq(mnemonic,"memd"), 0, !if(isHalf,0,1)); + let IClass = 0b1010; let Inst{27-24} = 0b1101; @@ -3430,12 +3429,11 @@ def S2_storerb_pr : T_store_pr<"memb", IntRegs, 0b000, ByteAccess>; def S2_storerh_pr : T_store_pr<"memh", IntRegs, 0b010, HalfWordAccess>; def S2_storeri_pr : T_store_pr<"memw", IntRegs, 0b100, WordAccess>; def S2_storerd_pr : T_store_pr<"memd", DoubleRegs, 0b110, DoubleWordAccess>; - def S2_storerf_pr : T_store_pr<"memh", IntRegs, 0b011, HalfWordAccess, 1>; let opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp, - bits<3>MajOp, bit isH = 0> + bits<3> MajOp, bit isH = 0> : STInst <(outs), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), mnemonic#"($src1+#$src2) = $src3"#!if(isH,".h","")>, @@ -3455,6 +3453,8 @@ class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "s11_2Ext"), src2{12-2}, !if (!eq(ImmOpStr, "s11_1Ext"), src2{11-1}, /* s11_0Ext */ src2{10-0}))); + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1)); let IClass = 0b1010; let Inst{27} = 0b0; @@ -3494,7 +3494,10 @@ class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "u6_2Ext"), src3{7-2}, !if (!eq(ImmOpStr, "u6_1Ext"), src3{6-1}, /* u6_0Ext */ src3{5-0}))); - let IClass = 0b0100; + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1)); + + let IClass = 0b0100; let Inst{27} = 0b0; let Inst{26} = PredNot; @@ -3508,7 +3511,7 @@ class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp, let Inst{1-0} = src1; } -let isExtendable = 1, isNVStorable = 1, hasSideEffects = 0 in +let isExtendable = 1, hasSideEffects = 0 in multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC, Operand ImmOp, Operand predImmOp, bits<3> MajOp, bit isH = 0> { let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in { @@ -3665,7 +3668,7 @@ def S2_allocframe: ST0Inst < // S2_storer[bhwdf]_pci: Store byte/half/word/double. 
// S2_storer[bhwdf]_pci -> S2_storerbnew_pci -let Uses = [CS], isNVStorable = 1 in +let Uses = [CS] in class T_store_pci <string mnemonic, RegisterClass RC, Operand Imm, bits<4>MajOp, MemAccessSize AlignSize, string RegSrc = "Rt"> @@ -3679,6 +3682,8 @@ class T_store_pci <string mnemonic, RegisterClass RC, bits<1> Mu; bits<5> Rt; let accessSize = AlignSize; + let isNVStorable = !if(!eq(mnemonic,"memd"), 0, + !if(!eq(RegSrc,"Rt.h"), 0, 1)); let IClass = 0b1010; let Inst{27-25} = 0b100; @@ -3696,15 +3701,15 @@ class T_store_pci <string mnemonic, RegisterClass RC, } def S2_storerb_pci : T_store_pci<"memb", IntRegs, s4_0Imm, 0b1000, - ByteAccess>; + ByteAccess>; def S2_storerh_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1010, - HalfWordAccess>; + HalfWordAccess>; def S2_storerf_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1011, - HalfWordAccess, "Rt.h">; + HalfWordAccess, "Rt.h">; def S2_storeri_pci : T_store_pci<"memw", IntRegs, s4_2Imm, 0b1100, - WordAccess>; + WordAccess>; def S2_storerd_pci : T_store_pci<"memd", DoubleRegs, s4_3Imm, 0b1110, - DoubleWordAccess>; + DoubleWordAccess>; let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 4 in class T_storenew_pci <string mnemonic, Operand Imm, @@ -3762,7 +3767,7 @@ def S2_storerd_pci_pseudo : T_store_pci_pseudo <"memd", DoubleRegs>; //===----------------------------------------------------------------------===// // Circular stores with auto-increment register //===----------------------------------------------------------------------===// -let Uses = [CS], isNVStorable = 1 in +let Uses = [CS] in class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp, MemAccessSize AlignSize, string RegSrc = "Rt"> : STInst <(outs IntRegs:$_dst_), @@ -3775,6 +3780,8 @@ class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp, bits<5> Rt; let accessSize = AlignSize; + let isNVStorable = !if(!eq(mnemonic,"memd"), 0, + !if(!eq(RegSrc,"Rt.h"), 0, 1)); let IClass = 0b1010; let Inst{27-25} = 0b100; @@ -5784,7 +5791,19 @@ include "HexagonInstrInfoV5.td" //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// +// V60 Instructions + +//===----------------------------------------------------------------------===// + +include "HexagonInstrInfoV60.td" + +//===----------------------------------------------------------------------===// +// V60 Instructions - +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// // ALU32/64/Vector + //===----------------------------------------------------------------------===/// include "HexagonInstrInfoVector.td" + +include "HexagonInstrAlias.td" diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td index 65b0f49..87d6b35 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV4.td @@ -684,7 +684,7 @@ def: Pat<(i64 (zext (i32 IntRegs:$src1))), // Template class for store instructions with Absolute set addressing mode. 
//===----------------------------------------------------------------------===// let isExtended = 1, opExtendable = 1, opExtentBits = 6, - addrMode = AbsoluteSet, isNVStorable = 1 in + addrMode = AbsoluteSet in class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC, bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0> : STInst<(outs IntRegs:$dst), @@ -696,6 +696,9 @@ class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC, let accessSize = AccessSz; let BaseOpcode = BaseOp#"_AbsSet"; + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); + let IClass = 0b1010; let Inst{27-24} = 0b1011; @@ -750,7 +753,7 @@ let mayStore = 1, addrMode = AbsoluteSet in { } let isExtended = 1, opExtendable = 2, opExtentBits = 6, InputType = "imm", -addrMode = BaseLongOffset, AddedComplexity = 40 in + addrMode = BaseLongOffset, AddedComplexity = 40 in class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC, bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0> : STInst<(outs), @@ -766,6 +769,10 @@ class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC, let accessSize = AccessSz; let CextOpcode = CextOp; let BaseOpcode = CextOp#"_shl"; + + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); + let IClass = 0b1010; let Inst{27-24} =0b1101; @@ -856,6 +863,9 @@ class T_store_rr <string mnemonic, RegisterClass RC, bits<3> MajOp, bit isH> bits<2> u2; bits<5> Rt; + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1)); + let IClass = 0b0011; let Inst{27-24} = 0b1011; @@ -888,6 +898,8 @@ class T_pstore_rr <string mnemonic, RegisterClass RC, bits<3> MajOp, let isPredicatedFalse = isNot; let isPredicatedNew = isPredNew; + // Store upper-half and store doubleword cannot be NV. 
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1)); let IClass = 0b0011; @@ -1826,43 +1838,22 @@ def: LogLogNot_pat<or, or, C4_or_orn>; // below are needed to support code generation for PIC //===----------------------------------------------------------------------===// -def SDT_HexagonPICAdd +def SDT_HexagonAtGot + : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; +def SDT_HexagonAtPcrel : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; -def SDT_HexagonGOTAdd - : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; - -def SDT_HexagonGOTAddInternal : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; -def SDT_HexagonGOTAddInternalJT : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; -def SDT_HexagonGOTAddInternalBA : SDTypeProfile<1, 1, [SDTCisVT<0, i32>]>; - -def Hexagonpic_add : SDNode<"HexagonISD::PIC_ADD", SDT_HexagonPICAdd>; -def Hexagonat_got : SDNode<"HexagonISD::AT_GOT", SDT_HexagonGOTAdd>; -def Hexagongat_pcrel : SDNode<"HexagonISD::AT_PCREL", - SDT_HexagonGOTAddInternal>; -def Hexagongat_pcrel_jt : SDNode<"HexagonISD::AT_PCREL", - SDT_HexagonGOTAddInternalJT>; -def Hexagongat_pcrel_ba : SDNode<"HexagonISD::AT_PCREL", - SDT_HexagonGOTAddInternalBA>; - -// PIC: Map from a block address computation to a PC-relative add -def: Pat<(Hexagongat_pcrel_ba tblockaddress:$src1), - (C4_addipc u32ImmPred:$src1)>; - -// PIC: Map from the computation to generate a GOT pointer to a PC-relative add -def: Pat<(Hexagonpic_add texternalsym:$src1), - (C4_addipc u32ImmPred:$src1)>; -// PIC: Map from a jump table address computation to a PC-relative add -def: Pat<(Hexagongat_pcrel_jt tjumptable:$src1), - (C4_addipc u32ImmPred:$src1)>; +// AT_GOT address-of-GOT, address-of-global, offset-in-global +def HexagonAtGot : SDNode<"HexagonISD::AT_GOT", SDT_HexagonAtGot>; +// AT_PCREL address-of-global +def HexagonAtPcrel : SDNode<"HexagonISD::AT_PCREL", SDT_HexagonAtPcrel>; -// PIC: Map from a GOT-relative symbol reference to a load -def: Pat<(Hexagonat_got (i32 IntRegs:$src1), tglobaladdr:$src2), - (L2_loadri_io IntRegs:$src1, s30_2ImmPred:$src2)>; - -// PIC: Map from a static symbol reference to a PC-relative add -def: Pat<(Hexagongat_pcrel tglobaladdr:$src1), - (C4_addipc u32ImmPred:$src1)>; +def: Pat<(HexagonAtGot I32:$got, I32:$addr, (i32 0)), + (L2_loadri_io I32:$got, imm:$addr)>; +def: Pat<(HexagonAtGot I32:$got, I32:$addr, s30_2ImmPred:$off), + (A2_addi (L2_loadri_io I32:$got, imm:$addr), imm:$off)>; +def: Pat<(HexagonAtPcrel I32:$addr), + (C4_addipc imm:$addr)>; //===----------------------------------------------------------------------===// // CR - @@ -1903,7 +1894,7 @@ def S4_addaddi : ALU64Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Ru, s6Ext:$s6), "$Rd = add($Rs, add($Ru, #$s6))" , [(set (i32 IntRegs:$Rd), (add (i32 IntRegs:$Rs), - (add (i32 IntRegs:$Ru), s16_16ImmPred:$s6)))], + (add (i32 IntRegs:$Ru), s32ImmPred:$s6)))], "", ALU64_tc_2_SLOT23> { bits<5> Rd; bits<5> Rs; @@ -3290,27 +3281,33 @@ defm L4_return: LD_MISC_L4_RETURN <"dealloc_return">, PredNewRel; let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1, Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in { def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP<"">; + let isExtended = 1, opExtendable = 0 in + def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP<"">; } // Restore registers and dealloc frame before a tail call. 
let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in { def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<"">, PredRel; + let isExtended = 1, opExtendable = 0 in + def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<"">, PredRel; } // Save registers function call. let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in { def SAVE_REGISTERS_CALL_V4 : T_Call<"">, PredRel; + let isExtended = 1, opExtendable = 0 in + def SAVE_REGISTERS_CALL_V4_EXT : T_Call<"">, PredRel; } //===----------------------------------------------------------------------===// // Template class for non predicated store instructions with // GP-Relative or absolute addressing. //===----------------------------------------------------------------------===// -let hasSideEffects = 0, isPredicable = 1, isNVStorable = 1 in +let hasSideEffects = 0, isPredicable = 1 in class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, - bits<2>MajOp, Operand AddrOp, bit isAbs, bit isHalf> - : STInst<(outs), (ins AddrOp:$addr, RC:$src), - mnemonic # !if(isAbs, "(##", "(#")#"$addr) = $src"#!if(isHalf, ".h",""), + bits<2>MajOp, bit isAbs, bit isHalf> + : STInst<(outs), (ins ImmOp:$addr, RC:$src), + mnemonic # "(#$addr) = $src"#!if(isHalf, ".h",""), [], "", V2LDST_tc_st_SLOT01> { bits<19> addr; bits<5> src; @@ -3321,6 +3318,9 @@ class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2}, !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1}, /* u16_0Imm */ addr{15-0}))); + // Store upper-half and store doubleword cannot be NV. + let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); + let IClass = 0b0100; let Inst{27} = 1; let Inst{26-25} = offsetBits{15-14}; @@ -3337,11 +3337,10 @@ class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, // Template class for predicated store instructions with // GP-Relative or absolute addressing. //===----------------------------------------------------------------------===// -let hasSideEffects = 0, isPredicated = 1, isNVStorable = 1, opExtentBits = 6, - opExtendable = 1 in +let hasSideEffects = 0, isPredicated = 1, opExtentBits = 6, opExtendable = 1 in class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp, bit isHalf, bit isNot, bit isNew> - : STInst<(outs), (ins PredRegs:$src1, u6Ext:$absaddr, RC: $src2), + : STInst<(outs), (ins PredRegs:$src1, u32MustExt:$absaddr, RC: $src2), !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ", ") ")#mnemonic#"(#$absaddr) = $src2"#!if(isHalf, ".h",""), [], "", ST_tc_st_SLOT01>, AddrModeRel { @@ -3351,6 +3350,8 @@ class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp, let isPredicatedNew = isNew; let isPredicatedFalse = isNot; + // Store upper-half and store doubleword cannot be NV. 
+ let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1)); let IClass = 0b1010; @@ -3371,7 +3372,7 @@ class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp, //===----------------------------------------------------------------------===// class T_StoreAbs <string mnemonic, RegisterClass RC, Operand ImmOp, bits<2> MajOp, bit isHalf> - : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, u32Imm, 1, isHalf>, + : T_StoreAbsGP <mnemonic, RC, u32MustExt, MajOp, 1, isHalf>, AddrModeRel { string ImmOpStr = !cast<string>(ImmOp); let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19, @@ -3538,7 +3539,7 @@ defm storerf : ST_Abs <"memh", "STrif", IntRegs, u16_1Imm, 0b01, 1>; let isAsmParserOnly = 1 in class T_StoreGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp, bits<2> MajOp, bit isHalf = 0> - : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, globaladdress, 0, isHalf> { + : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, 0, isHalf> { // Set BaseOpcode same as absolute addressing instructions so that // non-predicated GP-Rel instructions can have relate with predicated // Absolute instruction. @@ -3553,7 +3554,7 @@ multiclass ST_GP <string mnemonic, string BaseOp, Operand ImmOp, // Absolute instruction. let BaseOpcode = BaseOp#_abs in { def NAME#gp : T_StoreAbsGP <mnemonic, IntRegs, ImmOp, MajOp, - globaladdress, 0, isHalf>; + 0, isHalf>; // New-value store def NAME#newgp : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp, 0> ; } @@ -3615,9 +3616,9 @@ let AddedComplexity = 100 in { //===----------------------------------------------------------------------===// let isPredicable = 1, hasSideEffects = 0 in class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, - bits<3> MajOp, Operand AddrOp, bit isAbs> - : LDInst <(outs RC:$dst), (ins AddrOp:$addr), - "$dst = "#mnemonic# !if(isAbs, "(##", "(#")#"$addr)", + bits<3> MajOp> + : LDInst <(outs RC:$dst), (ins ImmOp:$addr), + "$dst = "#mnemonic# "(#$addr)", [], "", V2LDST_tc_ld_SLOT01> { bits<5> dst; bits<19> addr; @@ -3642,7 +3643,7 @@ class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp, class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp, bits<3> MajOp> - : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp, u32Imm, 1>, AddrModeRel { + : T_LoadAbsGP <mnemonic, RC, u32MustExt, MajOp>, AddrModeRel { string ImmOpStr = !cast<string>(ImmOp); let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19, @@ -3660,10 +3661,11 @@ class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp, // Template class for predicated load instructions with // absolute addressing mode. 
//===----------------------------------------------------------------------===// -let isPredicated = 1, opExtentBits = 6, opExtendable = 2 in +let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opExtentBits = 6, + opExtendable = 2 in class T_LoadAbs_Pred <string mnemonic, RegisterClass RC, bits<3> MajOp, bit isPredNot, bit isPredNew> - : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u6Ext:$absaddr), + : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u32MustExt:$absaddr), !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ", ") ")#"$dst = "#mnemonic#"(#$absaddr)">, AddrModeRel { bits<5> dst; @@ -3737,7 +3739,7 @@ defm loadrd : LD_Abs<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>; let isAsmParserOnly = 1 in class T_LoadGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp, bits<3> MajOp> - : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp, globaladdress, 0>, PredNewRel { + : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp>, PredNewRel { let BaseOpcode = BaseOp#_abs; } @@ -3841,26 +3843,6 @@ let AddedComplexity = 100 in { def: Stoream_pat<truncstorei32, I64, addrga, LoReg, S2_storeriabs>; } -// Map from Pd = load(globaladdress) -> Rd = memb(globaladdress), Pd = Rd -let AddedComplexity = 100 in -def : Pat <(i1 (load (HexagonCONST32_GP tglobaladdr:$global))), - (i1 (C2_tfrrp (i32 (L2_loadrbgp tglobaladdr:$global))))>; - -// Transfer global address into a register -let isExtended = 1, opExtendable = 1, AddedComplexity=50, isMoveImm = 1, -isAsCheapAsAMove = 1, isReMaterializable = 1, isCodeGenOnly = 1 in -def TFRI_V4 : ALU32_ri<(outs IntRegs:$dst), (ins s16Ext:$src1), - "$dst = #$src1", - [(set IntRegs:$dst, (HexagonCONST32 tglobaladdr:$src1))]>; - -// Transfer a block address into a register -def : Pat<(HexagonCONST32_GP tblockaddress:$src1), - (TFRI_V4 tblockaddress:$src1)>; - -let AddedComplexity = 50 in -def : Pat<(HexagonCONST32_GP tglobaladdr:$src1), - (TFRI_V4 tglobaladdr:$src1)>; - // i8/i16/i32 -> i64 loads // We need a complexity of 120 here to override preceding handling of // zextload. diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td index 337f4ea..823961f 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV5.td @@ -98,21 +98,21 @@ def CONST32_Float_Real : LDInst<(outs IntRegs:$dst), (ins f32imm:$src1), // HexagonInstrInfo.td patterns. 
let isExtended = 1, opExtendable = 1, isMoveImm = 1, isReMaterializable = 1, isPredicable = 1, AddedComplexity = 30, validSubTargets = HasV5SubT, - isCodeGenOnly = 1 in + isCodeGenOnly = 1, isPseudo = 1 in def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32Ext:$src1), "$dst = #$src1", [(set F32:$dst, fpimm:$src1)]>, Requires<[HasV5T]>; -let isExtended = 1, opExtendable = 2, isPredicated = 1, - hasSideEffects = 0, validSubTargets = HasV5SubT, isCodeGenOnly = 1 in +let isExtended = 1, opExtendable = 2, isPredicated = 1, hasSideEffects = 0, + validSubTargets = HasV5SubT, isCodeGenOnly = 1, isPseudo = 1 in def TFRI_cPt_f : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, f32Ext:$src2), "if ($src1) $dst = #$src2", []>, Requires<[HasV5T]>; -let isPseudo = 1, isExtended = 1, opExtendable = 2, isPredicated = 1, - isPredicatedFalse = 1, hasSideEffects = 0, validSubTargets = HasV5SubT in +let isExtended = 1, opExtendable = 2, isPredicated = 1, isPredicatedFalse = 1, + hasSideEffects = 0, validSubTargets = HasV5SubT, isPseudo = 1 in def TFRI_cNotPt_f : ALU32_ri<(outs IntRegs:$dst), (ins PredRegs:$src1, f32Ext:$src2), "if (!$src1) $dst = #$src2", []>, diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td new file mode 100644 index 0000000..897ada0 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoV60.td @@ -0,0 +1,2241 @@ +//=- HexagonInstrInfoV60.td - Target Desc. for Hexagon Target -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon V60 instructions in TableGen format. 
+// +//===----------------------------------------------------------------------===// + + +// Vector store +let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in +{ + class VSTInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = CVI_VM_ST, + IType type = TypeCVI_VM_ST> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>, OpcodeHexagon; + +} + +// Vector load +let Predicates = [HasV60T, UseHVX] in +let mayLoad = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in + class V6_LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = CVI_VM_LD, + IType type = TypeCVI_VM_LD> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>; + +let Predicates = [HasV60T, UseHVX] in +let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in +class V6_STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = CVI_VM_ST, + IType type = TypeCVI_VM_ST> +: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>; + +//===----------------------------------------------------------------------===// +// Vector loads with base + immediate offset +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, accessSize = Vector64Access in +class T_vload_ai<string asmStr> + : V6_LDInst <(outs VectorRegs:$dst), (ins IntRegs:$src1, s4_6Imm:$src2), + asmStr>; + +let isCodeGenOnly = 1, addrMode = BaseImmOffset, accessSize = Vector128Access in +class T_vload_ai_128B<string asmStr> + : V6_LDInst <(outs VectorRegs128B:$dst), (ins IntRegs:$src1, s4_7Imm:$src2), + asmStr>; + +let isCVLoadable = 1, hasNewValue = 1 in { + def V6_vL32b_ai : T_vload_ai <"$dst = vmem($src1+#$src2)">, + V6_vL32b_ai_enc; + def V6_vL32b_nt_ai : T_vload_ai <"$dst = vmem($src1+#$src2):nt">, + V6_vL32b_nt_ai_enc; + // 128B + def V6_vL32b_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2)">, + V6_vL32b_ai_128B_enc; + def V6_vL32b_nt_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2):nt">, + V6_vL32b_nt_ai_128B_enc; +} + +let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU, hasNewValue = 1 in { + def V6_vL32Ub_ai : T_vload_ai <"$dst = vmemu($src1+#$src2)">, + V6_vL32Ub_ai_enc; + def V6_vL32Ub_ai_128B : T_vload_ai_128B <"$dst = vmemu($src1+#$src2)">, + V6_vL32Ub_ai_128B_enc; +} + +let Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD, isCVLoad = 1, + hasNewValue = 1 in { + def V6_vL32b_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2)">, + V6_vL32b_cur_ai_enc; + def V6_vL32b_nt_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2):nt">, + V6_vL32b_nt_cur_ai_enc; + // 128B + def V6_vL32b_cur_ai_128B : T_vload_ai_128B + <"$dst.cur = vmem($src1+#$src2)">, + V6_vL32b_cur_ai_128B_enc; + def V6_vL32b_nt_cur_ai_128B : T_vload_ai_128B + <"$dst.cur = vmem($src1+#$src2):nt">, + V6_vL32b_nt_cur_ai_128B_enc; +} + + +let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD, hasNewValue = 1 in { + def V6_vL32b_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2)">, + V6_vL32b_tmp_ai_enc; + def V6_vL32b_nt_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2):nt">, + V6_vL32b_nt_tmp_ai_enc; + // 128B + def V6_vL32b_tmp_ai_128B : T_vload_ai_128B + <"$dst.tmp = vmem($src1+#$src2)">, + V6_vL32b_tmp_ai_128B_enc; + def V6_vL32b_nt_tmp_ai_128B : T_vload_ai_128B + <"$dst.tmp = vmem($src1+#$src2)">, + V6_vL32b_nt_tmp_ai_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Vector stores with 
base + immediate offset - unconditional +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, accessSize = Vector64Access in +class T_vstore_ai <string mnemonic, string baseOp, Operand ImmOp, + RegisterClass RC, bit isNT> + : V6_STInst <(outs), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), + mnemonic#"($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3">, NewValueRel { + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_ai_64B <string mnemonic, string baseOp, bit isNT = 0> + : T_vstore_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_ai_128B <string mnemonic, string baseOp, bit isNT = 0> + : T_vstore_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>; + +let isNVStorable = 1 in { + def V6_vS32b_ai : T_vstore_ai_64B <"vmem", "vS32b_ai">, + V6_vS32b_ai_enc; + def V6_vS32b_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai">, + V6_vS32b_ai_128B_enc; +} + +let isNVStorable = 1, isNonTemporal = 1 in { + def V6_vS32b_nt_ai : T_vstore_ai_64B <"vmem", "vS32b_ai", 1>, + V6_vS32b_nt_ai_enc; + def V6_vS32b_nt_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai", 1>, + V6_vS32b_nt_ai_128B_enc; +} + +let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { + def V6_vS32Ub_ai : T_vstore_ai_64B <"vmemu", "vs32Ub_ai">, + V6_vS32Ub_ai_enc; + def V6_vS32Ub_ai_128B : T_vstore_ai_128B <"vmemu", "vs32Ub_ai">, + V6_vS32Ub_ai_128B_enc; +} +//===----------------------------------------------------------------------===// +// Vector stores with base + immediate offset - unconditional new +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, isNewValue = 1, opNewValue = 2, isNVStore = 1, + Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST in +class T_vstore_new_ai <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT> + : V6_STInst <(outs ), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), + "vmem($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3.new">, NewValueRel { + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_new_ai_64B <string baseOp, bit isNT = 0> + : T_vstore_new_ai <baseOp, s4_6Imm, VectorRegs, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_new_ai_128B <string baseOp, bit isNT = 0> + : T_vstore_new_ai <baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>; + +def V6_vS32b_new_ai : T_vstore_new_ai_64B <"vS32b_ai">, V6_vS32b_new_ai_enc; +def V6_vS32b_new_ai_128B : T_vstore_new_ai_128B <"vS32b_ai">, + V6_vS32b_new_ai_128B_enc; + +let isNonTemporal = 1 in { + def V6_vS32b_nt_new_ai : T_vstore_new_ai_64B<"vS32b_ai", 1>, + V6_vS32b_nt_new_ai_enc; + def V6_vS32b_nt_new_ai_128B : T_vstore_new_ai_128B<"vS32b_ai", 1>, + V6_vS32b_nt_new_ai_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Vector stores with base + immediate offset - conditional +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, isPredicated = 1 in +class T_vstore_pred_ai <string mnemonic, string baseOp, Operand ImmOp, + RegisterClass RC, bit isPredNot = 0, bit isNT = 0> + : V6_STInst <(outs), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) " + #mnemonic#"($src2+#$src3)"#!if(isNT, ":nt", "")#" = $src4">, NewValueRel { + let isPredicatedFalse = isPredNot; + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access 
in +class T_vstore_pred_ai_64B <string mnemonic, string baseOp, + bit isPredNot = 0, bit isNT = 0> + : T_vstore_pred_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_pred_ai_128B <string mnemonic, string baseOp, + bit isPredNot = 0, bit isNT = 0> + : T_vstore_pred_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B, + isPredNot, isNT>; + +let isNVStorable = 1 in { + def V6_vS32b_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai">, + V6_vS32b_pred_ai_enc; + def V6_vS32b_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1>, + V6_vS32b_npred_ai_enc; + // 128B + def V6_vS32b_pred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai">, + V6_vS32b_pred_ai_128B_enc; + def V6_vS32b_npred_ai_128B : T_vstore_pred_ai_128B <"vmem", "vS32b_ai", 1>, + V6_vS32b_npred_ai_128B_enc; +} +let isNVStorable = 1, isNonTemporal = 1 in { + def V6_vS32b_nt_pred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 0, 1>, + V6_vS32b_nt_pred_ai_enc; + def V6_vS32b_nt_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1, 1>, + V6_vS32b_nt_npred_ai_enc; + // 128B + def V6_vS32b_nt_pred_ai_128B : T_vstore_pred_ai_128B + <"vmem", "vS32b_ai", 0, 1>, + V6_vS32b_nt_pred_ai_128B_enc; + def V6_vS32b_nt_npred_ai_128B : T_vstore_pred_ai_128B + <"vmem", "vS32b_ai", 1, 1>, + V6_vS32b_nt_npred_ai_128B_enc; +} + +let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { + def V6_vS32Ub_pred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai">, + V6_vS32Ub_pred_ai_enc; + def V6_vS32Ub_npred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai", 1>, + V6_vS32Ub_npred_ai_enc; + // 128B + def V6_vS32Ub_pred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai">, + V6_vS32Ub_pred_ai_128B_enc; + def V6_vS32Ub_npred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai", 1>, + V6_vS32Ub_npred_ai_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Vector stores with base + immediate offset - byte-enabled aligned +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset in +class T_vstore_qpred_ai <Operand ImmOp, RegisterClass RC, + bit isPredNot = 0, bit isNT = 0> + : V6_STInst <(outs), + (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)" + #!if(isNT, ":nt", "")#" = $src4"> { + let isPredicatedFalse = isPredNot; +} + +let accessSize = Vector64Access in +class T_vstore_qpred_ai_64B <bit isPredNot = 0, bit isNT = 0> + : T_vstore_qpred_ai <s4_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_qpred_ai_128B <bit isPredNot = 0, bit isNT = 0> + : T_vstore_qpred_ai <s4_7Imm, VectorRegs128B, isPredNot, isNT>; + +def V6_vS32b_qpred_ai : T_vstore_qpred_ai_64B, V6_vS32b_qpred_ai_enc; +def V6_vS32b_nqpred_ai : T_vstore_qpred_ai_64B <1>, + V6_vS32b_nqpred_ai_enc; +def V6_vS32b_nt_qpred_ai : T_vstore_qpred_ai_64B <0, 1>, + V6_vS32b_nt_qpred_ai_enc; +def V6_vS32b_nt_nqpred_ai : T_vstore_qpred_ai_64B <1, 1>, + V6_vS32b_nt_nqpred_ai_enc; +// 128B +def V6_vS32b_qpred_ai_128B : T_vstore_qpred_ai_128B, V6_vS32b_qpred_ai_128B_enc; +def V6_vS32b_nqpred_ai_128B : T_vstore_qpred_ai_128B<1>, + V6_vS32b_nqpred_ai_128B_enc; +def V6_vS32b_nt_qpred_ai_128B : T_vstore_qpred_ai_128B<0, 1>, + V6_vS32b_nt_qpred_ai_128B_enc; +def V6_vS32b_nt_nqpred_ai_128B : T_vstore_qpred_ai_128B<1, 1>, + V6_vS32b_nt_nqpred_ai_128B_enc; + + 
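+// Note on the _128B variants defined throughout this file (a descriptive
+// summary, not part of the encodings): they mirror the 64-byte forms but use
+// the double-width register classes (VectorRegs128B, VecPredRegs128B,
+// VecDblRegs128B) and an immediate offset scaled for 128-byte vectors
+// (e.g. s4_7Imm instead of s4_6Imm). They are marked isCodeGenOnly and are
+// selected by the patterns later in this file only under the 128-byte HVX
+// (UseHVXDbl) configuration.
+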
+//===----------------------------------------------------------------------===// +// Vector stores with base + immediate offset - conditional new +//===----------------------------------------------------------------------===// +let addrMode = BaseImmOffset, isPredicated = 1, isNewValue = 1, opNewValue = 3, + isNVStore = 1, Type = TypeCVI_VM_NEW_ST, Itinerary = CVI_VM_NEW_ST in +class T_vstore_new_pred_ai <string baseOp, Operand ImmOp, RegisterClass RC, + bit isPredNot, bit isNT> + : V6_STInst <(outs), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)" + #!if(isNT, ":nt", "")#" = $src4.new">, NewValueRel { + let isPredicatedFalse = isPredNot; + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_new_pred_ai_64B <string baseOp, bit isPredNot = 0, bit isNT = 0> + : T_vstore_new_pred_ai <baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_new_pred_ai_128B <string baseOp, bit isPredNot = 0, bit isNT = 0> + : T_vstore_new_pred_ai <baseOp#"128B", s4_7Imm, VectorRegs128B, + isPredNot, isNT>; + + +def V6_vS32b_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai">, + V6_vS32b_new_pred_ai_enc; +def V6_vS32b_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1>, + V6_vS32b_new_npred_ai_enc; +// 128B +def V6_vS32b_new_pred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai">, + V6_vS32b_new_pred_ai_128B_enc; +def V6_vS32b_new_npred_ai_128B : T_vstore_new_pred_ai_128B <"vS32b_ai", 1>, + V6_vS32b_new_npred_ai_128B_enc; +let isNonTemporal = 1 in { + def V6_vS32b_nt_new_pred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 0, 1>, + V6_vS32b_nt_new_pred_ai_enc; + def V6_vS32b_nt_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1, 1>, + V6_vS32b_nt_new_npred_ai_enc; + // 128B + def V6_vS32b_nt_new_pred_ai_128B : T_vstore_new_pred_ai_128B + <"vS32b_ai", 0, 1>, + V6_vS32b_nt_new_pred_ai_128B_enc; + def V6_vS32b_nt_new_npred_ai_128B : T_vstore_new_pred_ai_128B + <"vS32b_ai", 1, 1>, + V6_vS32b_nt_new_npred_ai_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector loads with immediate offset. 
+//===----------------------------------------------------------------------===// +let addrMode = PostInc, hasNewValue = 1 in +class T_vload_pi<string asmStr, Operand ImmOp, RegisterClass RC> + : V6_LDInst <(outs RC:$dst, IntRegs:$_dst_), + (ins IntRegs:$src1, ImmOp:$src2), asmStr, [], + "$src1 = $_dst_">; + +let accessSize = Vector64Access in +class T_vload_pi_64B <string asmStr> + : T_vload_pi <asmStr, s3_6Imm, VectorRegs>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vload_pi_128B <string asmStr> + : T_vload_pi <asmStr, s3_7Imm, VectorRegs128B>; + +let isCVLoadable = 1 in { + def V6_vL32b_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2)">, + V6_vL32b_pi_enc; + def V6_vL32b_nt_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2):nt">, + V6_vL32b_nt_pi_enc; + // 128B + def V6_vL32b_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2)">, + V6_vL32b_pi_128B_enc; + def V6_vL32b_nt_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2):nt">, + V6_vL32b_nt_pi_128B_enc; +} + +let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in { + def V6_vL32Ub_pi : T_vload_pi_64B <"$dst = vmemu($src1++#$src2)">, + V6_vL32Ub_pi_enc; + // 128B + def V6_vL32Ub_pi_128B : T_vload_pi_128B <"$dst = vmemu($src1++#$src2)">, + V6_vL32Ub_pi_128B_enc; +} + +let isCVLoad = 1, Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD in { + def V6_vL32b_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2)">, + V6_vL32b_cur_pi_enc; + def V6_vL32b_nt_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2):nt">, + V6_vL32b_nt_cur_pi_enc; + // 128B + def V6_vL32b_cur_pi_128B : T_vload_pi_128B + <"$dst.cur = vmem($src1++#$src2)">, + V6_vL32b_cur_pi_128B_enc; + def V6_vL32b_nt_cur_pi_128B : T_vload_pi_128B + <"$dst.cur = vmem($src1++#$src2):nt">, + V6_vL32b_nt_cur_pi_128B_enc; +} + +let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in { + def V6_vL32b_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2)">, + V6_vL32b_tmp_pi_enc; + def V6_vL32b_nt_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2):nt">, + V6_vL32b_nt_tmp_pi_enc; + //128B + def V6_vL32b_tmp_pi_128B : T_vload_pi_128B + <"$dst.tmp = vmem($src1++#$src2)">, + V6_vL32b_tmp_pi_128B_enc; + def V6_vL32b_nt_tmp_pi_128B : T_vload_pi_128B + <"$dst.tmp = vmem($src1++#$src2):nt">, + V6_vL32b_nt_tmp_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector stores with immediate offset. 
+//===----------------------------------------------------------------------===// +let addrMode = PostInc in +class T_vstore_pi <string mnemonic, string baseOp, Operand ImmOp, + RegisterClass RC, bit isNT> + : V6_STInst <(outs IntRegs:$_dst_), + (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), + mnemonic#"($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3", [], + "$src1 = $_dst_">, NewValueRel; + +let accessSize = Vector64Access in +class T_vstore_pi_64B <string mnemonic, string baseOp, bit isNT = 0> + : T_vstore_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_pi_128B <string mnemonic, string baseOp, bit isNT = 0> + : T_vstore_pi <mnemonic, baseOp, s3_7Imm, VectorRegs128B, isNT>; + +let isNVStorable = 1 in { + def V6_vS32b_pi : T_vstore_pi_64B <"vmem", "vS32b_pi">, V6_vS32b_pi_enc; + def V6_vS32b_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi">, + V6_vS32b_pi_128B_enc; +} + +let isNVStorable = 1 , isNonTemporal = 1 in { + def V6_vS32b_nt_pi : T_vstore_pi_64B <"vmem", "vS32b_pi", 1>, + V6_vS32b_nt_pi_enc; + def V6_vS32b_nt_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi", 1>, + V6_vS32b_nt_pi_128B_enc; +} + + +let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { + def V6_vS32Ub_pi : T_vstore_pi_64B <"vmemu", "vS32Ub_pi">, + V6_vS32Ub_pi_enc; + def V6_vS32Ub_pi_128B : T_vstore_pi_128B <"vmemu", "vS32Ub_pi">, + V6_vS32Ub_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment unconditional .new vector stores with immediate offset. +//===----------------------------------------------------------------------===// +let addrMode = PostInc, isNVStore = 1 in +let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1, + opNewValue = 3, isNVStore = 1 in +class T_vstore_new_pi <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT> + : V6_STInst <(outs IntRegs:$_dst_), + (ins IntRegs:$src1, ImmOp:$src2, RC:$src3), + "vmem($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [], + "$src1 = $_dst_">, NewValueRel { + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_new_pi_64B <string baseOp, bit isNT = 0> + : T_vstore_new_pi <baseOp, s3_6Imm, VectorRegs, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_new_pi_128B <string baseOp, bit isNT = 0> + : T_vstore_new_pi <baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>; + + +def V6_vS32b_new_pi : T_vstore_new_pi_64B <"vS32b_pi">, + V6_vS32b_new_pi_enc; +def V6_vS32b_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi">, + V6_vS32b_new_pi_128B_enc; + +let isNonTemporal = 1 in { + def V6_vS32b_nt_new_pi : T_vstore_new_pi_64B <"vS32b_pi", 1>, + V6_vS32b_nt_new_pi_enc; + def V6_vS32b_nt_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi", 1>, + V6_vS32b_nt_new_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment conditional vector stores with immediate offset +//===----------------------------------------------------------------------===// +let isPredicated = 1, addrMode = PostInc in +class T_vstore_pred_pi <string mnemonic, string baseOp, Operand ImmOp, + RegisterClass RC, bit isPredNot, bit isNT> + : V6_STInst<(outs IntRegs:$_dst_), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++#$src3)" + #!if(isNT, ":nt", "")#" = $src4", [], + "$src2 = $_dst_">, NewValueRel { + let isPredicatedFalse = isPredNot; + let BaseOpcode = baseOp; +} + 
+let accessSize = Vector64Access in +class T_vstore_pred_pi_64B <string mnemonic, string baseOp, + bit isPredNot = 0, bit isNT = 0> + : T_vstore_pred_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_pred_pi_128B <string mnemonic, string baseOp, + bit isPredNot = 0, bit isNT = 0> + : T_vstore_pred_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B, + isPredNot, isNT>; + +let isNVStorable = 1 in { + def V6_vS32b_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi">, + V6_vS32b_pred_pi_enc; + def V6_vS32b_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1>, + V6_vS32b_npred_pi_enc; + // 128B + def V6_vS32b_pred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi">, + V6_vS32b_pred_pi_128B_enc; + def V6_vS32b_npred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi", 1>, + V6_vS32b_npred_pi_128B_enc; +} +let isNVStorable = 1, isNonTemporal = 1 in { + def V6_vS32b_nt_pred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 0, 1>, + V6_vS32b_nt_pred_pi_enc; + def V6_vS32b_nt_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1, 1>, + V6_vS32b_nt_npred_pi_enc; + // 128B + def V6_vS32b_nt_pred_pi_128B : T_vstore_pred_pi_128B + <"vmem", "vS32b_pi", 0, 1>, + V6_vS32b_nt_pred_pi_128B_enc; + def V6_vS32b_nt_npred_pi_128B : T_vstore_pred_pi_128B + <"vmem", "vS32b_pi", 1, 1>, + V6_vS32b_nt_npred_pi_128B_enc; +} + +let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in { + def V6_vS32Ub_pred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi">, + V6_vS32Ub_pred_pi_enc; + def V6_vS32Ub_npred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi", 1>, + V6_vS32Ub_npred_pi_enc; + // 128B + def V6_vS32Ub_pred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi">, + V6_vS32Ub_pred_pi_128B_enc; + def V6_vS32Ub_npred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi", 1>, + V6_vS32Ub_npred_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector stores with immediate offset - byte-enabled aligned +//===----------------------------------------------------------------------===// +let addrMode = PostInc in +class T_vstore_qpred_pi <Operand ImmOp, RegisterClass RC, bit isPredNot = 0, + bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)" + #!if(isNT, ":nt", "")#" = $src4", [], + "$src2 = $_dst_">; + +let accessSize = Vector64Access in +class T_vstore_qpred_pi_64B <bit isPredNot = 0, bit isNT = 0> + : T_vstore_qpred_pi <s3_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_qpred_pi_128B <bit isPredNot = 0, bit isNT = 0> + : T_vstore_qpred_pi <s3_7Imm, VectorRegs128B, isPredNot, isNT>; + +def V6_vS32b_qpred_pi : T_vstore_qpred_pi_64B, V6_vS32b_qpred_pi_enc; +def V6_vS32b_nqpred_pi : T_vstore_qpred_pi_64B <1>, V6_vS32b_nqpred_pi_enc; +// 128B +def V6_vS32b_qpred_pi_128B : T_vstore_qpred_pi_128B, + V6_vS32b_qpred_pi_128B_enc; +def V6_vS32b_nqpred_pi_128B : T_vstore_qpred_pi_128B<1>, + V6_vS32b_nqpred_pi_128B_enc; + +let isNonTemporal = 1 in { + def V6_vS32b_nt_qpred_pi : T_vstore_qpred_pi_64B <0, 1>, + V6_vS32b_nt_qpred_pi_enc; + def V6_vS32b_nt_nqpred_pi : T_vstore_qpred_pi_64B <1, 1>, + V6_vS32b_nt_nqpred_pi_enc; + // 128B + def V6_vS32b_nt_qpred_pi_128B : T_vstore_qpred_pi_128B<0, 1>, + V6_vS32b_nt_qpred_pi_128B_enc; + def V6_vS32b_nt_nqpred_pi_128B : T_vstore_qpred_pi_128B<1, 1>, + 
V6_vS32b_nt_nqpred_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment conditional .new vector stores with immediate offset +//===----------------------------------------------------------------------===// +let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1, + isNewValue = 1, opNewValue = 4, addrMode = PostInc, isNVStore = 1 in +class T_vstore_new_pred_pi <string baseOp, Operand ImmOp, RegisterClass RC, + bit isPredNot, bit isNT> + : V6_STInst <(outs IntRegs:$_dst_), + (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4), + "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)" + #!if(isNT, ":nt", "")#" = $src4.new", [], + "$src2 = $_dst_"> , NewValueRel { + let isPredicatedFalse = isPredNot; + let BaseOpcode = baseOp; +} + +let accessSize = Vector64Access in +class T_vstore_new_pred_pi_64B <string baseOp, bit isPredNot = 0, bit isNT = 0> + : T_vstore_new_pred_pi <baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>; + +let isCodeGenOnly = 1, accessSize = Vector128Access in +class T_vstore_new_pred_pi_128B <string baseOp, bit isPredNot = 0, bit isNT = 0> + : T_vstore_new_pred_pi <baseOp#"128B", s3_7Imm, VectorRegs128B, + isPredNot, isNT>; + +def V6_vS32b_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi">, + V6_vS32b_new_pred_pi_enc; +def V6_vS32b_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1>, + V6_vS32b_new_npred_pi_enc; +// 128B +def V6_vS32b_new_pred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi">, + V6_vS32b_new_pred_pi_128B_enc; +def V6_vS32b_new_npred_pi_128B : T_vstore_new_pred_pi_128B <"vS32b_pi", 1>, + V6_vS32b_new_npred_pi_128B_enc; +let isNonTemporal = 1 in { + def V6_vS32b_nt_new_pred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 0, 1>, + V6_vS32b_nt_new_pred_pi_enc; + def V6_vS32b_nt_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1, 1>, + V6_vS32b_nt_new_npred_pi_enc; + // 128B + def V6_vS32b_nt_new_pred_pi_128B : T_vstore_new_pred_pi_128B + <"vS32b_pi", 0, 1>, + V6_vS32b_nt_new_pred_pi_128B_enc; + def V6_vS32b_nt_new_npred_pi_128B : T_vstore_new_pred_pi_128B + <"vS32b_pi", 1, 1>, + V6_vS32b_nt_new_npred_pi_128B_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector loads with register offset +//===----------------------------------------------------------------------===// +let hasNewValue = 1 in +class T_vload_ppu<string asmStr> + : V6_LDInst <(outs VectorRegs:$dst, IntRegs:$_dst_), + (ins IntRegs:$src1, ModRegs:$src2), asmStr, [], + "$src1 = $_dst_">, NewValueRel; + +let isCVLoadable = 1 in { + def V6_vL32b_ppu : T_vload_ppu <"$dst = vmem($src1++$src2)">, + V6_vL32b_ppu_enc; + def V6_vL32b_nt_ppu : T_vload_ppu <"$dst = vmem($src1++$src2):nt">, + V6_vL32b_nt_ppu_enc; +} + +let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in +def V6_vL32Ub_ppu : T_vload_ppu <"$dst = vmemu($src1++$src2)">, + V6_vL32Ub_ppu_enc; + +let isCVLoad = 1, Itinerary = CVI_VM_CUR_LD, Type = TypeCVI_VM_CUR_LD in { + def V6_vL32b_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2)">, + V6_vL32b_cur_ppu_enc; + def V6_vL32b_nt_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2):nt">, + V6_vL32b_nt_cur_ppu_enc; +} + +let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in { + def V6_vL32b_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2)">, + V6_vL32b_tmp_ppu_enc; + def V6_vL32b_nt_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2):nt">, + V6_vL32b_nt_tmp_ppu_enc; +} + 
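+// In the "ppu" (register post-increment) forms, the increment amount comes
+// from a modifier register (ModRegs, i.e. M0/M1) rather than an immediate,
+// and the base register is tied to an extra output operand (e.g.
+// "$src1 = $_dst_") so the post-incremented address is written back to the
+// base register.
+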
+//===----------------------------------------------------------------------===// +// Post increment vector stores with register offset +//===----------------------------------------------------------------------===// +class T_vstore_ppu <string mnemonic, bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3), + mnemonic#"($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3", [], + "$src1 = $_dst_">, NewValueRel; + +let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in { + def V6_vS32b_ppu : T_vstore_ppu <"vmem">, + V6_vS32b_ppu_enc; + let isNonTemporal = 1, BaseOpcode = "vS32b_ppu" in + def V6_vS32b_nt_ppu : T_vstore_ppu <"vmem", 1>, + V6_vS32b_nt_ppu_enc; +} + +let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in +def V6_vS32Ub_ppu : T_vstore_ppu <"vmemu">, V6_vS32Ub_ppu_enc; + +//===----------------------------------------------------------------------===// +// Post increment .new vector stores with register offset +//===----------------------------------------------------------------------===// +let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1, + opNewValue = 3, isNVStore = 1 in +class T_vstore_new_ppu <bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3), + "vmem($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [], + "$src1 = $_dst_">, NewValueRel; + +let BaseOpcode = "vS32b_ppu" in +def V6_vS32b_new_ppu : T_vstore_new_ppu, V6_vS32b_new_ppu_enc; + +let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in +def V6_vS32b_nt_new_ppu : T_vstore_new_ppu<1>, V6_vS32b_nt_new_ppu_enc; + +//===----------------------------------------------------------------------===// +// Post increment conditional .new vector stores with register offset +//===----------------------------------------------------------------------===// +let isPredicated = 1 in +class T_vstore_pred_ppu <string mnemonic, bit isPredNot = 0, bit isNT = 0> + : V6_STInst<(outs IntRegs:$_dst_), + (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++$src3)" + #!if(isNT, ":nt", "")#" = $src4", [], + "$src2 = $_dst_">, NewValueRel { + let isPredicatedFalse = isPredNot; +} + +let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in { + def V6_vS32b_pred_ppu : T_vstore_pred_ppu<"vmem">, V6_vS32b_pred_ppu_enc; + def V6_vS32b_npred_ppu: T_vstore_pred_ppu<"vmem", 1>, V6_vS32b_npred_ppu_enc; +} + +let isNVStorable = 1, BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in { + def V6_vS32b_nt_pred_ppu : T_vstore_pred_ppu <"vmem", 0, 1>, + V6_vS32b_nt_pred_ppu_enc; + def V6_vS32b_nt_npred_ppu : T_vstore_pred_ppu <"vmem", 1, 1>, + V6_vS32b_nt_npred_ppu_enc; +} + +let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU, + Type = TypeCVI_VM_STU in { + def V6_vS32Ub_pred_ppu : T_vstore_pred_ppu <"vmemu">, + V6_vS32Ub_pred_ppu_enc; + def V6_vS32Ub_npred_ppu : T_vstore_pred_ppu <"vmemu", 1>, + V6_vS32Ub_npred_ppu_enc; +} + +//===----------------------------------------------------------------------===// +// Post increment vector stores with register offset - byte-enabled aligned +//===----------------------------------------------------------------------===// +class T_vstore_qpred_ppu <bit isPredNot = 0, bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins VecPredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4), + "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)" + #!if(isNT, ":nt", "")#" = $src4", [], + "$src2 = $_dst_">, 
NewValueRel; + +def V6_vS32b_qpred_ppu : T_vstore_qpred_ppu, V6_vS32b_qpred_ppu_enc; +def V6_vS32b_nqpred_ppu : T_vstore_qpred_ppu<1>, V6_vS32b_nqpred_ppu_enc; +def V6_vS32b_nt_qpred_ppu : T_vstore_qpred_ppu<0, 1>, + V6_vS32b_nt_qpred_ppu_enc; +def V6_vS32b_nt_nqpred_ppu : T_vstore_qpred_ppu<1, 1>, + V6_vS32b_nt_nqpred_ppu_enc; + +//===----------------------------------------------------------------------===// +// Post increment conditional .new vector stores with register offset +//===----------------------------------------------------------------------===// +let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1, + isNewValue = 1, opNewValue = 4, isNVStore = 1 in +class T_vstore_new_pred_ppu <bit isPredNot = 0, bit isNT = 0> + : V6_STInst <(outs IntRegs:$_dst_), + (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4), + "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)" + #!if(isNT, ":nt", "")#" = $src4.new", [], + "$src2 = $_dst_">, NewValueRel { + let isPredicatedFalse = isPredNot; +} + +let BaseOpcode = "vS32b_ppu" in { + def V6_vS32b_new_pred_ppu : T_vstore_new_pred_ppu, + V6_vS32b_new_pred_ppu_enc; + def V6_vS32b_new_npred_ppu : T_vstore_new_pred_ppu<1>, + V6_vS32b_new_npred_ppu_enc; +} + +let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in { +def V6_vS32b_nt_new_pred_ppu : T_vstore_new_pred_ppu<0, 1>, + V6_vS32b_nt_new_pred_ppu_enc; +def V6_vS32b_nt_new_npred_ppu : T_vstore_new_pred_ppu<1, 1>, + V6_vS32b_nt_new_npred_ppu_enc; +} + +let isPseudo = 1, validSubTargets = HasV60SubT in +class STrivv_template<string mnemonic, Operand ImmOp, RegisterClass RC>: + VSTInst<(outs), (ins IntRegs:$addr, ImmOp:$off, RC:$src), + #mnemonic#"($addr+#$off) = $src", []>; + +def STrivv_indexed: STrivv_template<"vvmem", s4_6Imm, VecDblRegs>, + Requires<[HasV60T, UseHVXSgl]>; +def STrivv_indexed_128B: STrivv_template<"vvmem", s4_7Imm, VecDblRegs128B>, + Requires<[HasV60T, UseHVXDbl]>; + +multiclass STrivv_pats <ValueType VTSgl, ValueType VTDbl> { + def : Pat<(store (VTSgl VecDblRegs:$src1), IntRegs:$addr), + (STrivv_indexed IntRegs:$addr, #0, (VTSgl VecDblRegs:$src1))>, + Requires<[UseHVXSgl]>; + + def : Pat<(store (VTDbl VecDblRegs128B:$src1), IntRegs:$addr), + (STrivv_indexed_128B IntRegs:$addr, #0, + (VTDbl VecDblRegs128B:$src1))>, + Requires<[UseHVXDbl]>; +} + +defm : STrivv_pats <v128i8, v256i8>; +defm : STrivv_pats <v64i16, v128i16>; +defm : STrivv_pats <v32i32, v64i32>; +defm : STrivv_pats <v16i64, v32i64>; + + +multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { + // Aligned stores + def : Pat<(store (VTSgl VectorRegs:$src1), IntRegs:$addr), + (V6_vS32b_ai IntRegs:$addr, #0, (VTSgl VectorRegs:$src1))>, + Requires<[UseHVXSgl]>; + + // 128B Aligned stores + def : Pat<(store (VTDbl VectorRegs128B:$src1), IntRegs:$addr), + (V6_vS32b_ai_128B IntRegs:$addr, #0, (VTDbl VectorRegs128B:$src1))>, + Requires<[UseHVXDbl]>; + + // Fold Add R+IFF into vector store. + let AddedComplexity = 10 in + def : Pat<(store (VTSgl VectorRegs:$src1), + (add IntRegs:$src2, s4_6ImmPred:$offset)), + (V6_vS32b_ai IntRegs:$src2, s4_6ImmPred:$offset, + (VTSgl VectorRegs:$src1))>, + Requires<[UseHVXSgl]>; + + // Fold Add R+IFF into vector store 128B. 
+ let AddedComplexity = 10 in + def : Pat<(store (VTDbl VectorRegs128B:$src1), + (add IntRegs:$src2, s4_7ImmPred:$offset)), + (V6_vS32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset, + (VTDbl VectorRegs128B:$src1))>, + Requires<[UseHVXDbl]>; +} + +defm : vS32b_ai_pats <v64i8, v128i8>; +defm : vS32b_ai_pats <v32i16, v64i16>; +defm : vS32b_ai_pats <v16i32, v32i32>; +defm : vS32b_ai_pats <v8i64, v16i64>; + +let isPseudo = 1, validSubTargets = HasV60SubT in +class LDrivv_template<string mnemonic, Operand ImmOp, RegisterClass RC> + : V6_LDInst <(outs RC:$dst), (ins IntRegs:$addr, ImmOp:$off), + "$dst="#mnemonic#"($addr+#$off)", + []>, + Requires<[HasV60T,UseHVXSgl]>; + +def LDrivv_indexed: LDrivv_template<"vvmem", s4_6Imm, VecDblRegs>; +def LDrivv_indexed_128B: LDrivv_template<"vvmem", s4_7Imm, VecDblRegs128B>; + +multiclass LDrivv_pats <ValueType VTSgl, ValueType VTDbl> { + def : Pat < (VTSgl (load IntRegs:$addr)), + (LDrivv_indexed IntRegs:$addr, #0) >, + Requires<[UseHVXSgl]>; + + def : Pat < (VTDbl (load IntRegs:$addr)), + (LDrivv_indexed_128B IntRegs:$addr, #0) >, + Requires<[UseHVXDbl]>; +} + +defm : LDrivv_pats <v128i8, v256i8>; +defm : LDrivv_pats <v64i16, v128i16>; +defm : LDrivv_pats <v32i32, v64i32>; +defm : LDrivv_pats <v16i64, v32i64>; + +multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { + // Aligned loads + def : Pat < (VTSgl (load IntRegs:$addr)), + (V6_vL32b_ai IntRegs:$addr, #0) >, + Requires<[UseHVXSgl]>; + + // 128B Load + def : Pat < (VTDbl (load IntRegs:$addr)), + (V6_vL32b_ai_128B IntRegs:$addr, #0) >, + Requires<[UseHVXDbl]>; + + // Fold Add R+IFF into vector load. + let AddedComplexity = 10 in + def : Pat<(VTDbl (load (add IntRegs:$src2, s4_7ImmPred:$offset))), + (V6_vL32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>, + Requires<[UseHVXDbl]>; + + let AddedComplexity = 10 in + def : Pat<(VTSgl (load (add IntRegs:$src2, s4_6ImmPred:$offset))), + (V6_vL32b_ai IntRegs:$src2, s4_6ImmPred:$offset)>, + Requires<[UseHVXSgl]>; +} + +defm : vL32b_ai_pats <v64i8, v128i8>; +defm : vL32b_ai_pats <v32i16, v64i16>; +defm : vL32b_ai_pats <v16i32, v32i32>; +defm : vL32b_ai_pats <v8i64, v16i64>; + +// Store vector predicate pseudo. +let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13, + isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { +def STriq_pred_V6 : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VecPredRegs:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; + +def STriq_pred_vec_V6 : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VectorRegs:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; + +def STriq_pred_V6_128B : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VecPredRegs128B:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; + +def STriq_pred_vec_V6_128B : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VectorRegs128B:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +// Load vector predicate pseudo. 
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13, + opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { +def LDriq_pred_V6 : LDInst<(outs VecPredRegs:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def LDriq_pred_vec_V6 : LDInst<(outs VectorRegs:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def LDriq_pred_V6_128B : LDInst<(outs VecPredRegs128B:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +def LDriq_pred_vec_V6_128B : LDInst<(outs VectorRegs128B:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +// Store vector pseudo. +let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13, + isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { +def STriv_pseudo_V6 : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VectorRegs:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def STriv_pseudo_V6_128B : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VectorRegs128B:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13, + isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { +def STrivv_pseudo_V6 : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VecDblRegs:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def STrivv_pseudo_V6_128B : STInst<(outs), + (ins IntRegs:$base, s32Imm:$offset, VecDblRegs128B:$src1), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +// Load vector pseudo. 
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13, + opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { +def LDriv_pseudo_V6 : LDInst<(outs VectorRegs:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def LDriv_pseudo_V6_128B : LDInst<(outs VectorRegs128B:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13, + opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { +def LDrivv_pseudo_V6 : LDInst<(outs VecDblRegs:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def LDrivv_pseudo_V6_128B : LDInst<(outs VecDblRegs128B:$dst), + (ins IntRegs:$base, s32Imm:$offset), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXDbl]>; +} + +class VSELInst<dag outs, dag ins, string asmstr, list<dag> pattern = [], + string cstr = "", InstrItinClass itin = CVI_VA_DV, + IType type = TypeCVI_VA_DV> + : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>; + +let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in { +def VSelectPseudo_V6 : VSELInst<(outs VectorRegs:$dst), + (ins PredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +def VSelectDblPseudo_V6 : VSELInst<(outs VecDblRegs:$dst), + (ins PredRegs:$src1, VecDblRegs:$src2, VecDblRegs:$src3), + ".error \"should not emit\" ", + []>, + Requires<[HasV60T,UseHVXSgl]>; +} + +def : Pat <(v16i32 (selectcc (i32 IntRegs:$lhs), (i32 IntRegs:$rhs), + (v16i32 VectorRegs:$tval), + (v16i32 VectorRegs:$fval), SETEQ)), + (v16i32 (VSelectPseudo_V6 (i32 (C2_cmpeq (i32 IntRegs:$lhs), + (i32 IntRegs:$rhs))), + (v16i32 VectorRegs:$tval), + (v16i32 VectorRegs:$fval)))>; + + +let hasNewValue = 1 in +class T_vmpy <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2), + asmString >; + +multiclass T_vmpy <string asmString, RegisterClass RCout, + RegisterClass RCin> { + def NAME : T_vmpy <asmString, RCout, RCin>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_vmpy <asmString, !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +multiclass T_vmpy_VV <string asmString>: + T_vmpy <asmString, VectorRegs, VectorRegs>; + +multiclass T_vmpy_WW <string asmString>: + T_vmpy <asmString, VecDblRegs, VecDblRegs>; + +multiclass T_vmpy_VW <string asmString>: + T_vmpy <asmString, VectorRegs, VecDblRegs>; + +multiclass T_vmpy_WV <string asmString>: + T_vmpy <asmString, VecDblRegs, VectorRegs>; + +defm V6_vtmpyb :T_vmpy_WW<"$dst.h = vtmpy($src1.b,$src2.b)">, V6_vtmpyb_enc; +defm V6_vtmpybus :T_vmpy_WW<"$dst.h = vtmpy($src1.ub,$src2.b)">, V6_vtmpybus_enc; +defm V6_vdsaduh :T_vmpy_WW<"$dst.uw = vdsad($src1.uh,$src2.uh)">, V6_vdsaduh_enc; +defm V6_vmpybus :T_vmpy_WV<"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybus_enc; +defm V6_vmpabus :T_vmpy_WW<"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabus_enc; +defm V6_vmpahb :T_vmpy_WW<"$dst.w = vmpa($src1.h,$src2.b)">, V6_vmpahb_enc; +defm V6_vmpyh :T_vmpy_WV<"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyh_enc; +defm V6_vmpyuh :T_vmpy_WV<"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuh_enc; +defm V6_vmpyiwh :T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_enc; +defm V6_vtmpyhb :T_vmpy_WW<"$dst.w = 
vtmpy($src1.h,$src2.b)">, V6_vtmpyhb_enc; +defm V6_vmpyub :T_vmpy_WV<"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyub_enc; + +let Itinerary = CVI_VX_LONG, Type = TypeCVI_VX in +defm V6_vmpyihb :T_vmpy_VV<"$dst.h = vmpyi($src1.h,$src2.b)">, V6_vmpyihb_enc; + +defm V6_vdmpybus_dv : + T_vmpy_WW <"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_dv_enc; +defm V6_vdmpyhsusat : + T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.uh):sat">, V6_vdmpyhsusat_enc; +defm V6_vdmpyhsuisat : + T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.uh,#1):sat">, V6_vdmpyhsuisat_enc; +defm V6_vdmpyhsat : + T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhsat_enc; +defm V6_vdmpyhisat : + T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhisat_enc; +defm V6_vdmpyhb_dv : + T_vmpy_WW <"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_dv_enc; +defm V6_vmpyhss : + T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:sat">, V6_vmpyhss_enc; +defm V6_vmpyhsrs : + T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhsrs_enc; + +let Itinerary = CVI_VP, Type = TypeCVI_VP in +defm V6_vror : T_vmpy_VV <"$dst = vror($src1,$src2)">, V6_vror_enc; + +let Itinerary = CVI_VX, Type = TypeCVI_VX in { +defm V6_vdmpyhb : T_vmpy_VV<"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_enc; +defm V6_vrmpybus : T_vmpy_VV<"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybus_enc; +defm V6_vdmpybus : T_vmpy_VV<"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_enc; +defm V6_vmpyiwb : T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.b)">, V6_vmpyiwb_enc; +defm V6_vrmpyub : T_vmpy_VV<"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyub_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vasrw : T_vmpy_VV <"$dst.w = vasr($src1.w,$src2)">, V6_vasrw_enc; +defm V6_vasrh : T_vmpy_VV <"$dst.h = vasr($src1.h,$src2)">, V6_vasrh_enc; +defm V6_vaslw : T_vmpy_VV <"$dst.w = vasl($src1.w,$src2)">, V6_vaslw_enc; +defm V6_vaslh : T_vmpy_VV <"$dst.h = vasl($src1.h,$src2)">, V6_vaslh_enc; +defm V6_vlsrw : T_vmpy_VV <"$dst.uw = vlsr($src1.uw,$src2)">, V6_vlsrw_enc; +defm V6_vlsrh : T_vmpy_VV <"$dst.uh = vlsr($src1.uh,$src2)">, V6_vlsrh_enc; +} + +let hasNewValue = 1 in +class T_HVX_alu <string asmString, InstrItinClass itin, + RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2), + asmString >{ + let Itinerary = itin; + let Type = !cast<IType>("Type"#itin); +} + +multiclass T_HVX_alu <string asmString, RegisterClass RCout, + RegisterClass RCin, InstrItinClass itin> { + def NAME : T_HVX_alu <asmString, itin, RCout, RCin>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_alu <asmString, itin, + !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +multiclass T_HVX_alu_VV <string asmString>: + T_HVX_alu <asmString, VectorRegs, VectorRegs, CVI_VA>; + +multiclass T_HVX_alu_WW <string asmString>: + T_HVX_alu <asmString, VecDblRegs, VecDblRegs, CVI_VA_DV>; + +multiclass T_HVX_alu_WV <string asmString>: + T_HVX_alu <asmString, VecDblRegs, VectorRegs, CVI_VX_DV>; + + +let Itinerary = CVI_VX, Type = TypeCVI_VX in { +defm V6_vrmpyubv : + T_HVX_alu_VV <"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyubv_enc; +defm V6_vrmpybv : + T_HVX_alu_VV <"$dst.w = vrmpy($src1.b,$src2.b)">, V6_vrmpybv_enc; +defm V6_vrmpybusv : + T_HVX_alu_VV <"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybusv_enc; +defm V6_vabsdiffub : + T_HVX_alu_VV <"$dst.ub = vabsdiff($src1.ub,$src2.ub)">, V6_vabsdiffub_enc; +defm V6_vabsdiffh : + T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.h,$src2.h)">, V6_vabsdiffh_enc; +defm V6_vabsdiffuh 
: + T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.uh,$src2.uh)">, V6_vabsdiffuh_enc; +defm V6_vabsdiffw : + T_HVX_alu_VV <"$dst.uw = vabsdiff($src1.w,$src2.w)">, V6_vabsdiffw_enc; +} + +let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in { +defm V6_vdmpyhvsat : + T_HVX_alu_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhvsat_enc; +defm V6_vmpyhvsrs : + T_HVX_alu_VV<"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhvsrs_enc; +defm V6_vmpyih : + T_HVX_alu_VV <"$dst.h = vmpyi($src1.h,$src2.h)">, V6_vmpyih_enc; +} + +defm V6_vand : + T_HVX_alu_VV <"$dst = vand($src1,$src2)">, V6_vand_enc; +defm V6_vor : + T_HVX_alu_VV <"$dst = vor($src1,$src2)">, V6_vor_enc; +defm V6_vxor : + T_HVX_alu_VV <"$dst = vxor($src1,$src2)">, V6_vxor_enc; +defm V6_vaddw : + T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_enc; +defm V6_vaddubsat : + T_HVX_alu_VV <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_enc; +defm V6_vadduhsat : + T_HVX_alu_VV <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_enc; +defm V6_vaddhsat : + T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_enc; +defm V6_vaddwsat : + T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_enc; +defm V6_vsubb : + T_HVX_alu_VV <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_enc; +defm V6_vsubh : + T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_enc; +defm V6_vsubw : + T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_enc; +defm V6_vsububsat : + T_HVX_alu_VV <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_enc; +defm V6_vsubuhsat : + T_HVX_alu_VV <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_enc; +defm V6_vsubhsat : + T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_enc; +defm V6_vsubwsat : + T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_enc; +defm V6_vavgub : + T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub)">, V6_vavgub_enc; +defm V6_vavguh : + T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh)">, V6_vavguh_enc; +defm V6_vavgh : + T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h)">, V6_vavgh_enc; +defm V6_vavgw : + T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w)">, V6_vavgw_enc; +defm V6_vnavgub : + T_HVX_alu_VV <"$dst.b = vnavg($src1.ub,$src2.ub)">, V6_vnavgub_enc; +defm V6_vnavgh : + T_HVX_alu_VV <"$dst.h = vnavg($src1.h,$src2.h)">, V6_vnavgh_enc; +defm V6_vnavgw : + T_HVX_alu_VV <"$dst.w = vnavg($src1.w,$src2.w)">, V6_vnavgw_enc; +defm V6_vavgubrnd : + T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub):rnd">, V6_vavgubrnd_enc; +defm V6_vavguhrnd : + T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh):rnd">, V6_vavguhrnd_enc; +defm V6_vavghrnd : + T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h):rnd">, V6_vavghrnd_enc; +defm V6_vavgwrnd : + T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w):rnd">, V6_vavgwrnd_enc; + +defm V6_vmpybv : + T_HVX_alu_WV <"$dst.h = vmpy($src1.b,$src2.b)">, V6_vmpybv_enc; +defm V6_vmpyubv : + T_HVX_alu_WV <"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyubv_enc; +defm V6_vmpybusv : + T_HVX_alu_WV <"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybusv_enc; +defm V6_vmpyhv : + T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyhv_enc; +defm V6_vmpyuhv : + T_HVX_alu_WV <"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuhv_enc; +defm V6_vmpyhus : + T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.uh)">, V6_vmpyhus_enc; +defm V6_vaddubh : + T_HVX_alu_WV <"$dst.h = vadd($src1.ub,$src2.ub)">, V6_vaddubh_enc; +defm V6_vadduhw : + T_HVX_alu_WV <"$dst.w = vadd($src1.uh,$src2.uh)">, V6_vadduhw_enc; +defm V6_vaddhw : + T_HVX_alu_WV <"$dst.w = 
vadd($src1.h,$src2.h)">, V6_vaddhw_enc; +defm V6_vsububh : + T_HVX_alu_WV <"$dst.h = vsub($src1.ub,$src2.ub)">, V6_vsububh_enc; +defm V6_vsubuhw : + T_HVX_alu_WV <"$dst.w = vsub($src1.uh,$src2.uh)">, V6_vsubuhw_enc; +defm V6_vsubhw : + T_HVX_alu_WV <"$dst.w = vsub($src1.h,$src2.h)">, V6_vsubhw_enc; + +defm V6_vaddb_dv : + T_HVX_alu_WW <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_dv_enc; +defm V6_vaddh_dv : + T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_dv_enc; +defm V6_vaddw_dv : + T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_dv_enc; +defm V6_vaddubsat_dv : + T_HVX_alu_WW <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_dv_enc; +defm V6_vadduhsat_dv : + T_HVX_alu_WW <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_dv_enc; +defm V6_vaddhsat_dv : + T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_dv_enc; +defm V6_vaddwsat_dv : + T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_dv_enc; +defm V6_vsubb_dv : + T_HVX_alu_WW <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_dv_enc; +defm V6_vsubh_dv : + T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_dv_enc; +defm V6_vsubw_dv : + T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_dv_enc; +defm V6_vsububsat_dv : + T_HVX_alu_WW <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_dv_enc; +defm V6_vsubuhsat_dv : + T_HVX_alu_WW <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_dv_enc; +defm V6_vsubhsat_dv : + T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_dv_enc; +defm V6_vsubwsat_dv : + T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_dv_enc; + +let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV in { +defm V6_vmpabusv : + T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabusv_enc; +defm V6_vmpabuuv : + T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.ub)">, V6_vmpabuuv_enc; +} + +let isAccumulator = 1, hasNewValue = 1 in +class T_HVX_vmpyacc <string asmString, InstrItinClass itin, RegisterClass RCout, + RegisterClass RCin1, RegisterClass RCin2> + : CVI_VA_Resource1 <(outs RCout:$dst), + (ins RCout:$_src_, RCin1:$src1, RCin2:$src2), asmString, + [], "$dst = $_src_" > { + let Itinerary = itin; + let Type = !cast<IType>("Type"#itin); +} + +multiclass T_HVX_vmpyacc_both <string asmString, RegisterClass RCout, + RegisterClass RCin1, RegisterClass RCin2, InstrItinClass itin > { + def NAME : T_HVX_vmpyacc <asmString, itin, RCout, RCin1, RCin2>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vmpyacc <asmString, itin, + !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin1#"128B"), + !cast<RegisterClass>(RCin2# + !if(!eq (!cast<string>(RCin2), "IntRegs"), "", "128B"))>; +} + +multiclass T_HVX_vmpyacc_VVR <string asmString>: + T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, IntRegs, CVI_VX>; + +multiclass T_HVX_vmpyacc_VWR <string asmString>: + T_HVX_vmpyacc_both <asmString, VectorRegs, VecDblRegs, IntRegs, CVI_VX_DV>; + +multiclass T_HVX_vmpyacc_WVR <string asmString>: + T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, IntRegs, CVI_VX_DV>; + +multiclass T_HVX_vmpyacc_WWR <string asmString>: + T_HVX_vmpyacc_both <asmString, VecDblRegs, VecDblRegs, IntRegs, CVI_VX_DV>; + +multiclass T_HVX_vmpyacc_VVV <string asmString>: + T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, VectorRegs, CVI_VX_DV>; + +multiclass T_HVX_vmpyacc_WVV <string asmString>: + T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, VectorRegs, CVI_VX_DV>; + + +defm V6_vtmpyb_acc : + T_HVX_vmpyacc_WWR <"$dst.h += 
vtmpy($src1.b,$src2.b)">, + V6_vtmpyb_acc_enc; +defm V6_vtmpybus_acc : + T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.ub,$src2.b)">, + V6_vtmpybus_acc_enc; +defm V6_vtmpyhb_acc : + T_HVX_vmpyacc_WWR <"$dst.w += vtmpy($src1.h,$src2.b)">, + V6_vtmpyhb_acc_enc; +defm V6_vdmpyhb_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.b)">, + V6_vdmpyhb_acc_enc; +defm V6_vrmpyub_acc : + T_HVX_vmpyacc_VVR <"$dst.uw += vrmpy($src1.ub,$src2.ub)">, + V6_vrmpyub_acc_enc; +defm V6_vrmpybus_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vrmpy($src1.ub,$src2.b)">, + V6_vrmpybus_acc_enc; +defm V6_vdmpybus_acc : + T_HVX_vmpyacc_VVR <"$dst.h += vdmpy($src1.ub,$src2.b)">, + V6_vdmpybus_acc_enc; +defm V6_vdmpybus_dv_acc : + T_HVX_vmpyacc_WWR <"$dst.h += vdmpy($src1.ub,$src2.b)">, + V6_vdmpybus_dv_acc_enc; +defm V6_vdmpyhsuisat_acc : + T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.uh,#1):sat">, + V6_vdmpyhsuisat_acc_enc; +defm V6_vdmpyhisat_acc : + T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.h):sat">, + V6_vdmpyhisat_acc_enc; +defm V6_vdmpyhb_dv_acc : + T_HVX_vmpyacc_WWR <"$dst.w += vdmpy($src1.h,$src2.b)">, + V6_vdmpyhb_dv_acc_enc; +defm V6_vmpybus_acc : + T_HVX_vmpyacc_WVR <"$dst.h += vmpy($src1.ub,$src2.b)">, + V6_vmpybus_acc_enc; +defm V6_vmpabus_acc : + T_HVX_vmpyacc_WWR <"$dst.h += vmpa($src1.ub,$src2.b)">, + V6_vmpabus_acc_enc; +defm V6_vmpahb_acc : + T_HVX_vmpyacc_WWR <"$dst.w += vmpa($src1.h,$src2.b)">, + V6_vmpahb_acc_enc; +defm V6_vmpyhsat_acc : + T_HVX_vmpyacc_WVR <"$dst.w += vmpy($src1.h,$src2.h):sat">, + V6_vmpyhsat_acc_enc; +defm V6_vmpyuh_acc : + T_HVX_vmpyacc_WVR <"$dst.uw += vmpy($src1.uh,$src2.uh)">, + V6_vmpyuh_acc_enc; +defm V6_vmpyiwb_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vmpyi($src1.w,$src2.b)">, + V6_vmpyiwb_acc_enc; +defm V6_vdsaduh_acc : + T_HVX_vmpyacc_WWR <"$dst.uw += vdsad($src1.uh,$src2.uh)">, + V6_vdsaduh_acc_enc; +defm V6_vmpyihb_acc : + T_HVX_vmpyacc_VVR <"$dst.h += vmpyi($src1.h,$src2.b)">, + V6_vmpyihb_acc_enc; +defm V6_vmpyub_acc : + T_HVX_vmpyacc_WVR <"$dst.uh += vmpy($src1.ub,$src2.ub)">, + V6_vmpyub_acc_enc; + +let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in { +defm V6_vdmpyhsusat_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.uh):sat">, + V6_vdmpyhsusat_acc_enc; +defm V6_vdmpyhsat_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.h):sat">, + V6_vdmpyhsat_acc_enc; +defm V6_vmpyiwh_acc : T_HVX_vmpyacc_VVR + <"$dst.w += vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_acc_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vaslw_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vasl($src1.w,$src2)">, V6_vaslw_acc_enc; +defm V6_vasrw_acc : + T_HVX_vmpyacc_VVR <"$dst.w += vasr($src1.w,$src2)">, V6_vasrw_acc_enc; +} + +defm V6_vdmpyhvsat_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vdmpy($src1.h,$src2.h):sat">, + V6_vdmpyhvsat_acc_enc; +defm V6_vmpybusv_acc : + T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.ub,$src2.b)">, + V6_vmpybusv_acc_enc; +defm V6_vmpybv_acc : + T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.b,$src2.b)">, V6_vmpybv_acc_enc; +defm V6_vmpyhus_acc : + T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.uh)">, V6_vmpyhus_acc_enc; +defm V6_vmpyhv_acc : + T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.h)">, V6_vmpyhv_acc_enc; +defm V6_vmpyiewh_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.h)">, + V6_vmpyiewh_acc_enc; +defm V6_vmpyiewuh_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.uh)">, + V6_vmpyiewuh_acc_enc; +defm V6_vmpyih_acc : + T_HVX_vmpyacc_VVV <"$dst.h += vmpyi($src1.h,$src2.h)">, V6_vmpyih_acc_enc; +defm V6_vmpyowh_rnd_sacc : + 
T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:rnd:sat:shift">, + V6_vmpyowh_rnd_sacc_enc; +defm V6_vmpyowh_sacc : + T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:sat:shift">, + V6_vmpyowh_sacc_enc; +defm V6_vmpyubv_acc : + T_HVX_vmpyacc_WVV <"$dst.uh += vmpy($src1.ub,$src2.ub)">, + V6_vmpyubv_acc_enc; +defm V6_vmpyuhv_acc : + T_HVX_vmpyacc_WVV <"$dst.uw += vmpy($src1.uh,$src2.uh)">, + V6_vmpyuhv_acc_enc; +defm V6_vrmpybusv_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.ub,$src2.b)">, + V6_vrmpybusv_acc_enc; +defm V6_vrmpybv_acc : + T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.b,$src2.b)">, V6_vrmpybv_acc_enc; +defm V6_vrmpyubv_acc : + T_HVX_vmpyacc_VVV <"$dst.uw += vrmpy($src1.ub,$src2.ub)">, + V6_vrmpyubv_acc_enc; + + +class T_HVX_vcmp <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 <(outs RCout:$dst), + (ins RCout:$_src_, RCin:$src1, RCin:$src2), asmString, + [], "$dst = $_src_" > { + let Itinerary = CVI_VA; + let Type = TypeCVI_VA; +} + +multiclass T_HVX_vcmp <string asmString> { + def NAME : T_HVX_vcmp <asmString, VecPredRegs, VectorRegs>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vcmp <asmString, VecPredRegs128B, VectorRegs128B>; +} + +defm V6_veqb_and : + T_HVX_vcmp <"$dst &= vcmp.eq($src1.b,$src2.b)">, V6_veqb_and_enc; +defm V6_veqh_and : + T_HVX_vcmp <"$dst &= vcmp.eq($src1.h,$src2.h)">, V6_veqh_and_enc; +defm V6_veqw_and : + T_HVX_vcmp <"$dst &= vcmp.eq($src1.w,$src2.w)">, V6_veqw_and_enc; +defm V6_vgtb_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_and_enc; +defm V6_vgth_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.h,$src2.h)">, V6_vgth_and_enc; +defm V6_vgtw_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_and_enc; +defm V6_vgtub_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_and_enc; +defm V6_vgtuh_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_and_enc; +defm V6_vgtuw_and : + T_HVX_vcmp <"$dst &= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_and_enc; +defm V6_veqb_or : + T_HVX_vcmp <"$dst |= vcmp.eq($src1.b,$src2.b)">, V6_veqb_or_enc; +defm V6_veqh_or : + T_HVX_vcmp <"$dst |= vcmp.eq($src1.h,$src2.h)">, V6_veqh_or_enc; +defm V6_veqw_or : + T_HVX_vcmp <"$dst |= vcmp.eq($src1.w,$src2.w)">, V6_veqw_or_enc; +defm V6_vgtb_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_or_enc; +defm V6_vgth_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.h,$src2.h)">, V6_vgth_or_enc; +defm V6_vgtw_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_or_enc; +defm V6_vgtub_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_or_enc; +defm V6_vgtuh_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_or_enc; +defm V6_vgtuw_or : + T_HVX_vcmp <"$dst |= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_or_enc; +defm V6_veqb_xor : + T_HVX_vcmp <"$dst ^= vcmp.eq($src1.b,$src2.b)">, V6_veqb_xor_enc; +defm V6_veqh_xor : + T_HVX_vcmp <"$dst ^= vcmp.eq($src1.h,$src2.h)">, V6_veqh_xor_enc; +defm V6_veqw_xor : + T_HVX_vcmp <"$dst ^= vcmp.eq($src1.w,$src2.w)">, V6_veqw_xor_enc; +defm V6_vgtb_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_xor_enc; +defm V6_vgth_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.h,$src2.h)">, V6_vgth_xor_enc; +defm V6_vgtw_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_xor_enc; +defm V6_vgtub_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_xor_enc; +defm V6_vgtuh_xor : + T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_xor_enc; +defm V6_vgtuw_xor : + 
T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_xor_enc; + +defm V6_vminub : + T_HVX_alu_VV <"$dst.ub = vmin($src1.ub,$src2.ub)">, V6_vminub_enc; +defm V6_vminuh : + T_HVX_alu_VV <"$dst.uh = vmin($src1.uh,$src2.uh)">, V6_vminuh_enc; +defm V6_vminh : + T_HVX_alu_VV <"$dst.h = vmin($src1.h,$src2.h)">, V6_vminh_enc; +defm V6_vminw : + T_HVX_alu_VV <"$dst.w = vmin($src1.w,$src2.w)">, V6_vminw_enc; +defm V6_vmaxub : + T_HVX_alu_VV <"$dst.ub = vmax($src1.ub,$src2.ub)">, V6_vmaxub_enc; +defm V6_vmaxuh : + T_HVX_alu_VV <"$dst.uh = vmax($src1.uh,$src2.uh)">, V6_vmaxuh_enc; +defm V6_vmaxh : + T_HVX_alu_VV <"$dst.h = vmax($src1.h,$src2.h)">, V6_vmaxh_enc; +defm V6_vmaxw : + T_HVX_alu_VV <"$dst.w = vmax($src1.w,$src2.w)">, V6_vmaxw_enc; +defm V6_vshuffeb : + T_HVX_alu_VV <"$dst.b = vshuffe($src1.b,$src2.b)">, V6_vshuffeb_enc; +defm V6_vshuffob : + T_HVX_alu_VV <"$dst.b = vshuffo($src1.b,$src2.b)">, V6_vshuffob_enc; +defm V6_vshufeh : + T_HVX_alu_VV <"$dst.h = vshuffe($src1.h,$src2.h)">, V6_vshufeh_enc; +defm V6_vshufoh : + T_HVX_alu_VV <"$dst.h = vshuffo($src1.h,$src2.h)">, V6_vshufoh_enc; + +let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in { +defm V6_vmpyowh_rnd : + T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:rnd:sat">, + V6_vmpyowh_rnd_enc; +defm V6_vmpyiewuh : + T_HVX_alu_VV <"$dst.w = vmpyie($src1.w,$src2.uh)">, V6_vmpyiewuh_enc; +defm V6_vmpyewuh : + T_HVX_alu_VV <"$dst.w = vmpye($src1.w,$src2.uh)">, V6_vmpyewuh_enc; +defm V6_vmpyowh : + T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:sat">, V6_vmpyowh_enc; +defm V6_vmpyiowh : + T_HVX_alu_VV <"$dst.w = vmpyio($src1.w,$src2.h)">, V6_vmpyiowh_enc; +} +let Itinerary = CVI_VX, Type = TypeCVI_VX in +defm V6_vmpyieoh : + T_HVX_alu_VV <"$dst.w = vmpyieo($src1.h,$src2.h)">, V6_vmpyieoh_enc; + +let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in { +defm V6_vshufoeh : + T_HVX_alu_WV <"$dst.h = vshuffoe($src1.h,$src2.h)">, V6_vshufoeh_enc; +defm V6_vshufoeb : + T_HVX_alu_WV <"$dst.b = vshuffoe($src1.b,$src2.b)">, V6_vshufoeb_enc; +} + +let isRegSequence = 1, Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in +defm V6_vcombine : + T_HVX_alu_WV <"$dst = vcombine($src1,$src2)">, V6_vcombine_enc; + +def SDTHexagonVCOMBINE: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>, + SDTCisSubVecOfVec<1, 0>]>; + +def HexagonVCOMBINE: SDNode<"HexagonISD::VCOMBINE", SDTHexagonVCOMBINE>; + +def: Pat<(v32i32 (HexagonVCOMBINE (v16i32 VectorRegs:$Vs), + (v16i32 VectorRegs:$Vt))), + (V6_vcombine VectorRegs:$Vs, VectorRegs:$Vt)>, + Requires<[UseHVXSgl]>; +def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs), + (v32i32 VecDblRegs:$Vt))), + (V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, + Requires<[UseHVXDbl]>; + +let Itinerary = CVI_VINLANESAT, Type = TypeCVI_VINLANESAT in { +defm V6_vsathub : + T_HVX_alu_VV <"$dst.ub = vsat($src1.h,$src2.h)">, V6_vsathub_enc; +defm V6_vsatwh : + T_HVX_alu_VV <"$dst.h = vsat($src1.w,$src2.w)">, V6_vsatwh_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vroundwh : + T_HVX_alu_VV <"$dst.h = vround($src1.w,$src2.w):sat">, V6_vroundwh_enc; +defm V6_vroundwuh : + T_HVX_alu_VV <"$dst.uh = vround($src1.w,$src2.w):sat">, V6_vroundwuh_enc; +defm V6_vroundhb : + T_HVX_alu_VV <"$dst.b = vround($src1.h,$src2.h):sat">, V6_vroundhb_enc; +defm V6_vroundhub : + T_HVX_alu_VV <"$dst.ub = vround($src1.h,$src2.h):sat">, V6_vroundhub_enc; +defm V6_vasrwv : + T_HVX_alu_VV <"$dst.w = vasr($src1.w,$src2.w)">, V6_vasrwv_enc; +defm V6_vlsrwv : + T_HVX_alu_VV <"$dst.w = vlsr($src1.w,$src2.w)">, V6_vlsrwv_enc; +defm V6_vlsrhv : + 
T_HVX_alu_VV <"$dst.h = vlsr($src1.h,$src2.h)">, V6_vlsrhv_enc; +defm V6_vasrhv : + T_HVX_alu_VV <"$dst.h = vasr($src1.h,$src2.h)">, V6_vasrhv_enc; +defm V6_vaslwv : + T_HVX_alu_VV <"$dst.w = vasl($src1.w,$src2.w)">, V6_vaslwv_enc; +defm V6_vaslhv : + T_HVX_alu_VV <"$dst.h = vasl($src1.h,$src2.h)">, V6_vaslhv_enc; +} + +defm V6_vaddb : + T_HVX_alu_VV <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_enc; +defm V6_vaddh : + T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_enc; + +let Itinerary = CVI_VP, Type = TypeCVI_VP in { +defm V6_vdelta : + T_HVX_alu_VV <"$dst = vdelta($src1,$src2)">, V6_vdelta_enc; +defm V6_vrdelta : + T_HVX_alu_VV <"$dst = vrdelta($src1,$src2)">, V6_vrdelta_enc; +defm V6_vdealb4w : + T_HVX_alu_VV <"$dst.b = vdeale($src1.b,$src2.b)">, V6_vdealb4w_enc; +defm V6_vpackeb : + T_HVX_alu_VV <"$dst.b = vpacke($src1.h,$src2.h)">, V6_vpackeb_enc; +defm V6_vpackeh : + T_HVX_alu_VV <"$dst.h = vpacke($src1.w,$src2.w)">, V6_vpackeh_enc; +defm V6_vpackhub_sat : + T_HVX_alu_VV <"$dst.ub = vpack($src1.h,$src2.h):sat">, V6_vpackhub_sat_enc; +defm V6_vpackhb_sat : + T_HVX_alu_VV <"$dst.b = vpack($src1.h,$src2.h):sat">, V6_vpackhb_sat_enc; +defm V6_vpackwuh_sat : + T_HVX_alu_VV <"$dst.uh = vpack($src1.w,$src2.w):sat">, V6_vpackwuh_sat_enc; +defm V6_vpackwh_sat : + T_HVX_alu_VV <"$dst.h = vpack($src1.w,$src2.w):sat">, V6_vpackwh_sat_enc; +defm V6_vpackob : + T_HVX_alu_VV <"$dst.b = vpacko($src1.h,$src2.h)">, V6_vpackob_enc; +defm V6_vpackoh : + T_HVX_alu_VV <"$dst.h = vpacko($src1.w,$src2.w)">, V6_vpackoh_enc; +} + +let hasNewValue = 1, hasSideEffects = 0 in +class T_HVX_condALU <string asmString, RegisterClass RC1, RegisterClass RC2> + : CVI_VA_Resource1 <(outs RC2:$dst), + (ins RC1:$src1, RC2:$_src_, RC2:$src2), asmString, + [], "$dst = $_src_" > { + let Itinerary = CVI_VA; + let Type = TypeCVI_VA; +} + +multiclass T_HVX_condALU <string asmString> { + def NAME : T_HVX_condALU <asmString, VecPredRegs, VectorRegs>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_condALU <asmString, VecPredRegs128B, VectorRegs128B>; +} + +defm V6_vaddbq : T_HVX_condALU <"if ($src1) $dst.b += $src2.b">, + V6_vaddbq_enc; +defm V6_vaddhq : T_HVX_condALU <"if ($src1) $dst.h += $src2.h">, + V6_vaddhq_enc; +defm V6_vaddwq : T_HVX_condALU <"if ($src1) $dst.w += $src2.w">, + V6_vaddwq_enc; +defm V6_vsubbq : T_HVX_condALU <"if ($src1) $dst.b -= $src2.b">, + V6_vsubbq_enc; +defm V6_vsubhq : T_HVX_condALU <"if ($src1) $dst.h -= $src2.h">, + V6_vsubhq_enc; +defm V6_vsubwq : T_HVX_condALU <"if ($src1) $dst.w -= $src2.w">, + V6_vsubwq_enc; +defm V6_vaddbnq : T_HVX_condALU <"if (!$src1) $dst.b += $src2.b">, + V6_vaddbnq_enc; +defm V6_vaddhnq : T_HVX_condALU <"if (!$src1) $dst.h += $src2.h">, + V6_vaddhnq_enc; +defm V6_vaddwnq : T_HVX_condALU <"if (!$src1) $dst.w += $src2.w">, + V6_vaddwnq_enc; +defm V6_vsubbnq : T_HVX_condALU <"if (!$src1) $dst.b -= $src2.b">, + V6_vsubbnq_enc; +defm V6_vsubhnq : T_HVX_condALU <"if (!$src1) $dst.h -= $src2.h">, + V6_vsubhnq_enc; +defm V6_vsubwnq : T_HVX_condALU <"if (!$src1) $dst.w -= $src2.w">, + V6_vsubwnq_enc; + +let hasNewValue = 1 in +class T_HVX_alu_2op <string asmString, InstrItinClass itin, + RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1), + asmString >{ + let Itinerary = itin; + let Type = !cast<IType>("Type"#itin); +} + +multiclass T_HVX_alu_2op <string asmString, RegisterClass RCout, + RegisterClass RCin, InstrItinClass itin> { + def NAME : T_HVX_alu_2op <asmString, itin, RCout, RCin>; + let isCodeGenOnly = 1 in + 
def NAME#_128B : T_HVX_alu_2op <asmString, itin, + !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +let hasNewValue = 1 in +multiclass T_HVX_alu_2op_VV <string asmString>: + T_HVX_alu_2op <asmString, VectorRegs, VectorRegs, CVI_VA>; + +multiclass T_HVX_alu_2op_WV <string asmString>: + T_HVX_alu_2op <asmString, VecDblRegs, VectorRegs, CVI_VA_DV>; + + +defm V6_vabsh : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h)">, + V6_vabsh_enc; +defm V6_vabsw : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w)">, + V6_vabsw_enc; +defm V6_vabsh_sat : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h):sat">, + V6_vabsh_sat_enc; +defm V6_vabsw_sat : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w):sat">, + V6_vabsw_sat_enc; +defm V6_vnot : T_HVX_alu_2op_VV <"$dst = vnot($src1)">, + V6_vnot_enc; +defm V6_vassign : T_HVX_alu_2op_VV <"$dst = $src1">, + V6_vassign_enc; + +defm V6_vzb : T_HVX_alu_2op_WV <"$dst.uh = vzxt($src1.ub)">, + V6_vzb_enc; +defm V6_vzh : T_HVX_alu_2op_WV <"$dst.uw = vzxt($src1.uh)">, + V6_vzh_enc; +defm V6_vsb : T_HVX_alu_2op_WV <"$dst.h = vsxt($src1.b)">, + V6_vsb_enc; +defm V6_vsh : T_HVX_alu_2op_WV <"$dst.w = vsxt($src1.h)">, + V6_vsh_enc; + +let Itinerary = CVI_VP, Type = TypeCVI_VP in { +defm V6_vdealh : T_HVX_alu_2op_VV <"$dst.h = vdeal($src1.h)">, + V6_vdealh_enc; +defm V6_vdealb : T_HVX_alu_2op_VV <"$dst.b = vdeal($src1.b)">, + V6_vdealb_enc; +defm V6_vshuffh : T_HVX_alu_2op_VV <"$dst.h = vshuff($src1.h)">, + V6_vshuffh_enc; +defm V6_vshuffb : T_HVX_alu_2op_VV <"$dst.b = vshuff($src1.b)">, + V6_vshuffb_enc; +} + +let Itinerary = CVI_VP_VS, Type = TypeCVI_VP_VS in { +defm V6_vunpackub : T_HVX_alu_2op_WV <"$dst.uh = vunpack($src1.ub)">, + V6_vunpackub_enc; +defm V6_vunpackuh : T_HVX_alu_2op_WV <"$dst.uw = vunpack($src1.uh)">, + V6_vunpackuh_enc; +defm V6_vunpackb : T_HVX_alu_2op_WV <"$dst.h = vunpack($src1.b)">, + V6_vunpackb_enc; +defm V6_vunpackh : T_HVX_alu_2op_WV <"$dst.w = vunpack($src1.h)">, + V6_vunpackh_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vcl0w : T_HVX_alu_2op_VV <"$dst.uw = vcl0($src1.uw)">, + V6_vcl0w_enc; +defm V6_vcl0h : T_HVX_alu_2op_VV <"$dst.uh = vcl0($src1.uh)">, + V6_vcl0h_enc; +defm V6_vnormamtw : T_HVX_alu_2op_VV <"$dst.w = vnormamt($src1.w)">, + V6_vnormamtw_enc; +defm V6_vnormamth : T_HVX_alu_2op_VV <"$dst.h = vnormamt($src1.h)">, + V6_vnormamth_enc; +defm V6_vpopcounth : T_HVX_alu_2op_VV <"$dst.h = vpopcount($src1.h)">, + V6_vpopcounth_enc; +} + +let isAccumulator = 1, hasNewValue = 1, Itinerary = CVI_VX_DV_LONG, + Type = TypeCVI_VX_DV in +class T_HVX_vmpyacc2 <string asmString, RegisterClass RC> + : CVI_VA_Resource1 <(outs RC:$dst), + (ins RC:$_src_, RC:$src1, IntRegs:$src2, u1Imm:$src3), + asmString, [], "$dst = $_src_" > ; + + +multiclass T_HVX_vmpyacc2 <string asmString> { + def NAME : T_HVX_vmpyacc2 <asmString, VecDblRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vmpyacc2 <asmString, VecDblRegs128B>; +} + +defm V6_vrmpybusi_acc : + T_HVX_vmpyacc2<"$dst.w += vrmpy($src1.ub,$src2.b,#$src3)">, + V6_vrmpybusi_acc_enc; +defm V6_vrsadubi_acc : + T_HVX_vmpyacc2<"$dst.uw += vrsad($src1.ub,$src2.ub,#$src3)">, + V6_vrsadubi_acc_enc; +defm V6_vrmpyubi_acc : + T_HVX_vmpyacc2<"$dst.uw += vrmpy($src1.ub,$src2.ub,#$src3)">, + V6_vrmpyubi_acc_enc; + + +let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV, hasNewValue = 1 in +class T_HVX_vmpy2 <string asmString, RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, IntRegs:$src2, u1Imm:$src3), + asmString>; + + +multiclass T_HVX_vmpy2 <string asmString> { + 
def NAME : T_HVX_vmpy2 <asmString, VecDblRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vmpy2 <asmString, VecDblRegs128B>; +} + +defm V6_vrmpybusi : + T_HVX_vmpy2 <"$dst.w = vrmpy($src1.ub,$src2.b,#$src3)">, V6_vrmpybusi_enc; +defm V6_vrsadubi : + T_HVX_vmpy2 <"$dst.uw = vrsad($src1.ub,$src2.ub,#$src3)">, V6_vrsadubi_enc; +defm V6_vrmpyubi : + T_HVX_vmpy2 <"$dst.uw = vrmpy($src1.ub,$src2.ub,#$src3)">, V6_vrmpyubi_enc; + + +let Itinerary = CVI_VP_VS_LONG_EARLY, Type = TypeCVI_VP_VS, + hasSideEffects = 0, hasNewValue2 = 1, opNewValue2 = 1 in +class T_HVX_perm <string asmString, RegisterClass RC> + : CVI_VA_Resource1 <(outs RC:$_dst1_, RC:$_dst2_), + (ins RC:$src1, RC:$src2, IntRegs:$src3), + asmString, [], "$_dst1_ = $src1, $_dst2_ = $src2" >; + +multiclass T_HVX_perm <string asmString> { + def NAME : T_HVX_perm <asmString, VectorRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_perm <asmString, VectorRegs128B>; +} + +let hasNewValue = 1, opNewValue = 0, hasNewValue2 = 1, opNewValue2 = 1 in { + defm V6_vshuff : T_HVX_perm <"vshuff($src1,$src2,$src3)">, V6_vshuff_enc; + defm V6_vdeal : T_HVX_perm <"vdeal($src1,$src2,$src3)">, V6_vdeal_enc; +} + +// Conditional vector move. +let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in +class T_HVX_cmov <bit isPredNot, RegisterClass RC> + : CVI_VA_Resource1 <(outs RC:$dst), (ins PredRegs:$src1, RC:$src2), + "if ("#!if(isPredNot, "!", "")#"$src1) $dst = $src2"> { + let isPredicatedFalse = isPredNot; +} + +multiclass T_HVX_cmov <bit isPredNot = 0> { + def NAME : T_HVX_cmov <isPredNot, VectorRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_cmov <isPredNot, VectorRegs128B>; +} + +defm V6_vcmov : T_HVX_cmov, V6_vcmov_enc; +defm V6_vncmov : T_HVX_cmov<1>, V6_vncmov_enc; + +// Conditional vector combine. 
+let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, isPredicated = 1, + hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in +class T_HVX_ccombine <bit isPredNot, RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 < (outs RCout:$dst), + (ins PredRegs:$src1, RCin:$src2, RCin:$src3), + "if ("#!if(isPredNot, "!", "")#"$src1) $dst = vcombine($src2,$src3)"> { + let isPredicatedFalse = isPredNot; +} + +multiclass T_HVX_ccombine <bit isPredNot = 0> { + def NAME : T_HVX_ccombine <isPredNot, VecDblRegs, VectorRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_ccombine <isPredNot, VecDblRegs128B, VectorRegs128B>; +} + +defm V6_vccombine : T_HVX_ccombine, V6_vccombine_enc; +defm V6_vnccombine : T_HVX_ccombine<1>, V6_vnccombine_enc; + +let hasNewValue = 1 in +class T_HVX_shift <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VX_DV_Resource1<(outs RCout:$dst), + (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3), + asmString >; + +multiclass T_HVX_shift <string asmString, RegisterClass RCout, + RegisterClass RCin> { + def NAME : T_HVX_shift <asmString, RCout, RCin>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_shift <asmString, !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +multiclass T_HVX_shift_VV <string asmString>: + T_HVX_shift <asmString, VectorRegs, VectorRegs>; + +multiclass T_HVX_shift_WV <string asmString>: + T_HVX_shift <asmString, VecDblRegs, VectorRegs>; + +let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in { +defm V6_valignb : + T_HVX_shift_VV <"$dst = valign($src1,$src2,$src3)">, V6_valignb_enc; +defm V6_vlalignb : + T_HVX_shift_VV <"$dst = vlalign($src1,$src2,$src3)">, V6_vlalignb_enc; +} + +let Itinerary = CVI_VS, Type = TypeCVI_VS in { +defm V6_vasrwh : + T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3)">, V6_vasrwh_enc; +defm V6_vasrwhsat : + T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):sat">, + V6_vasrwhsat_enc; +defm V6_vasrwhrndsat : + T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):rnd:sat">, + V6_vasrwhrndsat_enc; +defm V6_vasrwuhsat : + T_HVX_shift_VV <"$dst.uh = vasr($src1.w,$src2.w,$src3):sat">, + V6_vasrwuhsat_enc; +defm V6_vasrhubsat : + T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):sat">, + V6_vasrhubsat_enc; +defm V6_vasrhubrndsat : + T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):rnd:sat">, + V6_vasrhubrndsat_enc; +defm V6_vasrhbrndsat : + T_HVX_shift_VV <"$dst.b = vasr($src1.h,$src2.h,$src3):rnd:sat">, + V6_vasrhbrndsat_enc; +} + +// Assembler mapped -- alias? 
+//defm V6_vtran2x2vdd : T_HVX_shift_VV <"">, V6_vtran2x2vdd_enc; +let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in { +defm V6_vshuffvdd : + T_HVX_shift_WV <"$dst = vshuff($src1,$src2,$src3)">, V6_vshuffvdd_enc; +defm V6_vdealvdd : + T_HVX_shift_WV <"$dst = vdeal($src1,$src2,$src3)">, V6_vdealvdd_enc; +} + +let hasNewValue = 1, Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in +class T_HVX_unpack <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCout:$_src_, RCin:$src1), + asmString, [], "$dst = $_src_">; + +multiclass T_HVX_unpack <string asmString> { + def NAME : T_HVX_unpack <asmString, VecDblRegs, VectorRegs>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_unpack <asmString, VecDblRegs128B, VectorRegs128B>; +} + +defm V6_vunpackob : T_HVX_unpack <"$dst.h |= vunpacko($src1.b)">, V6_vunpackob_enc; +defm V6_vunpackoh : T_HVX_unpack <"$dst.w |= vunpacko($src1.h)">, V6_vunpackoh_enc; + +let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1, + hasSideEffects = 0 in +class T_HVX_valign <string asmString, RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2, u3Imm:$src3), + asmString>; + +multiclass T_HVX_valign <string asmString> { + def NAME : T_HVX_valign <asmString, VectorRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_valign <asmString, VectorRegs128B>; +} + +defm V6_valignbi : + T_HVX_valign <"$dst = valign($src1,$src2,#$src3)">, V6_valignbi_enc; +defm V6_vlalignbi : + T_HVX_valign <"$dst = vlalign($src1,$src2,#$src3)">, V6_vlalignbi_enc; + +let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in +class T_HVX_predAlu <string asmString, RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2), + asmString>; + +multiclass T_HVX_predAlu <string asmString> { + def NAME : T_HVX_predAlu <asmString, VecPredRegs>; + + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_predAlu <asmString, VecPredRegs128B>; +} + +defm V6_pred_and : T_HVX_predAlu <"$dst = and($src1,$src2)">, V6_pred_and_enc; +defm V6_pred_or : T_HVX_predAlu <"$dst = or($src1,$src2)">, V6_pred_or_enc; +defm V6_pred_xor : T_HVX_predAlu <"$dst = xor($src1,$src2)">, V6_pred_xor_enc; +defm V6_pred_or_n : T_HVX_predAlu <"$dst = or($src1,!$src2)">, V6_pred_or_n_enc; +defm V6_pred_and_n : + T_HVX_predAlu <"$dst = and($src1,!$src2)">, V6_pred_and_n_enc; + +let Itinerary = CVI_VA, Type = TypeCVI_VA in +class T_HVX_prednot <RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1), + "$dst = not($src1)">, V6_pred_not_enc; + +def V6_pred_not : T_HVX_prednot <VecPredRegs>; +let isCodeGenOnly = 1 in +def V6_pred_not_128B : T_HVX_prednot <VecPredRegs128B>; + +let Itinerary = CVI_VA, Type = TypeCVI_VA in +class T_HVX_vcmp2 <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2), + asmString >; + +multiclass T_HVX_vcmp2 <string asmString> { + def NAME : T_HVX_vcmp2 <asmString, VecPredRegs, VectorRegs>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vcmp2 <asmString, VecPredRegs128B, VectorRegs128B>; +} + +defm V6_veqb : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.b,$src2.b)">, V6_veqb_enc; +defm V6_veqh : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.h,$src2.h)">, V6_veqh_enc; +defm V6_veqw : T_HVX_vcmp2 <"$dst = vcmp.eq($src1.w,$src2.w)">, V6_veqw_enc; +defm V6_vgtb : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.b,$src2.b)">, V6_vgtb_enc; +defm V6_vgth : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.h,$src2.h)">, V6_vgth_enc; +defm V6_vgtw : T_HVX_vcmp2 <"$dst = 
vcmp.gt($src1.w,$src2.w)">, V6_vgtw_enc; +defm V6_vgtub : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_enc; +defm V6_vgtuh : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_enc; +defm V6_vgtuw : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_enc; + +let isAccumulator = 1, hasNewValue = 1, hasSideEffects = 0 in +class T_V6_vandqrt_acc <RegisterClass RCout, RegisterClass RCin> + : CVI_VX_Resource_late<(outs RCout:$dst), + (ins RCout:$_src_, RCin:$src1, IntRegs:$src2), + "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandqrt_acc_enc; + +def V6_vandqrt_acc : T_V6_vandqrt_acc <VectorRegs, VecPredRegs>; +let isCodeGenOnly = 1 in +def V6_vandqrt_acc_128B : T_V6_vandqrt_acc <VectorRegs128B, VecPredRegs128B>; + +let isAccumulator = 1 in +class T_V6_vandvrt_acc <RegisterClass RCout, RegisterClass RCin> + : CVI_VX_Resource_late<(outs RCout:$dst), + (ins RCout:$_src_, RCin:$src1, IntRegs:$src2), + "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandvrt_acc_enc; + +def V6_vandvrt_acc : T_V6_vandvrt_acc <VecPredRegs, VectorRegs>; +let isCodeGenOnly = 1 in +def V6_vandvrt_acc_128B : T_V6_vandvrt_acc <VecPredRegs128B, VectorRegs128B>; + +let hasNewValue = 1, hasSideEffects = 0 in +class T_V6_vandqrt <RegisterClass RCout, RegisterClass RCin> + : CVI_VX_Resource_late<(outs RCout:$dst), + (ins RCin:$src1, IntRegs:$src2), + "$dst = vand($src1,$src2)" >, V6_vandqrt_enc; + +def V6_vandqrt : T_V6_vandqrt <VectorRegs, VecPredRegs>; +let isCodeGenOnly = 1 in +def V6_vandqrt_128B : T_V6_vandqrt <VectorRegs128B, VecPredRegs128B>; + +let hasNewValue = 1, hasSideEffects = 0 in +class T_V6_lvsplatw <RegisterClass RC> + : CVI_VX_Resource_late<(outs RC:$dst), (ins IntRegs:$src1), + "$dst = vsplat($src1)" >, V6_lvsplatw_enc; + +def V6_lvsplatw : T_V6_lvsplatw <VectorRegs>; +let isCodeGenOnly = 1 in +def V6_lvsplatw_128B : T_V6_lvsplatw <VectorRegs128B>; + + +let hasNewValue = 1 in +class T_V6_vinsertwr <RegisterClass RC> + : CVI_VX_Resource_late<(outs RC:$dst), (ins RC:$_src_, IntRegs:$src1), + "$dst.w = vinsert($src1)", [], "$dst = $_src_">, + V6_vinsertwr_enc; + +def V6_vinsertwr : T_V6_vinsertwr <VectorRegs>; +let isCodeGenOnly = 1 in +def V6_vinsertwr_128B : T_V6_vinsertwr <VectorRegs128B>; + + +let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in +class T_V6_pred_scalar2 <RegisterClass RC> + : CVI_VA_Resource1<(outs RC:$dst), (ins IntRegs:$src1), + "$dst = vsetq($src1)">, V6_pred_scalar2_enc; + +def V6_pred_scalar2 : T_V6_pred_scalar2 <VecPredRegs>; +let isCodeGenOnly = 1 in +def V6_pred_scalar2_128B : T_V6_pred_scalar2 <VecPredRegs128B>; + +class T_V6_vandvrt <RegisterClass RCout, RegisterClass RCin> + : CVI_VX_Resource_late<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2), + "$dst = vand($src1,$src2)">, V6_vandvrt_enc; + +def V6_vandvrt : T_V6_vandvrt <VecPredRegs, VectorRegs>; +let isCodeGenOnly = 1 in +def V6_vandvrt_128B : T_V6_vandvrt <VecPredRegs128B, VectorRegs128B>; + +let validSubTargets = HasV60SubT in +class T_HVX_rol <string asmString, RegisterClass RC, Operand ImmOp > + : SInst2 <(outs RC:$dst), (ins RC:$src1, ImmOp:$src2), asmString>; + +class T_HVX_rol_R <string asmString> + : T_HVX_rol <asmString, IntRegs, u5Imm>; +class T_HVX_rol_P <string asmString> + : T_HVX_rol <asmString, DoubleRegs, u6Imm>; + +def S6_rol_i_p : T_HVX_rol_P <"$dst = rol($src1,#$src2)">, S6_rol_i_p_enc; +let hasNewValue = 1, opNewValue = 0 in +def S6_rol_i_r : T_HVX_rol_R <"$dst = rol($src1,#$src2)">, S6_rol_i_r_enc; + +let validSubTargets = HasV60SubT in +class T_HVX_rol_acc 
<string asmString, RegisterClass RC, Operand ImmOp> + : SInst2 <(outs RC:$dst), (ins RC:$_src_, RC:$src1, ImmOp:$src2), + asmString, [], "$dst = $_src_" >; + +class T_HVX_rol_acc_P <string asmString> + : T_HVX_rol_acc <asmString, DoubleRegs, u6Imm>; + +class T_HVX_rol_acc_R <string asmString> + : T_HVX_rol_acc <asmString, IntRegs, u5Imm>; + +def S6_rol_i_p_nac : + T_HVX_rol_acc_P <"$dst -= rol($src1,#$src2)">, S6_rol_i_p_nac_enc; +def S6_rol_i_p_acc : + T_HVX_rol_acc_P <"$dst += rol($src1,#$src2)">, S6_rol_i_p_acc_enc; +def S6_rol_i_p_and : + T_HVX_rol_acc_P <"$dst &= rol($src1,#$src2)">, S6_rol_i_p_and_enc; +def S6_rol_i_p_or : + T_HVX_rol_acc_P <"$dst |= rol($src1,#$src2)">, S6_rol_i_p_or_enc; +def S6_rol_i_p_xacc : + T_HVX_rol_acc_P<"$dst ^= rol($src1,#$src2)">, S6_rol_i_p_xacc_enc; + +let hasNewValue = 1, opNewValue = 0 in { +def S6_rol_i_r_nac : + T_HVX_rol_acc_R <"$dst -= rol($src1,#$src2)">, S6_rol_i_r_nac_enc; +def S6_rol_i_r_acc : + T_HVX_rol_acc_R <"$dst += rol($src1,#$src2)">, S6_rol_i_r_acc_enc; +def S6_rol_i_r_and : + T_HVX_rol_acc_R <"$dst &= rol($src1,#$src2)">, S6_rol_i_r_and_enc; +def S6_rol_i_r_or : + T_HVX_rol_acc_R <"$dst |= rol($src1,#$src2)">, S6_rol_i_r_or_enc; +def S6_rol_i_r_xacc : + T_HVX_rol_acc_R <"$dst ^= rol($src1,#$src2)">, S6_rol_i_r_xacc_enc; +} + +let isSolo = 1, Itinerary = LD_tc_ld_SLOT0, Type = TypeLD in +class T_V6_extractw <RegisterClass RC> + : LD1Inst <(outs IntRegs:$dst), (ins RC:$src1, IntRegs:$src2), + "$dst = vextract($src1,$src2)">, V6_extractw_enc; + +def V6_extractw : T_V6_extractw <VectorRegs>; +let isCodeGenOnly = 1 in +def V6_extractw_128B : T_V6_extractw <VectorRegs128B>; + +let Itinerary = ST_tc_st_SLOT0, validSubTargets = HasV55SubT in +class T_sys0op <string asmString> + : ST1Inst <(outs), (ins), asmString>; + +let isSolo = 1, validSubTargets = HasV55SubT in { +def Y5_l2gunlock : T_sys0op <"l2gunlock">, Y5_l2gunlock_enc; +def Y5_l2gclean : T_sys0op <"l2gclean">, Y5_l2gclean_enc; +def Y5_l2gcleaninv : T_sys0op <"l2gcleaninv">, Y5_l2gcleaninv_enc; +} + +class T_sys1op <string asmString, RegisterClass RC> + : ST1Inst <(outs), (ins RC:$src1), asmString>; + +class T_sys1op_R <string asmString> : T_sys1op <asmString, IntRegs>; +class T_sys1op_P <string asmString> : T_sys1op <asmString, DoubleRegs>; + +let isSoloAX = 1, validSubTargets = HasV55SubT in +def Y5_l2unlocka : T_sys1op_R <"l2unlocka($src1)">, Y5_l2unlocka_enc; + +let isSolo = 1, validSubTargets = HasV60SubT in { +def Y6_l2gcleanpa : T_sys1op_P <"l2gclean($src1)">, Y6_l2gcleanpa_enc; +def Y6_l2gcleaninvpa : T_sys1op_P <"l2gcleaninv($src1)">, Y6_l2gcleaninvpa_enc; +} + +let Itinerary = ST_tc_3stall_SLOT0, isPredicateLate = 1, isSoloAX = 1, + validSubTargets = HasV55SubT in +def Y5_l2locka : ST1Inst <(outs PredRegs:$dst), (ins IntRegs:$src1), + "$dst = l2locka($src1)">, Y5_l2locka_enc; + +// not defined on etc side. why? 
+// defm S2_cabacencbin : _VV <"Rdd=encbin(Rss,$src2,Pu)">, S2_cabacencbin_enc; + +let Defs = [USR_OVF], Itinerary = M_tc_3stall_SLOT23, isPredicateLate = 1, + hasSideEffects = 0, +validSubTargets = HasV55SubT in +def A5_ACS : MInst2 <(outs DoubleRegs:$dst1, PredRegs:$dst2), + (ins DoubleRegs:$_src_, DoubleRegs:$src1, DoubleRegs:$src2), + "$dst1,$dst2 = vacsh($src1,$src2)", [], + "$dst1 = $_src_" >, Requires<[HasV55T]>, A5_ACS_enc; + +let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, hasNewValue = 1, + hasSideEffects = 0 in +class T_HVX_alu2 <string asmString, RegisterClass RCout, RegisterClass RCin1, + RegisterClass RCin2> + : CVI_VA_Resource1<(outs RCout:$dst), + (ins RCin1:$src1, RCin2:$src2, RCin2:$src3), asmString>; + +multiclass T_HVX_alu2 <string asmString, RegisterClass RC > { + def NAME : T_HVX_alu2 <asmString, RC, VecPredRegs, VectorRegs>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_alu2 <asmString, !cast<RegisterClass>(RC#"128B"), + VecPredRegs128B, VectorRegs128B>; +} + +multiclass T_HVX_alu2_V <string asmString> : + T_HVX_alu2 <asmString, VectorRegs>; + +multiclass T_HVX_alu2_W <string asmString> : + T_HVX_alu2 <asmString, VecDblRegs>; + +defm V6_vswap : T_HVX_alu2_W <"$dst = vswap($src1,$src2,$src3)">, V6_vswap_enc; + +let Itinerary = CVI_VA, Type = TypeCVI_VA, hasNewValue = 1, + hasSideEffects = 0 in +defm V6_vmux : T_HVX_alu2_V <"$dst = vmux($src1,$src2,$src3)">, V6_vmux_enc; + +class T_HVX_vlutb <string asmString, RegisterClass RCout, RegisterClass RCin> + : CVI_VA_Resource1<(outs RCout:$dst), + (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3), asmString>; + +multiclass T_HVX_vlutb <string asmString, RegisterClass RCout, + RegisterClass RCin> { + def NAME : T_HVX_vlutb <asmString, RCout, RCin>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vlutb <asmString, !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +multiclass T_HVX_vlutb_V <string asmString> : + T_HVX_vlutb <asmString, VectorRegs, VectorRegs>; + +multiclass T_HVX_vlutb_W <string asmString> : + T_HVX_vlutb <asmString, VecDblRegs, VectorRegs>; + +let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, isAccumulator = 1 in +class T_HVX_vlutb_acc <string asmString, RegisterClass RCout, + RegisterClass RCin> + : CVI_VA_Resource1<(outs RCout:$dst), + (ins RCout:$_src_, RCin:$src1, RCin:$src2, IntRegsLow8:$src3), + asmString, [], "$dst = $_src_">; + +multiclass T_HVX_vlutb_acc <string asmString, RegisterClass RCout, + RegisterClass RCin> { + def NAME : T_HVX_vlutb_acc <asmString, RCout, RCin>; + let isCodeGenOnly = 1 in + def NAME#_128B : T_HVX_vlutb_acc<asmString, + !cast<RegisterClass>(RCout#"128B"), + !cast<RegisterClass>(RCin#"128B")>; +} + +multiclass T_HVX_vlutb_acc_V <string asmString> : + T_HVX_vlutb_acc <asmString, VectorRegs, VectorRegs>; + +multiclass T_HVX_vlutb_acc_W <string asmString> : + T_HVX_vlutb_acc <asmString, VecDblRegs, VectorRegs>; + + +let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1 in +defm V6_vlutvvb: + T_HVX_vlutb_V <"$dst.b = vlut32($src1.b,$src2.b,$src3)">, V6_vlutvvb_enc; + +let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, hasNewValue = 1 in +defm V6_vlutvwh: + T_HVX_vlutb_W <"$dst.h = vlut16($src1.b,$src2.h,$src3)">, V6_vlutvwh_enc; + +let hasNewValue = 1 in { + defm V6_vlutvvb_oracc: + T_HVX_vlutb_acc_V <"$dst.b |= vlut32($src1.b,$src2.b,$src3)">, + V6_vlutvvb_oracc_enc; + defm V6_vlutvwh_oracc: + T_HVX_vlutb_acc_W <"$dst.h |= vlut16($src1.b,$src2.h,$src3)">, + V6_vlutvwh_oracc_enc; +} + +// It's a fake instruction and should not 
be defined? +def S2_cabacencbin + : SInst2<(outs DoubleRegs:$dst), + (ins DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3), + "$dst = encbin($src1,$src2,$src3)">, S2_cabacencbin_enc; + +// Vhist instructions +def V6_vhistq + : CVI_HIST_Resource1 <(outs), (ins VecPredRegs:$src1), + "vhist($src1)">, V6_vhistq_enc; + +def V6_vhist + : CVI_HIST_Resource1 <(outs), (ins), + "vhist" >, V6_vhist_enc; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td index f4fb946..96dd531 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonInstrInfoVector.td @@ -35,6 +35,34 @@ multiclass bitconvert_64<ValueType a, ValueType b> { (a DoubleRegs:$src)>; } +multiclass bitconvert_vec<ValueType a, ValueType b> { + def : Pat <(b (bitconvert (a VectorRegs:$src))), + (b VectorRegs:$src)>; + def : Pat <(a (bitconvert (b VectorRegs:$src))), + (a VectorRegs:$src)>; +} + +multiclass bitconvert_dblvec<ValueType a, ValueType b> { + def : Pat <(b (bitconvert (a VecDblRegs:$src))), + (b VecDblRegs:$src)>; + def : Pat <(a (bitconvert (b VecDblRegs:$src))), + (a VecDblRegs:$src)>; +} + +multiclass bitconvert_predvec<ValueType a, ValueType b> { + def : Pat <(b (bitconvert (a VecPredRegs:$src))), + (b VectorRegs:$src)>; + def : Pat <(a (bitconvert (b VectorRegs:$src))), + (a VecPredRegs:$src)>; +} + +multiclass bitconvert_dblvec128B<ValueType a, ValueType b> { + def : Pat <(b (bitconvert (a VecDblRegs128B:$src))), + (b VecDblRegs128B:$src)>; + def : Pat <(a (bitconvert (b VecDblRegs128B:$src))), + (a VecDblRegs128B:$src)>; +} + // Bit convert vector types. defm : bitconvert_32<v4i8, i32>; defm : bitconvert_32<v2i16, i32>; @@ -47,6 +75,21 @@ defm : bitconvert_64<v8i8, v4i16>; defm : bitconvert_64<v8i8, v2i32>; defm : bitconvert_64<v4i16, v2i32>; +defm : bitconvert_vec<v64i8, v16i32>; +defm : bitconvert_vec<v8i64 , v16i32>; +defm : bitconvert_vec<v32i16, v16i32>; + +defm : bitconvert_dblvec<v16i64, v128i8>; +defm : bitconvert_dblvec<v32i32, v128i8>; +defm : bitconvert_dblvec<v64i16, v128i8>; + +defm : bitconvert_dblvec128B<v64i32, v128i16>; +defm : bitconvert_dblvec128B<v256i8, v128i16>; +defm : bitconvert_dblvec128B<v32i64, v128i16>; + +defm : bitconvert_dblvec128B<v64i32, v256i8>; +defm : bitconvert_dblvec128B<v32i64, v256i8>; +defm : bitconvert_dblvec128B<v128i16, v256i8>; // Vector shift support. Vector shifting in Hexagon is rather different // from internal representation of LLVM. 
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td index 1d0d015..b207aaf 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsics.td @@ -691,15 +691,15 @@ def: T_RR_pat<A2_combine_hl, int_hexagon_A2_combine_hl>; def: T_RR_pat<A2_combine_lh, int_hexagon_A2_combine_lh>; def: T_RR_pat<A2_combine_ll, int_hexagon_A2_combine_ll>; -def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s8ExtPred, s8ImmPred>; +def: T_II_pat<A2_combineii, int_hexagon_A2_combineii, s32ImmPred, s8ImmPred>; def: Pat<(i32 (int_hexagon_C2_mux (I32:$Rp), (I32:$Rs), (I32:$Rt))), (i32 (C2_mux (C2_tfrrp IntRegs:$Rp), IntRegs:$Rs, IntRegs:$Rt))>; // Mux -def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s8ExtPred>; -def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s8ExtPred>; -def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s8ExtPred, s8ImmPred>; +def : T_QRI_pat<C2_muxir, int_hexagon_C2_muxir, s32ImmPred>; +def : T_QIR_pat<C2_muxri, int_hexagon_C2_muxri, s32ImmPred>; +def : T_QII_pat<C2_muxii, int_hexagon_C2_muxii, s32ImmPred, s8ImmPred>; // Shift halfword def : T_R_pat<A2_aslh, int_hexagon_A2_aslh>; @@ -720,17 +720,17 @@ def : T_RR_pat<C2_cmpeq, int_hexagon_C2_cmpeq>; def : T_RR_pat<C2_cmpgt, int_hexagon_C2_cmpgt>; def : T_RR_pat<C2_cmpgtu, int_hexagon_C2_cmpgtu>; -def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s10ExtPred>; -def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s10ExtPred>; -def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u9ExtPred>; +def : T_RI_pat<C2_cmpeqi, int_hexagon_C2_cmpeqi, s32ImmPred>; +def : T_RI_pat<C2_cmpgti, int_hexagon_C2_cmpgti, s32ImmPred>; +def : T_RI_pat<C2_cmpgtui, int_hexagon_C2_cmpgtui, u32ImmPred>; -def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s8ExtPred:$src2)), +def : Pat <(i32 (int_hexagon_C2_cmpgei (I32:$src1), s32ImmPred:$src2)), (i32 (C2_cmpgti (I32:$src1), - (DEC_CONST_SIGNED s8ExtPred:$src2)))>; + (DEC_CONST_SIGNED s32ImmPred:$src2)))>; -def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u8ExtPred:$src2)), +def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), u32ImmPred:$src2)), (i32 (C2_cmpgtui (I32:$src1), - (DEC_CONST_UNSIGNED u8ExtPred:$src2)))>; + (DEC_CONST_UNSIGNED u32ImmPred:$src2)))>; // The instruction, Pd=cmp.geu(Rs, #u8) -> Pd=cmp.eq(Rs,Rs) when #u8 == 0. def : Pat <(i32 (int_hexagon_C2_cmpgeui (I32:$src1), 0)), @@ -1289,3 +1289,5 @@ def: T_stc_pat<S2_storerf_pci_pseudo, int_hexagon_circ_sthhi, s4_1ImmPred, I32>; include "HexagonIntrinsicsV3.td" include "HexagonIntrinsicsV4.td" include "HexagonIntrinsicsV5.td" +include "HexagonIntrinsicsV60.td" + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td new file mode 100644 index 0000000..24a3e4d --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td @@ -0,0 +1,836 @@ +//=- HexagonIntrinsicsV60.td - Target Description for Hexagon -*- tablegen *-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the Hexagon V60 Compiler Intrinsics in TableGen format. 
+// +//===----------------------------------------------------------------------===// + +let isCodeGenOnly = 1 in { +def HEXAGON_V6_vd0_pseudo : CVI_VA_Resource<(outs VectorRegs:$dst), + (ins ), + "$dst=#0", + [(set VectorRegs:$dst, (int_hexagon_V6_vd0 ))]>; + +def HEXAGON_V6_vd0_pseudo_128B : CVI_VA_Resource<(outs VectorRegs128B:$dst), + (ins ), + "$dst=#0", + [(set VectorRegs128B:$dst, (int_hexagon_V6_vd0_128B ))]>; +} +let isPseudo = 1 in +def HEXAGON_V6_vassignp : CVI_VA_Resource<(outs VecDblRegs:$dst), + (ins VecDblRegs:$src1), + "$dst=vassignp_W($src1)", + [(set VecDblRegs:$dst, (int_hexagon_V6_vassignp VecDblRegs:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_vassignp_128B : CVI_VA_Resource<(outs VecDblRegs128B:$dst), + (ins VecDblRegs128B:$src1), + "$dst=vassignp_W_128B($src1)", + [(set VecDblRegs128B:$dst, (int_hexagon_V6_vassignp_128B + VecDblRegs128B:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_lo : CVI_VA_Resource<(outs VectorRegs:$dst), + (ins VecDblRegs:$src1), + "$dst=lo_W($src1)", + [(set VectorRegs:$dst, (int_hexagon_V6_lo VecDblRegs:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_hi : CVI_VA_Resource<(outs VectorRegs:$dst), + (ins VecDblRegs:$src1), + "$dst=hi_W($src1)", + [(set VectorRegs:$dst, (int_hexagon_V6_hi VecDblRegs:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_lo_128B : CVI_VA_Resource<(outs VectorRegs128B:$dst), + (ins VecDblRegs128B:$src1), + "$dst=lo_W($src1)", + [(set VectorRegs128B:$dst, (int_hexagon_V6_lo_128B VecDblRegs128B:$src1))]>; + +let isPseudo = 1 in +def HEXAGON_V6_hi_128B : CVI_VA_Resource<(outs VectorRegs128B:$dst), + (ins VecDblRegs128B:$src1), + "$dst=hi_W($src1)", + [(set VectorRegs128B:$dst, (int_hexagon_V6_hi_128B VecDblRegs128B:$src1))]>; + +let AddedComplexity = 100 in { +def : Pat < (v16i32 (int_hexagon_V6_lo (v32i32 VecDblRegs:$src1))), + (v16i32 (EXTRACT_SUBREG (v32i32 VecDblRegs:$src1), subreg_loreg)) >, + Requires<[UseHVXSgl]>; + +def : Pat < (v16i32 (int_hexagon_V6_hi (v32i32 VecDblRegs:$src1))), + (v16i32 (EXTRACT_SUBREG (v32i32 VecDblRegs:$src1), subreg_hireg)) >, + Requires<[UseHVXSgl]>; + +def : Pat < (v32i32 (int_hexagon_V6_lo_128B (v64i32 VecDblRegs128B:$src1))), + (v32i32 (EXTRACT_SUBREG (v64i32 VecDblRegs128B:$src1), + subreg_loreg)) >, + Requires<[UseHVXDbl]>; + +def : Pat < (v32i32 (int_hexagon_V6_hi_128B (v64i32 VecDblRegs128B:$src1))), + (v32i32 (EXTRACT_SUBREG (v64i32 VecDblRegs128B:$src1), + subreg_hireg)) >, + Requires<[UseHVXDbl]>; +} + +def : Pat <(v512i1 (bitconvert (v16i32 VectorRegs:$src1))), + (v512i1 (V6_vandvrt(v16i32 VectorRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v512i1 (bitconvert (v32i16 VectorRegs:$src1))), + (v512i1 (V6_vandvrt(v32i16 VectorRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v512i1 (bitconvert (v64i8 VectorRegs:$src1))), + (v512i1 (V6_vandvrt(v64i8 VectorRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v512i1 (bitconvert (v8i64 VectorRegs:$src1))), + (v512i1 (V6_vandvrt(v8i64 VectorRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v16i32 (bitconvert (v512i1 VecPredRegs:$src1))), + (v16i32 (V6_vandqrt(v512i1 VecPredRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v32i16 (bitconvert (v512i1 VecPredRegs:$src1))), + (v32i16 (V6_vandqrt(v512i1 VecPredRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v64i8 (bitconvert (v512i1 VecPredRegs:$src1))), + (v64i8 (V6_vandqrt(v512i1 
VecPredRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v8i64 (bitconvert (v512i1 VecPredRegs:$src1))), + (v8i64 (V6_vandqrt(v512i1 VecPredRegs:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v1024i1 (bitconvert (v32i32 VectorRegs128B:$src1))), + (v1024i1 (V6_vandvrt_128B(v32i32 VectorRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v1024i1 (bitconvert (v64i16 VectorRegs128B:$src1))), + (v1024i1 (V6_vandvrt_128B(v64i16 VectorRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v1024i1 (bitconvert (v128i8 VectorRegs128B:$src1))), + (v1024i1 (V6_vandvrt_128B(v128i8 VectorRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v1024i1 (bitconvert (v16i64 VectorRegs128B:$src1))), + (v1024i1 (V6_vandvrt_128B(v16i64 VectorRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v32i32 (bitconvert (v1024i1 VecPredRegs128B:$src1))), + (v32i32 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v64i16 (bitconvert (v1024i1 VecPredRegs128B:$src1))), + (v64i16 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v128i8 (bitconvert (v1024i1 VecPredRegs128B:$src1))), + (v128i8 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v16i64 (bitconvert (v1024i1 VecPredRegs128B:$src1))), + (v16i64 (V6_vandqrt_128B(v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; + +let AddedComplexity = 140 in { +def : Pat <(store (v512i1 VecPredRegs:$src1), (i32 IntRegs:$addr)), + (V6_vS32b_ai IntRegs:$addr, 0, + (v16i32 (V6_vandqrt (v512i1 VecPredRegs:$src1), + (A2_tfrsi 0x01010101))))>, + Requires<[UseHVXSgl]>; + +def : Pat <(v512i1 (load (i32 IntRegs:$addr))), + (v512i1 (V6_vandvrt + (v16i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXSgl]>; + +def : Pat <(store (v1024i1 VecPredRegs128B:$src1), (i32 IntRegs:$addr)), + (V6_vS32b_ai_128B IntRegs:$addr, 0, + (v32i32 (V6_vandqrt_128B (v1024i1 VecPredRegs128B:$src1), + (A2_tfrsi 0x01010101))))>, + Requires<[UseHVXDbl]>; + +def : Pat <(v1024i1 (load (i32 IntRegs:$addr))), + (v1024i1 (V6_vandvrt_128B + (v32i32 (V6_vL32b_ai_128B IntRegs:$addr, 0)), + (A2_tfrsi 0x01010101)))>, + Requires<[UseHVXDbl]>; +} + +multiclass T_R_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID IntRegs:$src1), (MI IntRegs:$src1)>, + Requires<[UseHVXSgl]>; + def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1), + (!cast<InstHexagon>(MI#"_128B") IntRegs:$src1)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_V_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1), + (MI VectorRegs:$src1)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_Q_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1), + (MI VecPredRegs:$src1)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2), + (MI VecDblRegs:$src1, IntRegs:$src2)>, + 
Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B")VecDblRegs128B:$src1, IntRegs:$src2), + (!cast<InstHexagon>(MI#"_128B")VecDblRegs128B:$src1, IntRegs:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, IntRegs:$src2), + (MI VectorRegs:$src1, IntRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B")VectorRegs128B:$src1, IntRegs:$src2), + (!cast<InstHexagon>(MI#"_128B")VectorRegs128B:$src1, IntRegs:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2), + (MI VecDblRegs:$src1, VectorRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WW_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2), + (MI VecDblRegs:$src1, VecDblRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2), + (MI VectorRegs:$src1, VectorRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_QR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1, IntRegs:$src2), + (MI VecPredRegs:$src1, IntRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, + IntRegs:$src2), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, + IntRegs:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_QQ_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1, VecPredRegs:$src2), + (MI VecPredRegs:$src1, VecPredRegs:$src2)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, + VecPredRegs128B:$src2), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, + VecPredRegs128B:$src2)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WWR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3), + (MI VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3), + (MI VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, IntRegs:$src3), + (MI VecDblRegs:$src1, 
VectorRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VWR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VecDblRegs:$src2, IntRegs:$src3), + (MI VectorRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VVV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), + (MI VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WVV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), + (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_QVV_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), + (MI VecPredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VQR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3), + (MI VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VecPredRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VecPredRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + + +multiclass T_QVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2, IntRegs:$src3), + (MI VecPredRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3), + (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, + VectorRegs128B:$src2, + IntRegs:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VVI_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, imm:$src3), + (MI VectorRegs:$src1, VectorRegs:$src2, imm:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, imm:$src3), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, imm:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WRI_pat <InstHexagon MI, 
Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2, imm:$src3), + (MI VecDblRegs:$src1, IntRegs:$src2, imm:$src3)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + IntRegs:$src2, imm:$src3), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + IntRegs:$src2, imm:$src3)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WWRI_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3, imm:$src4), + (MI VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3, imm:$src4)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3, imm:$src4), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VecDblRegs128B:$src2, + IntRegs:$src3, imm:$src4)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_VVVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, + IntRegs:$src4), + (MI VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, + IntRegs:$src4)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3, + IntRegs:$src4), + (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3, + IntRegs:$src4)>, + Requires<[UseHVXDbl]>; +} + +multiclass T_WVVR_pat <InstHexagon MI, Intrinsic IntID> { + def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, + IntRegs:$src4), + (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, + IntRegs:$src4)>, + Requires<[UseHVXSgl]>; + + def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3, + IntRegs:$src4), + (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, + VectorRegs128B:$src2, + VectorRegs128B:$src3, + IntRegs:$src4)>, + Requires<[UseHVXDbl]>; +} + +defm : T_WR_pat<V6_vtmpyb, int_hexagon_V6_vtmpyb>; +defm : T_WR_pat <V6_vtmpybus, int_hexagon_V6_vtmpybus>; +defm : T_VR_pat <V6_vdmpyhb, int_hexagon_V6_vdmpyhb>; +defm : T_VR_pat <V6_vrmpyub, int_hexagon_V6_vrmpyub>; +defm : T_VR_pat <V6_vrmpybus, int_hexagon_V6_vrmpybus>; +defm : T_WR_pat <V6_vdsaduh, int_hexagon_V6_vdsaduh>; +defm : T_VR_pat <V6_vdmpybus, int_hexagon_V6_vdmpybus>; +defm : T_WR_pat <V6_vdmpybus_dv, int_hexagon_V6_vdmpybus_dv>; +defm : T_VR_pat <V6_vdmpyhsusat, int_hexagon_V6_vdmpyhsusat>; +defm : T_WR_pat <V6_vdmpyhsuisat, int_hexagon_V6_vdmpyhsuisat>; +defm : T_VR_pat <V6_vdmpyhsat, int_hexagon_V6_vdmpyhsat>; +defm : T_WR_pat <V6_vdmpyhisat, int_hexagon_V6_vdmpyhisat>; +defm : T_WR_pat <V6_vdmpyhb_dv, int_hexagon_V6_vdmpyhb_dv>; +defm : T_VR_pat <V6_vmpybus, int_hexagon_V6_vmpybus>; +defm : T_WR_pat <V6_vmpabus, int_hexagon_V6_vmpabus>; +defm : T_WR_pat <V6_vmpahb, int_hexagon_V6_vmpahb>; +defm : T_VR_pat <V6_vmpyh, int_hexagon_V6_vmpyh>; +defm : T_VR_pat <V6_vmpyhss, int_hexagon_V6_vmpyhss>; +defm : T_VR_pat <V6_vmpyhsrs, int_hexagon_V6_vmpyhsrs>; +defm : T_VR_pat <V6_vmpyuh, int_hexagon_V6_vmpyuh>; +defm : T_VR_pat <V6_vmpyihb, int_hexagon_V6_vmpyihb>; +defm : T_VR_pat <V6_vror, int_hexagon_V6_vror>; +defm : T_VR_pat <V6_vasrw, int_hexagon_V6_vasrw>; +defm : T_VR_pat <V6_vasrh, int_hexagon_V6_vasrh>; +defm : T_VR_pat <V6_vaslw, int_hexagon_V6_vaslw>; +defm : T_VR_pat <V6_vaslh, int_hexagon_V6_vaslh>; +defm : T_VR_pat <V6_vlsrw, int_hexagon_V6_vlsrw>; +defm : T_VR_pat <V6_vlsrh, int_hexagon_V6_vlsrh>; +defm : T_VR_pat <V6_vmpyiwh, int_hexagon_V6_vmpyiwh>; 
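
For reference, the T_VR/T_WR-style patterns above are what connect the int_hexagon_V6_* intrinsic calls emitted at the IR level to concrete V6 instructions: each defm instantiation produces one pattern guarded by UseHVXSgl (64-byte HVX) and one guarded by UseHVXDbl for the "_128B" intrinsic and instruction variants. A hedged sketch of producing such a call programmatically is shown below; the helper name is illustrative, while Intrinsic::getDeclaration and IRBuilder::CreateCall are standard LLVM API of this vintage.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

// Hedged sketch, not part of this change: emit a call to the V6_vrmpyub
// intrinsic; at instruction selection the T_VR_pat<V6_vrmpyub,
// int_hexagon_V6_vrmpyub> pattern above matches it and selects vrmpyub.
llvm::Value *emitVRMpyUB(llvm::IRBuilder<> &B, llvm::Module &M,
                         llvm::Value *Vec, llvm::Value *Scalar) {
  llvm::Function *F =
      llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::hexagon_V6_vrmpyub);
  return B.CreateCall(F, {Vec, Scalar});
}
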
+defm : T_VR_pat <V6_vmpyiwb, int_hexagon_V6_vmpyiwb>; +defm : T_WR_pat <V6_vtmpyhb, int_hexagon_V6_vtmpyhb>; +defm : T_VR_pat <V6_vmpyub, int_hexagon_V6_vmpyub>; + +defm : T_VV_pat <V6_vrmpyubv, int_hexagon_V6_vrmpyubv>; +defm : T_VV_pat <V6_vrmpybv, int_hexagon_V6_vrmpybv>; +defm : T_VV_pat <V6_vrmpybusv, int_hexagon_V6_vrmpybusv>; +defm : T_VV_pat <V6_vdmpyhvsat, int_hexagon_V6_vdmpyhvsat>; +defm : T_VV_pat <V6_vmpybv, int_hexagon_V6_vmpybv>; +defm : T_VV_pat <V6_vmpyubv, int_hexagon_V6_vmpyubv>; +defm : T_VV_pat <V6_vmpybusv, int_hexagon_V6_vmpybusv>; +defm : T_VV_pat <V6_vmpyhv, int_hexagon_V6_vmpyhv>; +defm : T_VV_pat <V6_vmpyuhv, int_hexagon_V6_vmpyuhv>; +defm : T_VV_pat <V6_vmpyhvsrs, int_hexagon_V6_vmpyhvsrs>; +defm : T_VV_pat <V6_vmpyhus, int_hexagon_V6_vmpyhus>; +defm : T_WW_pat <V6_vmpabusv, int_hexagon_V6_vmpabusv>; +defm : T_VV_pat <V6_vmpyih, int_hexagon_V6_vmpyih>; +defm : T_VV_pat <V6_vand, int_hexagon_V6_vand>; +defm : T_VV_pat <V6_vor, int_hexagon_V6_vor>; +defm : T_VV_pat <V6_vxor, int_hexagon_V6_vxor>; +defm : T_VV_pat <V6_vaddw, int_hexagon_V6_vaddw>; +defm : T_VV_pat <V6_vaddubsat, int_hexagon_V6_vaddubsat>; +defm : T_VV_pat <V6_vadduhsat, int_hexagon_V6_vadduhsat>; +defm : T_VV_pat <V6_vaddhsat, int_hexagon_V6_vaddhsat>; +defm : T_VV_pat <V6_vaddwsat, int_hexagon_V6_vaddwsat>; +defm : T_VV_pat <V6_vsubb, int_hexagon_V6_vsubb>; +defm : T_VV_pat <V6_vsubh, int_hexagon_V6_vsubh>; +defm : T_VV_pat <V6_vsubw, int_hexagon_V6_vsubw>; +defm : T_VV_pat <V6_vsububsat, int_hexagon_V6_vsububsat>; +defm : T_VV_pat <V6_vsubuhsat, int_hexagon_V6_vsubuhsat>; +defm : T_VV_pat <V6_vsubhsat, int_hexagon_V6_vsubhsat>; +defm : T_VV_pat <V6_vsubwsat, int_hexagon_V6_vsubwsat>; +defm : T_WW_pat <V6_vaddb_dv, int_hexagon_V6_vaddb_dv>; +defm : T_WW_pat <V6_vaddh_dv, int_hexagon_V6_vaddh_dv>; +defm : T_WW_pat <V6_vaddw_dv, int_hexagon_V6_vaddw_dv>; +defm : T_WW_pat <V6_vaddubsat_dv, int_hexagon_V6_vaddubsat_dv>; +defm : T_WW_pat <V6_vadduhsat_dv, int_hexagon_V6_vadduhsat_dv>; +defm : T_WW_pat <V6_vaddhsat_dv, int_hexagon_V6_vaddhsat_dv>; +defm : T_WW_pat <V6_vaddwsat_dv, int_hexagon_V6_vaddwsat_dv>; +defm : T_WW_pat <V6_vsubb_dv, int_hexagon_V6_vsubb_dv>; +defm : T_WW_pat <V6_vsubh_dv, int_hexagon_V6_vsubh_dv>; +defm : T_WW_pat <V6_vsubw_dv, int_hexagon_V6_vsubw_dv>; +defm : T_WW_pat <V6_vsububsat_dv, int_hexagon_V6_vsububsat_dv>; +defm : T_WW_pat <V6_vsubuhsat_dv, int_hexagon_V6_vsubuhsat_dv>; +defm : T_WW_pat <V6_vsubhsat_dv, int_hexagon_V6_vsubhsat_dv>; +defm : T_WW_pat <V6_vsubwsat_dv, int_hexagon_V6_vsubwsat_dv>; +defm : T_VV_pat <V6_vaddubh, int_hexagon_V6_vaddubh>; +defm : T_VV_pat <V6_vadduhw, int_hexagon_V6_vadduhw>; +defm : T_VV_pat <V6_vaddhw, int_hexagon_V6_vaddhw>; +defm : T_VV_pat <V6_vsububh, int_hexagon_V6_vsububh>; +defm : T_VV_pat <V6_vsubuhw, int_hexagon_V6_vsubuhw>; +defm : T_VV_pat <V6_vsubhw, int_hexagon_V6_vsubhw>; +defm : T_VV_pat <V6_vabsdiffub, int_hexagon_V6_vabsdiffub>; +defm : T_VV_pat <V6_vabsdiffh, int_hexagon_V6_vabsdiffh>; +defm : T_VV_pat <V6_vabsdiffuh, int_hexagon_V6_vabsdiffuh>; +defm : T_VV_pat <V6_vabsdiffw, int_hexagon_V6_vabsdiffw>; +defm : T_VV_pat <V6_vavgub, int_hexagon_V6_vavgub>; +defm : T_VV_pat <V6_vavguh, int_hexagon_V6_vavguh>; +defm : T_VV_pat <V6_vavgh, int_hexagon_V6_vavgh>; +defm : T_VV_pat <V6_vavgw, int_hexagon_V6_vavgw>; +defm : T_VV_pat <V6_vnavgub, int_hexagon_V6_vnavgub>; +defm : T_VV_pat <V6_vnavgh, int_hexagon_V6_vnavgh>; +defm : T_VV_pat <V6_vnavgw, int_hexagon_V6_vnavgw>; +defm : T_VV_pat <V6_vavgubrnd, int_hexagon_V6_vavgubrnd>; 
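
Each of these defm lines, in turn, corresponds to an intrinsic that user code can reach through a compiler builtin. As a rough illustration only, the snippet below calls V6_vaddw; the __builtin_HEXAGON_V6_vaddw name and the 64-byte vector typedef are assumptions about clang's usual HVX conventions rather than something taken from this diff.

// Hedged sketch: assumes clang exposes int_hexagon_V6_vaddw as
// __builtin_HEXAGON_V6_vaddw and that a 64-byte HVX register can be modelled
// as a GCC-style vector of 16 x i32 (both are assumptions, not from the diff).
typedef int hvx_v16i32 __attribute__((__vector_size__(64)));

hvx_v16i32 add_words(hvx_v16i32 a, hvx_v16i32 b) {
  // In HVX single (64-byte) mode this is the call that
  // T_VV_pat<V6_vaddw, int_hexagon_V6_vaddw> selects into a vaddw instruction;
  // in 128-byte mode the "_128B" builtin and pattern are used instead.
  return __builtin_HEXAGON_V6_vaddw(a, b);
}
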
+defm : T_VV_pat <V6_vavguhrnd, int_hexagon_V6_vavguhrnd>; +defm : T_VV_pat <V6_vavghrnd, int_hexagon_V6_vavghrnd>; +defm : T_VV_pat <V6_vavgwrnd, int_hexagon_V6_vavgwrnd>; +defm : T_WW_pat <V6_vmpabuuv, int_hexagon_V6_vmpabuuv>; + +defm : T_VVR_pat <V6_vdmpyhb_acc, int_hexagon_V6_vdmpyhb_acc>; +defm : T_VVR_pat <V6_vrmpyub_acc, int_hexagon_V6_vrmpyub_acc>; +defm : T_VVR_pat <V6_vrmpybus_acc, int_hexagon_V6_vrmpybus_acc>; +defm : T_VVR_pat <V6_vdmpybus_acc, int_hexagon_V6_vdmpybus_acc>; +defm : T_VVR_pat <V6_vdmpyhsusat_acc, int_hexagon_V6_vdmpyhsusat_acc>; +defm : T_VVR_pat <V6_vdmpyhsat_acc, int_hexagon_V6_vdmpyhsat_acc>; +defm : T_VVR_pat <V6_vmpyiwb_acc, int_hexagon_V6_vmpyiwb_acc>; +defm : T_VVR_pat <V6_vmpyiwh_acc, int_hexagon_V6_vmpyiwh_acc>; +defm : T_VVR_pat <V6_vmpyihb_acc, int_hexagon_V6_vmpyihb_acc>; +defm : T_VVR_pat <V6_vaslw_acc, int_hexagon_V6_vaslw_acc>; +defm : T_VVR_pat <V6_vasrw_acc, int_hexagon_V6_vasrw_acc>; + +defm : T_VWR_pat <V6_vdmpyhsuisat_acc, int_hexagon_V6_vdmpyhsuisat_acc>; +defm : T_VWR_pat <V6_vdmpyhisat_acc, int_hexagon_V6_vdmpyhisat_acc>; + +defm : T_WVR_pat <V6_vmpybus_acc, int_hexagon_V6_vmpybus_acc>; +defm : T_WVR_pat <V6_vmpyhsat_acc, int_hexagon_V6_vmpyhsat_acc>; +defm : T_WVR_pat <V6_vmpyuh_acc, int_hexagon_V6_vmpyuh_acc>; +defm : T_WVR_pat <V6_vmpyub_acc, int_hexagon_V6_vmpyub_acc>; + +defm : T_WWR_pat <V6_vtmpyb_acc, int_hexagon_V6_vtmpyb_acc>; +defm : T_WWR_pat <V6_vtmpybus_acc, int_hexagon_V6_vtmpybus_acc>; +defm : T_WWR_pat <V6_vtmpyhb_acc, int_hexagon_V6_vtmpyhb_acc>; +defm : T_WWR_pat <V6_vdmpybus_dv_acc, int_hexagon_V6_vdmpybus_dv_acc>; +defm : T_WWR_pat <V6_vdmpyhb_dv_acc, int_hexagon_V6_vdmpyhb_dv_acc>; +defm : T_WWR_pat <V6_vmpabus_acc, int_hexagon_V6_vmpabus_acc>; +defm : T_WWR_pat <V6_vmpahb_acc, int_hexagon_V6_vmpahb_acc>; +defm : T_WWR_pat <V6_vdsaduh_acc, int_hexagon_V6_vdsaduh_acc>; + +defm : T_VVV_pat <V6_vdmpyhvsat_acc, int_hexagon_V6_vdmpyhvsat_acc>; +defm : T_WVV_pat <V6_vmpybusv_acc, int_hexagon_V6_vmpybusv_acc>; +defm : T_WVV_pat <V6_vmpybv_acc, int_hexagon_V6_vmpybv_acc>; +defm : T_WVV_pat <V6_vmpyhus_acc, int_hexagon_V6_vmpyhus_acc>; +defm : T_WVV_pat <V6_vmpyhv_acc, int_hexagon_V6_vmpyhv_acc>; +defm : T_VVV_pat <V6_vmpyiewh_acc, int_hexagon_V6_vmpyiewh_acc>; +defm : T_VVV_pat <V6_vmpyiewuh_acc, int_hexagon_V6_vmpyiewuh_acc>; +defm : T_VVV_pat <V6_vmpyih_acc, int_hexagon_V6_vmpyih_acc>; +defm : T_VVV_pat <V6_vmpyowh_rnd_sacc, int_hexagon_V6_vmpyowh_rnd_sacc>; +defm : T_VVV_pat <V6_vmpyowh_sacc, int_hexagon_V6_vmpyowh_sacc>; +defm : T_WVV_pat <V6_vmpyubv_acc, int_hexagon_V6_vmpyubv_acc>; +defm : T_WVV_pat <V6_vmpyuhv_acc, int_hexagon_V6_vmpyuhv_acc>; +defm : T_VVV_pat <V6_vrmpybusv_acc, int_hexagon_V6_vrmpybusv_acc>; +defm : T_VVV_pat <V6_vrmpybv_acc, int_hexagon_V6_vrmpybv_acc>; +defm : T_VVV_pat <V6_vrmpyubv_acc, int_hexagon_V6_vrmpyubv_acc>; + +// Compare instructions +defm : T_QVV_pat <V6_veqb_and, int_hexagon_V6_veqb_and>; +defm : T_QVV_pat <V6_veqh_and, int_hexagon_V6_veqh_and>; +defm : T_QVV_pat <V6_veqw_and, int_hexagon_V6_veqw_and>; +defm : T_QVV_pat <V6_vgtb_and, int_hexagon_V6_vgtb_and>; +defm : T_QVV_pat <V6_vgth_and, int_hexagon_V6_vgth_and>; +defm : T_QVV_pat <V6_vgtw_and, int_hexagon_V6_vgtw_and>; +defm : T_QVV_pat <V6_vgtub_and, int_hexagon_V6_vgtub_and>; +defm : T_QVV_pat <V6_vgtuh_and, int_hexagon_V6_vgtuh_and>; +defm : T_QVV_pat <V6_vgtuw_and, int_hexagon_V6_vgtuw_and>; +defm : T_QVV_pat <V6_veqb_or, int_hexagon_V6_veqb_or>; +defm : T_QVV_pat <V6_veqh_or, int_hexagon_V6_veqh_or>; +defm : T_QVV_pat 
<V6_veqw_or, int_hexagon_V6_veqw_or>; +defm : T_QVV_pat <V6_vgtb_or, int_hexagon_V6_vgtb_or>; +defm : T_QVV_pat <V6_vgth_or, int_hexagon_V6_vgth_or>; +defm : T_QVV_pat <V6_vgtw_or, int_hexagon_V6_vgtw_or>; +defm : T_QVV_pat <V6_vgtub_or, int_hexagon_V6_vgtub_or>; +defm : T_QVV_pat <V6_vgtuh_or, int_hexagon_V6_vgtuh_or>; +defm : T_QVV_pat <V6_vgtuw_or, int_hexagon_V6_vgtuw_or>; +defm : T_QVV_pat <V6_veqb_xor, int_hexagon_V6_veqb_xor>; +defm : T_QVV_pat <V6_veqh_xor, int_hexagon_V6_veqh_xor>; +defm : T_QVV_pat <V6_veqw_xor, int_hexagon_V6_veqw_xor>; +defm : T_QVV_pat <V6_vgtb_xor, int_hexagon_V6_vgtb_xor>; +defm : T_QVV_pat <V6_vgth_xor, int_hexagon_V6_vgth_xor>; +defm : T_QVV_pat <V6_vgtw_xor, int_hexagon_V6_vgtw_xor>; +defm : T_QVV_pat <V6_vgtub_xor, int_hexagon_V6_vgtub_xor>; +defm : T_QVV_pat <V6_vgtuh_xor, int_hexagon_V6_vgtuh_xor>; +defm : T_QVV_pat <V6_vgtuw_xor, int_hexagon_V6_vgtuw_xor>; + +defm : T_VV_pat <V6_vminub, int_hexagon_V6_vminub>; +defm : T_VV_pat <V6_vminuh, int_hexagon_V6_vminuh>; +defm : T_VV_pat <V6_vminh, int_hexagon_V6_vminh>; +defm : T_VV_pat <V6_vminw, int_hexagon_V6_vminw>; +defm : T_VV_pat <V6_vmaxub, int_hexagon_V6_vmaxub>; +defm : T_VV_pat <V6_vmaxuh, int_hexagon_V6_vmaxuh>; +defm : T_VV_pat <V6_vmaxh, int_hexagon_V6_vmaxh>; +defm : T_VV_pat <V6_vmaxw, int_hexagon_V6_vmaxw>; +defm : T_VV_pat <V6_vdelta, int_hexagon_V6_vdelta>; +defm : T_VV_pat <V6_vrdelta, int_hexagon_V6_vrdelta>; +defm : T_VV_pat <V6_vdealb4w, int_hexagon_V6_vdealb4w>; +defm : T_VV_pat <V6_vmpyowh_rnd, int_hexagon_V6_vmpyowh_rnd>; +defm : T_VV_pat <V6_vshuffeb, int_hexagon_V6_vshuffeb>; +defm : T_VV_pat <V6_vshuffob, int_hexagon_V6_vshuffob>; +defm : T_VV_pat <V6_vshufeh, int_hexagon_V6_vshufeh>; +defm : T_VV_pat <V6_vshufoh, int_hexagon_V6_vshufoh>; +defm : T_VV_pat <V6_vshufoeh, int_hexagon_V6_vshufoeh>; +defm : T_VV_pat <V6_vshufoeb, int_hexagon_V6_vshufoeb>; +defm : T_VV_pat <V6_vcombine, int_hexagon_V6_vcombine>; +defm : T_VV_pat <V6_vmpyieoh, int_hexagon_V6_vmpyieoh>; +defm : T_VV_pat <V6_vsathub, int_hexagon_V6_vsathub>; +defm : T_VV_pat <V6_vsatwh, int_hexagon_V6_vsatwh>; +defm : T_VV_pat <V6_vroundwh, int_hexagon_V6_vroundwh>; +defm : T_VV_pat <V6_vroundwuh, int_hexagon_V6_vroundwuh>; +defm : T_VV_pat <V6_vroundhb, int_hexagon_V6_vroundhb>; +defm : T_VV_pat <V6_vroundhub, int_hexagon_V6_vroundhub>; +defm : T_VV_pat <V6_vasrwv, int_hexagon_V6_vasrwv>; +defm : T_VV_pat <V6_vlsrwv, int_hexagon_V6_vlsrwv>; +defm : T_VV_pat <V6_vlsrhv, int_hexagon_V6_vlsrhv>; +defm : T_VV_pat <V6_vasrhv, int_hexagon_V6_vasrhv>; +defm : T_VV_pat <V6_vaslwv, int_hexagon_V6_vaslwv>; +defm : T_VV_pat <V6_vaslhv, int_hexagon_V6_vaslhv>; +defm : T_VV_pat <V6_vaddb, int_hexagon_V6_vaddb>; +defm : T_VV_pat <V6_vaddh, int_hexagon_V6_vaddh>; +defm : T_VV_pat <V6_vmpyiewuh, int_hexagon_V6_vmpyiewuh>; +defm : T_VV_pat <V6_vmpyiowh, int_hexagon_V6_vmpyiowh>; +defm : T_VV_pat <V6_vpackeb, int_hexagon_V6_vpackeb>; +defm : T_VV_pat <V6_vpackeh, int_hexagon_V6_vpackeh>; +defm : T_VV_pat <V6_vpackhub_sat, int_hexagon_V6_vpackhub_sat>; +defm : T_VV_pat <V6_vpackhb_sat, int_hexagon_V6_vpackhb_sat>; +defm : T_VV_pat <V6_vpackwuh_sat, int_hexagon_V6_vpackwuh_sat>; +defm : T_VV_pat <V6_vpackwh_sat, int_hexagon_V6_vpackwh_sat>; +defm : T_VV_pat <V6_vpackob, int_hexagon_V6_vpackob>; +defm : T_VV_pat <V6_vpackoh, int_hexagon_V6_vpackoh>; +defm : T_VV_pat <V6_vmpyewuh, int_hexagon_V6_vmpyewuh>; +defm : T_VV_pat <V6_vmpyowh, int_hexagon_V6_vmpyowh>; + +defm : T_QVV_pat <V6_vaddbq, int_hexagon_V6_vaddbq>; +defm : T_QVV_pat 
<V6_vaddhq, int_hexagon_V6_vaddhq>; +defm : T_QVV_pat <V6_vaddwq, int_hexagon_V6_vaddwq>; +defm : T_QVV_pat <V6_vaddbnq, int_hexagon_V6_vaddbnq>; +defm : T_QVV_pat <V6_vaddhnq, int_hexagon_V6_vaddhnq>; +defm : T_QVV_pat <V6_vaddwnq, int_hexagon_V6_vaddwnq>; +defm : T_QVV_pat <V6_vsubbq, int_hexagon_V6_vsubbq>; +defm : T_QVV_pat <V6_vsubhq, int_hexagon_V6_vsubhq>; +defm : T_QVV_pat <V6_vsubwq, int_hexagon_V6_vsubwq>; +defm : T_QVV_pat <V6_vsubbnq, int_hexagon_V6_vsubbnq>; +defm : T_QVV_pat <V6_vsubhnq, int_hexagon_V6_vsubhnq>; +defm : T_QVV_pat <V6_vsubwnq, int_hexagon_V6_vsubwnq>; + +defm : T_V_pat <V6_vabsh, int_hexagon_V6_vabsh>; +defm : T_V_pat <V6_vabsw, int_hexagon_V6_vabsw>; +defm : T_V_pat <V6_vabsw_sat, int_hexagon_V6_vabsw_sat>; +defm : T_V_pat <V6_vabsh_sat, int_hexagon_V6_vabsh_sat>; +defm : T_V_pat <V6_vnot, int_hexagon_V6_vnot>; +defm : T_V_pat <V6_vassign, int_hexagon_V6_vassign>; +defm : T_V_pat <V6_vzb, int_hexagon_V6_vzb>; +defm : T_V_pat <V6_vzh, int_hexagon_V6_vzh>; +defm : T_V_pat <V6_vsb, int_hexagon_V6_vsb>; +defm : T_V_pat <V6_vsh, int_hexagon_V6_vsh>; +defm : T_V_pat <V6_vdealh, int_hexagon_V6_vdealh>; +defm : T_V_pat <V6_vdealb, int_hexagon_V6_vdealb>; +defm : T_V_pat <V6_vunpackub, int_hexagon_V6_vunpackub>; +defm : T_V_pat <V6_vunpackuh, int_hexagon_V6_vunpackuh>; +defm : T_V_pat <V6_vunpackb, int_hexagon_V6_vunpackb>; +defm : T_V_pat <V6_vunpackh, int_hexagon_V6_vunpackh>; +defm : T_V_pat <V6_vshuffh, int_hexagon_V6_vshuffh>; +defm : T_V_pat <V6_vshuffb, int_hexagon_V6_vshuffb>; +defm : T_V_pat <V6_vcl0w, int_hexagon_V6_vcl0w>; +defm : T_V_pat <V6_vpopcounth, int_hexagon_V6_vpopcounth>; +defm : T_V_pat <V6_vcl0h, int_hexagon_V6_vcl0h>; +defm : T_V_pat <V6_vnormamtw, int_hexagon_V6_vnormamtw>; +defm : T_V_pat <V6_vnormamth, int_hexagon_V6_vnormamth>; + +defm : T_WRI_pat <V6_vrmpybusi, int_hexagon_V6_vrmpybusi>; +defm : T_WRI_pat <V6_vrsadubi, int_hexagon_V6_vrsadubi>; +defm : T_WRI_pat <V6_vrmpyubi, int_hexagon_V6_vrmpyubi>; + +defm : T_WWRI_pat <V6_vrmpybusi_acc, int_hexagon_V6_vrmpybusi_acc>; +defm : T_WWRI_pat <V6_vrsadubi_acc, int_hexagon_V6_vrsadubi_acc>; +defm : T_WWRI_pat <V6_vrmpyubi_acc, int_hexagon_V6_vrmpyubi_acc>; + +// assembler mapped. +//defm : T_V_pat <V6_vtran2x2, int_hexagon_V6_vtran2x2>; +// not present earlier.. 
need to add intrinsic +defm : T_VVR_pat <V6_valignb, int_hexagon_V6_valignb>; +defm : T_VVR_pat <V6_vlalignb, int_hexagon_V6_vlalignb>; +defm : T_VVR_pat <V6_vasrwh, int_hexagon_V6_vasrwh>; +defm : T_VVR_pat <V6_vasrwhsat, int_hexagon_V6_vasrwhsat>; +defm : T_VVR_pat <V6_vasrwhrndsat, int_hexagon_V6_vasrwhrndsat>; +defm : T_VVR_pat <V6_vasrwuhsat, int_hexagon_V6_vasrwuhsat>; +defm : T_VVR_pat <V6_vasrhubsat, int_hexagon_V6_vasrhubsat>; +defm : T_VVR_pat <V6_vasrhubrndsat, int_hexagon_V6_vasrhubrndsat>; +defm : T_VVR_pat <V6_vasrhbrndsat, int_hexagon_V6_vasrhbrndsat>; + +defm : T_VVR_pat <V6_vshuffvdd, int_hexagon_V6_vshuffvdd>; +defm : T_VVR_pat <V6_vdealvdd, int_hexagon_V6_vdealvdd>; + +defm : T_WV_pat <V6_vunpackob, int_hexagon_V6_vunpackob>; +defm : T_WV_pat <V6_vunpackoh, int_hexagon_V6_vunpackoh>; +defm : T_VVI_pat <V6_valignbi, int_hexagon_V6_valignbi>; +defm : T_VVI_pat <V6_vlalignbi, int_hexagon_V6_vlalignbi>; + +defm : T_QVV_pat <V6_vswap, int_hexagon_V6_vswap>; +defm : T_QVV_pat <V6_vmux, int_hexagon_V6_vmux>; +defm : T_QQ_pat <V6_pred_and, int_hexagon_V6_pred_and>; +defm : T_QQ_pat <V6_pred_or, int_hexagon_V6_pred_or>; +defm : T_Q_pat <V6_pred_not, int_hexagon_V6_pred_not>; +defm : T_QQ_pat <V6_pred_xor, int_hexagon_V6_pred_xor>; +defm : T_QQ_pat <V6_pred_or_n, int_hexagon_V6_pred_or_n>; +defm : T_QQ_pat <V6_pred_and_n, int_hexagon_V6_pred_and_n>; +defm : T_VV_pat <V6_veqb, int_hexagon_V6_veqb>; +defm : T_VV_pat <V6_veqh, int_hexagon_V6_veqh>; +defm : T_VV_pat <V6_veqw, int_hexagon_V6_veqw>; +defm : T_VV_pat <V6_vgtb, int_hexagon_V6_vgtb>; +defm : T_VV_pat <V6_vgth, int_hexagon_V6_vgth>; +defm : T_VV_pat <V6_vgtw, int_hexagon_V6_vgtw>; +defm : T_VV_pat <V6_vgtub, int_hexagon_V6_vgtub>; +defm : T_VV_pat <V6_vgtuh, int_hexagon_V6_vgtuh>; +defm : T_VV_pat <V6_vgtuw, int_hexagon_V6_vgtuw>; + +defm : T_VQR_pat <V6_vandqrt_acc, int_hexagon_V6_vandqrt_acc>; +defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>; +defm : T_QR_pat <V6_vandqrt, int_hexagon_V6_vandqrt>; +defm : T_R_pat <V6_lvsplatw, int_hexagon_V6_lvsplatw>; +defm : T_R_pat <V6_pred_scalar2, int_hexagon_V6_pred_scalar2>; +defm : T_VR_pat <V6_vandvrt, int_hexagon_V6_vandvrt>; + +defm : T_VVR_pat <V6_vlutvvb, int_hexagon_V6_vlutvvb>; +defm : T_VVR_pat <V6_vlutvwh, int_hexagon_V6_vlutvwh>; +defm : T_VVVR_pat <V6_vlutvvb_oracc, int_hexagon_V6_vlutvvb_oracc>; +defm : T_WVVR_pat <V6_vlutvwh_oracc, int_hexagon_V6_vlutvwh_oracc>; + +defm : T_QVR_pat <V6_vandvrt_acc, int_hexagon_V6_vandvrt_acc>; +def : T_PI_pat <S6_rol_i_p, int_hexagon_S6_rol_i_p>; +def : T_RI_pat <S6_rol_i_r, int_hexagon_S6_rol_i_r>; +def : T_PPI_pat <S6_rol_i_p_nac, int_hexagon_S6_rol_i_p_nac>; +def : T_PPI_pat <S6_rol_i_p_acc, int_hexagon_S6_rol_i_p_acc>; +def : T_PPI_pat <S6_rol_i_p_and, int_hexagon_S6_rol_i_p_and>; +def : T_PPI_pat <S6_rol_i_p_or, int_hexagon_S6_rol_i_p_or>; +def : T_PPI_pat <S6_rol_i_p_xacc, int_hexagon_S6_rol_i_p_xacc>; +def : T_RRI_pat <S6_rol_i_r_nac, int_hexagon_S6_rol_i_r_nac>; +def : T_RRI_pat <S6_rol_i_r_acc, int_hexagon_S6_rol_i_r_acc>; +def : T_RRI_pat <S6_rol_i_r_and, int_hexagon_S6_rol_i_r_and>; +def : T_RRI_pat <S6_rol_i_r_or, int_hexagon_S6_rol_i_r_or>; +def : T_RRI_pat <S6_rol_i_r_xacc, int_hexagon_S6_rol_i_r_xacc>; + +defm : T_VR_pat <V6_extractw, int_hexagon_V6_extractw>; +defm : T_VR_pat <V6_vinsertwr, int_hexagon_V6_vinsertwr>; + +def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>; + +def: Pat<(v64i16 (trunc v64i32:$Vdd)), + (v64i16 (V6_vpackwh_sat_128B + (v32i32 (HEXAGON_V6_hi_128B VecDblRegs128B:$Vdd)), + 
(v32i32 (HEXAGON_V6_lo_128B VecDblRegs128B:$Vdd))))>, + Requires<[UseHVXDbl]>; + + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp index 75189b6..624c0f6 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp @@ -26,39 +26,71 @@ using namespace llvm; -static MCOperand GetSymbolRef(const MachineOperand& MO, const MCSymbol* Symbol, - HexagonAsmPrinter& Printer) { +namespace llvm { + void HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP); +} + +static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, + HexagonAsmPrinter &Printer) { MCContext &MC = Printer.OutContext; const MCExpr *ME; - ME = MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, MC); + // Populate the relocation type based on Hexagon target flags + // set on an operand + MCSymbolRefExpr::VariantKind RelocationType; + switch (MO.getTargetFlags()) { + default: + RelocationType = MCSymbolRefExpr::VK_None; + break; + case HexagonII::MO_PCREL: + RelocationType = MCSymbolRefExpr::VK_Hexagon_PCREL; + break; + case HexagonII::MO_GOT: + RelocationType = MCSymbolRefExpr::VK_GOT; + break; + case HexagonII::MO_LO16: + RelocationType = MCSymbolRefExpr::VK_Hexagon_LO16; + break; + case HexagonII::MO_HI16: + RelocationType = MCSymbolRefExpr::VK_Hexagon_HI16; + break; + case HexagonII::MO_GPREL: + RelocationType = MCSymbolRefExpr::VK_Hexagon_GPREL; + break; + } + + ME = MCSymbolRefExpr::create(Symbol, RelocationType, MC); if (!MO.isJTI() && MO.getOffset()) ME = MCBinaryExpr::createAdd(ME, MCConstantExpr::create(MO.getOffset(), MC), MC); - return (MCOperand::createExpr(ME)); + return MCOperand::createExpr(ME); } // Create an MCInst from a MachineInstr -void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB, - HexagonAsmPrinter& AP) { - if(MI->getOpcode() == Hexagon::ENDLOOP0){ +void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI, + MCInst &MCB, HexagonAsmPrinter &AP) { + if (MI->getOpcode() == Hexagon::ENDLOOP0) { HexagonMCInstrInfo::setInnerLoop(MCB); return; } - if(MI->getOpcode() == Hexagon::ENDLOOP1){ + if (MI->getOpcode() == Hexagon::ENDLOOP1) { HexagonMCInstrInfo::setOuterLoop(MCB); return; } - MCInst* MCI = new (AP.OutContext) MCInst; + MCInst *MCI = new (AP.OutContext) MCInst; MCI->setOpcode(MI->getOpcode()); assert(MCI->getOpcode() == static_cast<unsigned>(MI->getOpcode()) && "MCI opcode should have been set on construction"); + bool MustExtend = false; for (unsigned i = 0, e = MI->getNumOperands(); i < e; i++) { const MachineOperand &MO = MI->getOperand(i); MCOperand MCO; + if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended) + MustExtend = true; switch (MO.getType()) { default: @@ -73,11 +105,14 @@ void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB, APFloat Val = MO.getFPImm()->getValueAPF(); // FP immediates are used only when setting GPRs, so they may be dealt // with like regular immediates from this point on. 
- MCO = MCOperand::createImm(*Val.bitcastToAPInt().getRawData()); + MCO = MCOperand::createExpr( + MCConstantExpr::create(*Val.bitcastToAPInt().getRawData(), + AP.OutContext)); break; } case MachineOperand::MO_Immediate: - MCO = MCOperand::createImm(MO.getImm()); + MCO = MCOperand::createExpr( + MCConstantExpr::create(MO.getImm(), AP.OutContext)); break; case MachineOperand::MO_MachineBasicBlock: MCO = MCOperand::createExpr @@ -104,5 +139,8 @@ void llvm::HexagonLowerToMC(MachineInstr const* MI, MCInst& MCB, MCI->addOperand(MCO); } + AP.HexagonProcessInstruction(*MCI, *MI); + HexagonMCInstrInfo::extendIfNeeded(AP.OutContext, MCII, MCB, *MCI, + MustExtend); MCB.addOperand(MCOperand::createInst(MCI)); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp index 35f732c..7a52d68 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonMachineScheduler.cpp @@ -179,7 +179,11 @@ void VLIWMachineScheduler::schedule() { initQueues(TopRoots, BotRoots); bool IsTopNode = false; - while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) { + while (true) { + DEBUG(dbgs() << "** VLIWMachineScheduler::schedule picking next node\n"); + SUnit *SU = SchedImpl->pickNode(IsTopNode); + if (!SU) break; + if (!checkSchedLimit()) break; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp index 707bfdb..20c4ab1 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp @@ -92,6 +92,7 @@ namespace { /// \brief A handle to the branch probability pass. const MachineBranchProbabilityInfo *MBPI; + bool isNewValueJumpCandidate(const MachineInstr *MI) const; }; } // end of anonymous namespace @@ -280,9 +281,9 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII, return true; } -// Given a compare operator, return a matching New Value Jump -// compare operator. Make sure that MI here is included in -// HexagonInstrInfo.cpp::isNewValueJumpCandidate + +// Given a compare operator, return a matching New Value Jump compare operator. +// Make sure that MI here is included in isNewValueJumpCandidate. static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg, bool secondRegNewified, MachineBasicBlock *jmpTarget, @@ -341,6 +342,24 @@ static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg, return taken ? Hexagon::J4_cmpgtui_t_jumpnv_t : Hexagon::J4_cmpgtui_t_jumpnv_nt; + case Hexagon::C4_cmpneq: + return taken ? Hexagon::J4_cmpeq_f_jumpnv_t + : Hexagon::J4_cmpeq_f_jumpnv_nt; + + case Hexagon::C4_cmplte: + if (secondRegNewified) + return taken ? Hexagon::J4_cmplt_f_jumpnv_t + : Hexagon::J4_cmplt_f_jumpnv_nt; + return taken ? Hexagon::J4_cmpgt_f_jumpnv_t + : Hexagon::J4_cmpgt_f_jumpnv_nt; + + case Hexagon::C4_cmplteu: + if (secondRegNewified) + return taken ? Hexagon::J4_cmpltu_f_jumpnv_t + : Hexagon::J4_cmpltu_f_jumpnv_nt; + return taken ? 
Hexagon::J4_cmpgtu_f_jumpnv_t + : Hexagon::J4_cmpgtu_f_jumpnv_nt; + default: llvm_unreachable("Could not find matching New Value Jump instruction."); } @@ -348,6 +367,26 @@ static unsigned getNewValueJumpOpcode(MachineInstr *MI, int reg, return 0; } +bool HexagonNewValueJump::isNewValueJumpCandidate(const MachineInstr *MI) + const { + switch (MI->getOpcode()) { + case Hexagon::C2_cmpeq: + case Hexagon::C2_cmpeqi: + case Hexagon::C2_cmpgt: + case Hexagon::C2_cmpgti: + case Hexagon::C2_cmpgtu: + case Hexagon::C2_cmpgtui: + case Hexagon::C4_cmpneq: + case Hexagon::C4_cmplte: + case Hexagon::C4_cmplteu: + return true; + + default: + return false; + } +} + + bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n" @@ -372,7 +411,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { // Loop through all the bb's of the function for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; DEBUG(dbgs() << "** dumping bb ** " << MBB->getNumber() << "\n"); @@ -468,7 +507,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { MI->getOperand(0).getReg() == predReg) { // Not all compares can be new value compare. Arch Spec: 7.6.1.1 - if (QII->isNewValueJumpCandidate(MI)) { + if (isNewValueJumpCandidate(MI)) { assert((MI->getDesc().isCompare()) && "Only compare instruction can be collapsed into New Value Jump"); @@ -591,8 +630,8 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) { DebugLoc dl = MI->getDebugLoc(); MachineInstr *NewMI; - assert((QII->isNewValueJumpCandidate(cmpInstr)) && - "This compare is not a New Value Jump candidate."); + assert((isNewValueJumpCandidate(cmpInstr)) && + "This compare is not a New Value Jump candidate."); unsigned opc = getNewValueJumpOpcode(cmpInstr, cmpOp2, isSecondOpNewified, jmpTarget, MBPI); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td index 2bece8f..fbd29cd 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonOperands.td @@ -1,4 +1,4 @@ -//===- HexagonOperands.td - Hexagon immediate processing -*- tablegen -*-===// +//===- HexagonImmediates.td - Hexagon immediate processing -*- tablegen -*-===// // // The LLVM Compiler Infrastructure // @@ -7,59 +7,114 @@ // //===----------------------------------------------------------------------===// +def s32ImmOperand : AsmOperandClass { let Name = "s32Imm"; } +def s8ImmOperand : AsmOperandClass { let Name = "s8Imm"; } +def s8Imm64Operand : AsmOperandClass { let Name = "s8Imm64"; } +def s6ImmOperand : AsmOperandClass { let Name = "s6Imm"; } +def s4ImmOperand : AsmOperandClass { let Name = "s4Imm"; } def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; } def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; } def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; } def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; } - +def s4_6ImmOperand : AsmOperandClass { let Name = "s4_6Imm"; } +def s3_6ImmOperand : AsmOperandClass { let Name = "s3_6Imm"; } +def u64ImmOperand : AsmOperandClass { let Name = "u64Imm"; } +def u32ImmOperand : AsmOperandClass { let Name = "u32Imm"; } +def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; } +def u16ImmOperand : AsmOperandClass { let Name = "u16Imm"; } +def u16_0ImmOperand : AsmOperandClass { let 
Name = "u16_0Imm"; } +def u16_1ImmOperand : AsmOperandClass { let Name = "u16_1Imm"; } +def u16_2ImmOperand : AsmOperandClass { let Name = "u16_2Imm"; } +def u16_3ImmOperand : AsmOperandClass { let Name = "u16_3Imm"; } +def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; } +def u10ImmOperand : AsmOperandClass { let Name = "u10Imm"; } +def u9ImmOperand : AsmOperandClass { let Name = "u9Imm"; } +def u8ImmOperand : AsmOperandClass { let Name = "u8Imm"; } +def u7ImmOperand : AsmOperandClass { let Name = "u7Imm"; } +def u6ImmOperand : AsmOperandClass { let Name = "u6Imm"; } +def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; } +def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; } +def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; } +def u6_3ImmOperand : AsmOperandClass { let Name = "u6_3Imm"; } +def u5ImmOperand : AsmOperandClass { let Name = "u5Imm"; } +def u4ImmOperand : AsmOperandClass { let Name = "u4Imm"; } +def u3ImmOperand : AsmOperandClass { let Name = "u3Imm"; } +def u2ImmOperand : AsmOperandClass { let Name = "u2Imm"; } +def u1ImmOperand : AsmOperandClass { let Name = "u1Imm"; } +def n8ImmOperand : AsmOperandClass { let Name = "n8Imm"; } // Immediate operands. -let PrintMethod = "printImmOperand" in { - def s32Imm : Operand<i32>; - def s8Imm : Operand<i32>; - def s8Imm64 : Operand<i64>; - def s6Imm : Operand<i32>; +let OperandType = "OPERAND_IMMEDIATE", + DecoderMethod = "unsignedImmDecoder" in { + def s32Imm : Operand<i32> { let ParserMatchClass = s32ImmOperand; + let DecoderMethod = "s32ImmDecoder"; } + def s8Imm : Operand<i32> { let ParserMatchClass = s8ImmOperand; + let DecoderMethod = "s8ImmDecoder"; } + def s8Imm64 : Operand<i64> { let ParserMatchClass = s8Imm64Operand; + let DecoderMethod = "s8ImmDecoder"; } + def s6Imm : Operand<i32> { let ParserMatchClass = s6ImmOperand; + let DecoderMethod = "s6_0ImmDecoder"; } def s6_3Imm : Operand<i32>; - def s4Imm : Operand<i32>; - def s4_0Imm : Operand<i32> { let DecoderMethod = "s4_0ImmDecoder"; } - def s4_1Imm : Operand<i32> { let DecoderMethod = "s4_1ImmDecoder"; } - def s4_2Imm : Operand<i32> { let DecoderMethod = "s4_2ImmDecoder"; } - def s4_3Imm : Operand<i32> { let DecoderMethod = "s4_3ImmDecoder"; } - def u64Imm : Operand<i64>; - def u32Imm : Operand<i32>; - def u26_6Imm : Operand<i32>; - def u16Imm : Operand<i32>; - def u16_0Imm : Operand<i32>; - def u16_1Imm : Operand<i32>; - def u16_2Imm : Operand<i32>; - def u16_3Imm : Operand<i32>; - def u11_3Imm : Operand<i32>; - def u10Imm : Operand<i32>; - def u9Imm : Operand<i32>; - def u8Imm : Operand<i32>; - def u7Imm : Operand<i32>; - def u6Imm : Operand<i32>; - def u6_0Imm : Operand<i32>; - def u6_1Imm : Operand<i32>; - def u6_2Imm : Operand<i32>; - def u6_3Imm : Operand<i32>; - def u5Imm : Operand<i32>; + def s4Imm : Operand<i32> { let ParserMatchClass = s4ImmOperand; + let DecoderMethod = "s4_0ImmDecoder"; } + def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; + let DecoderMethod = "s4_0ImmDecoder"; } + def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand; + let DecoderMethod = "s4_1ImmDecoder"; } + def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand; + let DecoderMethod = "s4_2ImmDecoder"; } + def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand; + let DecoderMethod = "s4_3ImmDecoder"; } + def u64Imm : Operand<i64> { let ParserMatchClass = u64ImmOperand; } + def u32Imm : Operand<i32> { let ParserMatchClass = u32ImmOperand; } + def u26_6Imm : Operand<i32> { let ParserMatchClass = 
u26_6ImmOperand; } + def u16Imm : Operand<i32> { let ParserMatchClass = u16ImmOperand; } + def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; } + def u16_1Imm : Operand<i32> { let ParserMatchClass = u16_1ImmOperand; } + def u16_2Imm : Operand<i32> { let ParserMatchClass = u16_2ImmOperand; } + def u16_3Imm : Operand<i32> { let ParserMatchClass = u16_3ImmOperand; } + def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; } + def u10Imm : Operand<i32> { let ParserMatchClass = u10ImmOperand; } + def u9Imm : Operand<i32> { let ParserMatchClass = u9ImmOperand; } + def u8Imm : Operand<i32> { let ParserMatchClass = u8ImmOperand; } + def u7Imm : Operand<i32> { let ParserMatchClass = u7ImmOperand; } + def u6Imm : Operand<i32> { let ParserMatchClass = u6ImmOperand; } + def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; } + def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; } + def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; } + def u6_3Imm : Operand<i32> { let ParserMatchClass = u6_3ImmOperand; } + def u5Imm : Operand<i32> { let ParserMatchClass = u5ImmOperand; } + def u5_0Imm : Operand<i32>; + def u5_1Imm : Operand<i32>; def u5_2Imm : Operand<i32>; def u5_3Imm : Operand<i32>; - def u4Imm : Operand<i32>; + def u4Imm : Operand<i32> { let ParserMatchClass = u4ImmOperand; } def u4_0Imm : Operand<i32>; + def u4_1Imm : Operand<i32>; def u4_2Imm : Operand<i32>; - def u3Imm : Operand<i32>; + def u4_3Imm : Operand<i32>; + def u3Imm : Operand<i32> { let ParserMatchClass = u3ImmOperand; } def u3_0Imm : Operand<i32>; def u3_1Imm : Operand<i32>; - def u2Imm : Operand<i32>; - def u1Imm : Operand<i32>; - def n8Imm : Operand<i32>; - def m6Imm : Operand<i32>; + def u3_2Imm : Operand<i32>; + def u3_3Imm : Operand<i32>; + def u2Imm : Operand<i32> { let ParserMatchClass = u2ImmOperand; } + def u1Imm : Operand<i32> { let ParserMatchClass = u1ImmOperand; } + def n8Imm : Operand<i32> { let ParserMatchClass = n8ImmOperand; } } -let PrintMethod = "printNOneImmOperand" in -def nOneImm : Operand<i32>; +let OperandType = "OPERAND_IMMEDIATE" in { + def s4_6Imm : Operand<i32> { let ParserMatchClass = s4_6ImmOperand; + let PrintMethod = "prints4_6ImmOperand"; + let DecoderMethod = "s4_6ImmDecoder";} + def s4_7Imm : Operand<i32> { let PrintMethod = "prints4_7ImmOperand"; + let DecoderMethod = "s4_6ImmDecoder";} + def s3_6Imm : Operand<i32> { let ParserMatchClass = s3_6ImmOperand; + let PrintMethod = "prints3_6ImmOperand"; + let DecoderMethod = "s3_6ImmDecoder";} + def s3_7Imm : Operand<i32> { let PrintMethod = "prints3_7ImmOperand"; + let DecoderMethod = "s3_6ImmDecoder";} +} // // Immediate predicates @@ -81,32 +136,12 @@ def s31_1ImmPred : PatLeaf<(i32 imm), [{ def s30_2ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<31,1>(v); + return isShiftedInt<30,2>(v); }]>; def s29_3ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<31,1>(v); -}]>; - -def s22_10ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<22,10>(v); -}]>; - -def s8_24ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<8,24>(v); -}]>; - -def s16_16ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<16,16>(v); -}]>; - -def s26_6ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isShiftedInt<26,6>(v); + return isShiftedInt<29,3>(v); 
}]>; def s16ImmPred : PatLeaf<(i32 imm), [{ @@ -114,16 +149,6 @@ def s16ImmPred : PatLeaf<(i32 imm), [{ return isInt<16>(v); }]>; -def s13ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isInt<13>(v); -}]>; - -def s12ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isInt<12>(v); -}]>; - def s11_0ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); return isInt<11>(v); @@ -149,16 +174,6 @@ def s10ImmPred : PatLeaf<(i32 imm), [{ return isInt<10>(v); }]>; -def s9ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isInt<9>(v); -}]>; - -def m9ImmPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - return isInt<9>(v) && (v != -256); -}]>; - def s8ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); return isInt<8>(v); @@ -194,7 +209,6 @@ def s4_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4,3>(v); }]>; - def u64ImmPred : PatLeaf<(i64 imm), [{ // Adding "N ||" to suppress gcc unused warning. return (N || true); @@ -230,19 +244,19 @@ def u26_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<26,6>(v); }]>; -def u16ImmPred : PatLeaf<(i32 imm), [{ +def u16_0ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); return isUInt<16>(v); }]>; -def u16_s8ImmPred : PatLeaf<(i32 imm), [{ +def u16_1ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isShiftedUInt<16,8>(v); + return isShiftedUInt<16,1>(v); }]>; -def u16_0ImmPred : PatLeaf<(i32 imm), [{ +def u16_2ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - return isUInt<16>(v); + return isShiftedUInt<16,2>(v); }]>; def u11_3ImmPred : PatLeaf<(i32 imm), [{ @@ -250,6 +264,11 @@ def u11_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<11,3>(v); }]>; +def u10ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<10>(v); +}]>; + def u9ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); return isUInt<9>(v); @@ -321,6 +340,11 @@ def u1ImmPred : PatLeaf<(i1 imm), [{ return isUInt<1>(v); }]>; +def u1ImmPred32 : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + return isUInt<1>(v); +}]>; + def m5BImmPred : PatLeaf<(i32 imm), [{ // m5BImmPred predicate - True if the (char) number is in range -1 .. -31 // and will fit in a 5 bit field when made positive, for use in memops. @@ -379,7 +403,7 @@ def Clr5ImmPred : PatLeaf<(i32 imm), [{ }]>; def SetClr5ImmPred : PatLeaf<(i32 imm), [{ - // SetClr5ImmPred predicate - True if the immediate is in range 0..31. + // True if the immediate is in range 0..31. int32_t v = (int32_t)N->getSExtValue(); return (v >= 0 && v <= 31); }]>; @@ -404,14 +428,13 @@ def Clr4ImmPred : PatLeaf<(i32 imm), [{ }]>; def SetClr4ImmPred : PatLeaf<(i32 imm), [{ - // SetClr4ImmPred predicate - True if the immediate is in the range 0..15. + // True if the immediate is in the range 0..15. int16_t v = (int16_t)N->getSExtValue(); return (v >= 0 && v <= 15); }]>; def Set3ImmPred : PatLeaf<(i32 imm), [{ - // Set3ImmPred predicate - True if the number is in the series of values: - // [ 2^0, 2^1, ... 2^7 ]. + // True if the number is in the series of values: [ 2^0, 2^1, ... 2^7 ]. // For use in setbit immediate. uint8_t v = (int8_t)N->getSExtValue(); // Constrain to 8 bits, and then check for single bit. 
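
Note that the corrected shifted-immediate predicates earlier in this file's diff (s30_2ImmPred now uses isShiftedInt<30,2> and s29_3ImmPred uses isShiftedInt<29,3>, where both previously reused isShiftedInt<31,1>) accept exactly the values that are a multiple of 2^S and fit in N+S signed bits. A minimal standalone check of that semantics, using only llvm/Support/MathExtras.h (the same helper the predicates themselves call):

#include "llvm/Support/MathExtras.h"
#include <cassert>

// Illustration of isShiftedInt<N, S> as used by s30_2ImmPred / s29_3ImmPred:
// the value must be a multiple of 2^S and representable in N+S signed bits
// (equivalently, v >> S must fit in N signed bits).
int main() {
  assert(llvm::isShiftedInt<30, 2>(4));    // multiple of 4, in range
  assert(!llvm::isShiftedInt<30, 2>(6));   // rejected: not a multiple of 4
  assert(llvm::isShiftedInt<29, 3>(-8));   // 8-byte-aligned negative value
  assert(!llvm::isShiftedInt<29, 3>(12));  // rejected: not a multiple of 8
  return 0;
}
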
@@ -419,9 +442,7 @@ def Set3ImmPred : PatLeaf<(i32 imm), [{ }]>; def Clr3ImmPred : PatLeaf<(i32 imm), [{ - // Clr3ImmPred predicate - True if the number is in the series of - // bit negated values: - // [ 2^0, 2^1, ... 2^7 ]. + // True if the number is in the series of bit negated values: [ 2^0, 2^1, ... 2^7 ]. // For use in setbit and clrbit immediate. uint8_t v = ~ (int8_t)N->getSExtValue(); // Constrain to 8 bits, and then check for single bit. @@ -429,76 +450,109 @@ def Clr3ImmPred : PatLeaf<(i32 imm), [{ }]>; def SetClr3ImmPred : PatLeaf<(i32 imm), [{ - // SetClr3ImmPred predicate - True if the immediate is in the range 0..7. + // True if the immediate is in the range 0..7. int8_t v = (int8_t)N->getSExtValue(); return (v >= 0 && v <= 7); }]>; // Extendable immediate operands. - -let PrintMethod = "printExtOperand" in { - def f32Ext : Operand<f32>; - def s16Ext : Operand<i32> { let DecoderMethod = "s16ImmDecoder"; } - def s12Ext : Operand<i32> { let DecoderMethod = "s12ImmDecoder"; } - def s11_0Ext : Operand<i32> { let DecoderMethod = "s11_0ImmDecoder"; } - def s11_1Ext : Operand<i32> { let DecoderMethod = "s11_1ImmDecoder"; } - def s11_2Ext : Operand<i32> { let DecoderMethod = "s11_2ImmDecoder"; } - def s11_3Ext : Operand<i32> { let DecoderMethod = "s11_3ImmDecoder"; } - def s10Ext : Operand<i32> { let DecoderMethod = "s10ImmDecoder"; } - def s9Ext : Operand<i32> { let DecoderMethod = "s90ImmDecoder"; } - def s8Ext : Operand<i32> { let DecoderMethod = "s8ImmDecoder"; } - def s7Ext : Operand<i32>; - def s6Ext : Operand<i32> { let DecoderMethod = "s6_0ImmDecoder"; } - def u6Ext : Operand<i32>; - def u7Ext : Operand<i32>; - def u8Ext : Operand<i32>; - def u9Ext : Operand<i32>; - def u10Ext : Operand<i32>; - def u6_0Ext : Operand<i32>; - def u6_1Ext : Operand<i32>; - def u6_2Ext : Operand<i32>; - def u6_3Ext : Operand<i32>; +def f32ExtOperand : AsmOperandClass { let Name = "f32Ext"; } +def s16ExtOperand : AsmOperandClass { let Name = "s16Ext"; } +def s12ExtOperand : AsmOperandClass { let Name = "s12Ext"; } +def s10ExtOperand : AsmOperandClass { let Name = "s10Ext"; } +def s9ExtOperand : AsmOperandClass { let Name = "s9Ext"; } +def s8ExtOperand : AsmOperandClass { let Name = "s8Ext"; } +def s7ExtOperand : AsmOperandClass { let Name = "s7Ext"; } +def s6ExtOperand : AsmOperandClass { let Name = "s6Ext"; } +def s11_0ExtOperand : AsmOperandClass { let Name = "s11_0Ext"; } +def s11_1ExtOperand : AsmOperandClass { let Name = "s11_1Ext"; } +def s11_2ExtOperand : AsmOperandClass { let Name = "s11_2Ext"; } +def s11_3ExtOperand : AsmOperandClass { let Name = "s11_3Ext"; } +def u6ExtOperand : AsmOperandClass { let Name = "u6Ext"; } +def u7ExtOperand : AsmOperandClass { let Name = "u7Ext"; } +def u8ExtOperand : AsmOperandClass { let Name = "u8Ext"; } +def u9ExtOperand : AsmOperandClass { let Name = "u9Ext"; } +def u10ExtOperand : AsmOperandClass { let Name = "u10Ext"; } +def u6_0ExtOperand : AsmOperandClass { let Name = "u6_0Ext"; } +def u6_1ExtOperand : AsmOperandClass { let Name = "u6_1Ext"; } +def u6_2ExtOperand : AsmOperandClass { let Name = "u6_2Ext"; } +def u6_3ExtOperand : AsmOperandClass { let Name = "u6_3Ext"; } +def u32MustExtOperand : AsmOperandClass { let Name = "u32MustExt"; } + + + +let OperandType = "OPERAND_IMMEDIATE", PrintMethod = "printExtOperand", + DecoderMethod = "unsignedImmDecoder" in { + def f32Ext : Operand<f32> { let ParserMatchClass = f32ExtOperand; } + def s16Ext : Operand<i32> { let ParserMatchClass = s16ExtOperand; + let DecoderMethod = "s16ImmDecoder"; } + def s12Ext : 
Operand<i32> { let ParserMatchClass = s12ExtOperand; + let DecoderMethod = "s12ImmDecoder"; } + def s11_0Ext : Operand<i32> { let ParserMatchClass = s11_0ExtOperand; + let DecoderMethod = "s11_0ImmDecoder"; } + def s11_1Ext : Operand<i32> { let ParserMatchClass = s11_1ExtOperand; + let DecoderMethod = "s11_1ImmDecoder"; } + def s11_2Ext : Operand<i32> { let ParserMatchClass = s11_2ExtOperand; + let DecoderMethod = "s11_2ImmDecoder"; } + def s11_3Ext : Operand<i32> { let ParserMatchClass = s11_3ExtOperand; + let DecoderMethod = "s11_3ImmDecoder"; } + def s10Ext : Operand<i32> { let ParserMatchClass = s10ExtOperand; + let DecoderMethod = "s10ImmDecoder"; } + def s9Ext : Operand<i32> { let ParserMatchClass = s9ExtOperand; + let DecoderMethod = "s90ImmDecoder"; } + def s8Ext : Operand<i32> { let ParserMatchClass = s8ExtOperand; + let DecoderMethod = "s8ImmDecoder"; } + def s7Ext : Operand<i32> { let ParserMatchClass = s7ExtOperand; } + def s6Ext : Operand<i32> { let ParserMatchClass = s6ExtOperand; + let DecoderMethod = "s6_0ImmDecoder"; } + def u6Ext : Operand<i32> { let ParserMatchClass = u6ExtOperand; } + def u7Ext : Operand<i32> { let ParserMatchClass = u7ExtOperand; } + def u8Ext : Operand<i32> { let ParserMatchClass = u8ExtOperand; } + def u9Ext : Operand<i32> { let ParserMatchClass = u9ExtOperand; } + def u10Ext : Operand<i32> { let ParserMatchClass = u10ExtOperand; } + def u6_0Ext : Operand<i32> { let ParserMatchClass = u6_0ExtOperand; } + def u6_1Ext : Operand<i32> { let ParserMatchClass = u6_1ExtOperand; } + def u6_2Ext : Operand<i32> { let ParserMatchClass = u6_2ExtOperand; } + def u6_3Ext : Operand<i32> { let ParserMatchClass = u6_3ExtOperand; } + def u32MustExt : Operand<i32> { let ParserMatchClass = u32MustExtOperand; } } -def s10ExtPred : PatLeaf<(i32 imm), [{ - int64_t v = (int64_t)N->getSExtValue(); - if (isInt<10>(v)) - return true; - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit signed field. - return isConstExtProfitable(Node) && isInt<32>(v); +def s4_7ImmPred : PatLeaf<(i32 imm), [{ + int64_t v = (int64_t)N->getSExtValue(); + if (HST->hasV60TOps()) + // Return true if the immediate can fit in a 10-bit sign extended field and + // is 128-byte aligned. + return isShiftedInt<4,7>(v); + return false; }]>; -def s8ExtPred : PatLeaf<(i32 imm), [{ +def s3_7ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - if (isInt<8>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit signed field. - return isConstExtProfitable(Node) && isInt<32>(v); + if (HST->hasV60TOps()) + // Return true if the immediate can fit in a 9-bit sign extended field and + // is 128-byte aligned. + return isShiftedInt<3,7>(v); + return false; }]>; -def u8ExtPred : PatLeaf<(i32 imm), [{ +def s4_6ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - if (isUInt<8>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isUInt<32>(v); + if (HST->hasV60TOps()) + // Return true if the immediate can fit in a 10-bit sign extended field and + // is 64-byte aligned. 
+ return isShiftedInt<4,6>(v); + return false; }]>; -def u9ExtPred : PatLeaf<(i32 imm), [{ +def s3_6ImmPred : PatLeaf<(i32 imm), [{ int64_t v = (int64_t)N->getSExtValue(); - if (isUInt<9>(v)) - return true; - - // Return true if extending this immediate is profitable and the value - // can fit in a 32-bit unsigned field. - return isConstExtProfitable(Node) && isUInt<32>(v); + if (HST->hasV60TOps()) + // Return true if the immediate can fit in a 9-bit sign extended field and + // is 64-byte aligned. + return isShiftedInt<3,6>(v); + return false; }]>; @@ -523,21 +577,21 @@ let PrintMethod = "printGlobalOperand" in { let PrintMethod = "printJumpTable" in def jumptablebase : Operand<i32>; -def brtarget : Operand<OtherVT>; +def brtarget : Operand<OtherVT> { + let DecoderMethod = "brtargetDecoder"; + let PrintMethod = "printBrtarget"; +} def brtargetExt : Operand<OtherVT> { - let PrintMethod = "printExtBrtarget"; + let DecoderMethod = "brtargetDecoder"; + let PrintMethod = "printBrtarget"; +} +def calltarget : Operand<i32> { + let DecoderMethod = "brtargetDecoder"; + let PrintMethod = "printBrtarget"; } -def calltarget : Operand<i32>; def bblabel : Operand<i32>; -def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf , [], "BasicBlockSDNode">; - -def symbolHi32 : Operand<i32> { - let PrintMethod = "printSymbolHi"; -} -def symbolLo32 : Operand<i32> { - let PrintMethod = "printSymbolLo"; -} +def bbl : SDNode<"ISD::BasicBlock", SDTPtrLeaf, [], "BasicBlockSDNode">; // Return true if for a 32 to 64-bit sign-extended load. def is_sext_i32 : PatLeaf<(i64 DoubleRegs:$src1), [{ diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp new file mode 100644 index 0000000..1723771 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp @@ -0,0 +1,150 @@ +//===- HexagonOptimizeSZextends.cpp - Remove unnecessary argument extends -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Pass that removes sign extends for function parameters. 
These parameters +// are already sign extended by the caller per Hexagon's ABI +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/StackProtector.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" + +#include "Hexagon.h" + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonOptimizeSZextends(); + void initializeHexagonOptimizeSZextendsPass(PassRegistry&); +} + +namespace { + struct HexagonOptimizeSZextends : public FunctionPass { + public: + static char ID; + HexagonOptimizeSZextends() : FunctionPass(ID) { + initializeHexagonOptimizeSZextendsPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override; + + const char *getPassName() const override { + return "Remove sign extends"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineFunctionAnalysis>(); + AU.addPreserved<MachineFunctionAnalysis>(); + AU.addPreserved<StackProtector>(); + FunctionPass::getAnalysisUsage(AU); + } + + bool intrinsicAlreadySextended(Intrinsic::ID IntID); + }; +} + +char HexagonOptimizeSZextends::ID = 0; + +INITIALIZE_PASS(HexagonOptimizeSZextends, "reargs", + "Remove Sign and Zero Extends for Args", false, false) + +bool HexagonOptimizeSZextends::intrinsicAlreadySextended(Intrinsic::ID IntID) { + switch(IntID) { + case llvm::Intrinsic::hexagon_A2_addh_l16_sat_ll: + return true; + default: + break; + } + return false; +} + +bool HexagonOptimizeSZextends::runOnFunction(Function &F) { + unsigned Idx = 1; + // Try to optimize sign extends in formal parameters. It's relying on + // callee already sign extending the values. I'm not sure if our ABI + // requires callee to sign extend though. + for (auto &Arg : F.args()) { + if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) { + if (!isa<PointerType>(Arg.getType())) { + for (auto UI = Arg.use_begin(); UI != Arg.use_end();) { + if (isa<SExtInst>(*UI)) { + Instruction* Use = cast<Instruction>(*UI); + SExtInst* SI = new SExtInst(&Arg, Use->getType()); + assert (EVT::getEVT(SI->getType()) == + (EVT::getEVT(Use->getType()))); + ++UI; + Use->replaceAllUsesWith(SI); + Instruction* First = &F.getEntryBlock().front(); + SI->insertBefore(First); + Use->eraseFromParent(); + } else { + ++UI; + } + } + } + } + ++Idx; + } + + // Try to remove redundant sext operations on Hexagon. The hardware + // already sign extends many 16 bit intrinsic operations to 32 bits. + // For example: + // %34 = tail call i32 @llvm.hexagon.A2.addh.l16.sat.ll(i32 %x, i32 %y) + // %sext233 = shl i32 %34, 16 + // %conv52 = ashr exact i32 %sext233, 16 + for (auto &B : F) { + for (auto &I : B) { + // Look for arithmetic shift right by 16. + BinaryOperator *Ashr = dyn_cast<BinaryOperator>(&I); + if (!(Ashr && Ashr->getOpcode() == Instruction::AShr)) + continue; + Value *AshrOp1 = Ashr->getOperand(1); + ConstantInt *C = dyn_cast<ConstantInt>(AshrOp1); + // Right shifted by 16. + if (!(C && C->getSExtValue() == 16)) + continue; + + // The first operand of Ashr comes from logical shift left. + Instruction *Shl = dyn_cast<Instruction>(Ashr->getOperand(0)); + if (!(Shl && Shl->getOpcode() == Instruction::Shl)) + continue; + Value *Intr = Shl->getOperand(0); + Value *ShlOp1 = Shl->getOperand(1); + C = dyn_cast<ConstantInt>(ShlOp1); + // Left shifted by 16. 
+ if (!(C && C->getSExtValue() == 16)) + continue; + + // The first operand of Shl comes from an intrinsic. + if (IntrinsicInst *I = dyn_cast<IntrinsicInst>(Intr)) { + if (!intrinsicAlreadySextended(I->getIntrinsicID())) + continue; + // All is well. Replace all uses of AShr with I. + for (auto UI = Ashr->user_begin(), UE = Ashr->user_end(); + UI != UE; ++UI) { + const Use &TheUse = UI.getUse(); + if (Instruction *J = dyn_cast<Instruction>(TheUse.getUser())) { + J->replaceUsesOfWith(Ashr, I); + } + } + } + } + } + + return true; +} + + +FunctionPass *llvm::createHexagonOptimizeSZextends() { + return new HexagonOptimizeSZextends(); +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp index 93dcbe2..e68ff85 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonPeephole.cpp @@ -124,7 +124,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { // Loop over all of the basic blocks. for (MachineFunction::iterator MBBb = MF.begin(), MBBe = MF.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; PeepholeMap.clear(); PeepholeDoubleRegsMap.clear(); @@ -180,7 +180,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) { unsigned DstReg = Dst.getReg(); unsigned SrcReg = Src1.getReg(); PeepholeDoubleRegsMap[DstReg] = - std::make_pair(*&SrcReg, 1/*Hexagon::subreg_hireg*/); + std::make_pair(*&SrcReg, Hexagon::subreg_hireg); } // Look for P=NOT(P). diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp index f6bb4a0..61c0589 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -66,6 +66,8 @@ HexagonRegisterInfo::getCallerSavedRegs(const MachineFunction *MF) const { switch (HST.getHexagonArchVersion()) { case HexagonSubtarget::V4: case HexagonSubtarget::V5: + case HexagonSubtarget::V55: + case HexagonSubtarget::V60: return CallerSavedRegsV4; } llvm_unreachable( @@ -84,6 +86,8 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { switch (MF->getSubtarget<HexagonSubtarget>().getHexagonArchVersion()) { case HexagonSubtarget::V4: case HexagonSubtarget::V5: + case HexagonSubtarget::V55: + case HexagonSubtarget::V60: return CalleeSavedRegsV3; } llvm_unreachable("Callee saved registers requested for unknown architecture " @@ -98,7 +102,7 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF) Reserved.set(Hexagon::R29); Reserved.set(Hexagon::R30); Reserved.set(Hexagon::R31); - Reserved.set(Hexagon::D14); + Reserved.set(Hexagon::PC); Reserved.set(Hexagon::D15); Reserved.set(Hexagon::LC0); Reserved.set(Hexagon::LC1); @@ -116,62 +120,21 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, assert(SPAdj == 0 && "Unexpected"); MachineInstr &MI = *II; - MachineBasicBlock &MB = *MI.getParent(); MachineFunction &MF = *MB.getParent(); - MachineFrameInfo &MFI = *MF.getFrameInfo(); - auto &HST = static_cast<const HexagonSubtarget&>(MF.getSubtarget()); + auto &HST = MF.getSubtarget<HexagonSubtarget>(); auto &HII = *HST.getInstrInfo(); auto &HFI = *HST.getFrameLowering(); + unsigned BP = 0; int FI = MI.getOperand(FIOp).getIndex(); - int Offset = MFI.getObjectOffset(FI) + MI.getOperand(FIOp+1).getImm(); - bool HasAlloca = MFI.hasVarSizedObjects(); - bool HasAlign = needsStackRealignment(MF); - - // XXX: Fixed 
objects cannot be accessed through SP if there are aligned - // objects in the local frame, or if there are dynamically allocated objects. - // In such cases, there has to be FP available. - if (!HFI.hasFP(MF)) { - assert(!HasAlloca && !HasAlign && "This function must have frame pointer"); - // We will not reserve space on the stack for the lr and fp registers. - Offset -= 8; - } - - unsigned SP = getStackRegister(), FP = getFrameRegister(); - unsigned AP = 0; - if (MachineInstr *AI = HFI.getAlignaInstr(MF)) - AP = AI->getOperand(0).getReg(); - unsigned FrameSize = MFI.getStackSize(); - - // Special handling of dbg_value instructions and INLINEASM. - if (MI.isDebugValue() || MI.isInlineAsm()) { - MI.getOperand(FIOp).ChangeToRegister(SP, false /*isDef*/); - MI.getOperand(FIOp+1).ChangeToImmediate(Offset+FrameSize); - return; - } - - bool UseFP = false, UseAP = false; // Default: use SP. - if (MFI.isFixedObjectIndex(FI) || MFI.isObjectPreAllocated(FI)) { - UseFP = HasAlloca || HasAlign; - } else { - if (HasAlloca) { - if (HasAlign) - UseAP = true; - else - UseFP = true; - } - } + // Select the base pointer (BP) and calculate the actual offset from BP + // to the beginning of the object at index FI. + int Offset = HFI.getFrameIndexReference(MF, FI, BP); + // Add the offset from the instruction. + int RealOffset = Offset + MI.getOperand(FIOp+1).getImm(); unsigned Opc = MI.getOpcode(); - bool ValidSP = HII.isValidOffset(Opc, FrameSize+Offset); - bool ValidFP = HII.isValidOffset(Opc, Offset); - - // Calculate the actual offset in the instruction. - int64_t RealOffset = Offset; - if (!UseFP && !UseAP) - RealOffset = FrameSize+Offset; - switch (Opc) { case Hexagon::TFR_FIA: MI.setDesc(HII.get(Hexagon::A2_addi)); @@ -184,20 +147,7 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, break; } - unsigned BP = 0; - bool Valid = false; - if (UseFP) { - BP = FP; - Valid = ValidFP; - } else if (UseAP) { - BP = AP; - Valid = ValidFP; - } else { - BP = SP; - Valid = ValidSP; - } - - if (Valid) { + if (HII.isValidOffset(Opc, RealOffset)) { MI.getOperand(FIOp).ChangeToRegister(BP, false); MI.getOperand(FIOp+1).ChangeToImmediate(RealOffset); return; @@ -223,8 +173,8 @@ unsigned HexagonRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const HexagonFrameLowering *TFI = getFrameLowering(MF); if (TFI->hasFP(MF)) - return Hexagon::R30; - return Hexagon::R29; + return getFrameRegister(); + return getStackRegister(); } @@ -238,17 +188,9 @@ unsigned HexagonRegisterInfo::getStackRegister() const { } -bool -HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const { - const HexagonFrameLowering *TFI = getFrameLowering(MF); - return TFI->hasFP(MF); -} - - -bool -HexagonRegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - return MFI->getMaxAlignment() > 8; +bool HexagonRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) + const { + return MF.getSubtarget<HexagonSubtarget>().getFrameLowering()->hasFP(MF); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h index 7edefee..db7e0f2 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -63,8 +63,6 @@ public: return true; } - bool needsStackRealignment(const MachineFunction &MF) const override; - /// Returns true if the frame pointer is valid. 
bool useFPForScavengingIndex(const MachineFunction &MF) const override; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td index edf1c25..81629dc 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td @@ -53,6 +53,12 @@ let Namespace = "Hexagon" in { let Num = num; } + + // Rq - vector predicate registers + class Rq<bits<3> num, string n> : Register<n, []> { + let HWEncoding{2-0} = num; + } + // Rc - control registers class Rc<bits<5> num, string n, list<string> alt = [], list<Register> alias = []> : @@ -131,20 +137,21 @@ let Namespace = "Hexagon" in { def LC1 : Rc<3, "lc1", ["c3"]>, DwarfRegNum<[70]>; def P3_0 : Rc<4, "p3:0", ["c4"], [P0, P1, P2, P3]>, DwarfRegNum<[71]>; - def C6 : Rc<6, "c6", [], [M0]>, DwarfRegNum<[72]>; - def C7 : Rc<7, "c7", [], [M1]>, DwarfRegNum<[73]>; + def C5 : Rc<5, "c5", ["c5"]>, DwarfRegNum<[72]>; // future use + def C6 : Rc<6, "c6", [], [M0]>, DwarfRegNum<[73]>; + def C7 : Rc<7, "c7", [], [M1]>, DwarfRegNum<[74]>; - def USR : Rc<8, "usr", ["c8"]>, DwarfRegNum<[74]> { + def USR : Rc<8, "usr", ["c8"]>, DwarfRegNum<[75]> { let SubRegIndices = [subreg_overflow]; let SubRegs = [USR_OVF]; } - def PC : Rc<9, "pc">, DwarfRegNum<[75]>; - def UGP : Rc<10, "ugp", ["c10"]>, DwarfRegNum<[76]>; - def GP : Rc<11, "gp">, DwarfRegNum<[77]>; - def CS0 : Rc<12, "cs0", ["c12"]>, DwarfRegNum<[78]>; - def CS1 : Rc<13, "cs1", ["c13"]>, DwarfRegNum<[79]>; - def UPCL : Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[80]>; - def UPCH : Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[81]>; + def PC : Rc<9, "pc">, DwarfRegNum<[76]>; + def UGP : Rc<10, "ugp", ["c10"]>, DwarfRegNum<[77]>; + def GP : Rc<11, "gp">, DwarfRegNum<[78]>; + def CS0 : Rc<12, "cs0", ["c12"]>, DwarfRegNum<[79]>; + def CS1 : Rc<13, "cs1", ["c13"]>, DwarfRegNum<[80]>; + def UPCL : Rc<14, "upcyclelo", ["c14"]>, DwarfRegNum<[81]>; + def UPCH : Rc<15, "upcyclehi", ["c15"]>, DwarfRegNum<[82]>; } // Control registers pairs. @@ -158,6 +165,36 @@ let Namespace = "Hexagon" in { def UPC : Rcc<14, "c15:14", [UPCL, UPCH]>, DwarfRegNum<[80]>; } + foreach i = 0-31 in { + def V#i : Ri<i, "v"#i>, DwarfRegNum<[!add(i, 99)]>; + } + + // Aliases of the V* registers used to hold double vec values. + let SubRegIndices = [subreg_loreg, subreg_hireg], CoveredBySubRegs = 1 in { + def W0 : Rd< 0, "v1:0", [V0, V1]>, DwarfRegNum<[99]>; + def W1 : Rd< 2, "v3:2", [V2, V3]>, DwarfRegNum<[101]>; + def W2 : Rd< 4, "v5:4", [V4, V5]>, DwarfRegNum<[103]>; + def W3 : Rd< 6, "v7:6", [V6, V7]>, DwarfRegNum<[105]>; + def W4 : Rd< 8, "v9:8", [V8, V9]>, DwarfRegNum<[107]>; + def W5 : Rd<10, "v11:10", [V10, V11]>, DwarfRegNum<[109]>; + def W6 : Rd<12, "v13:12", [V12, V13]>, DwarfRegNum<[111]>; + def W7 : Rd<14, "v15:14", [V14, V15]>, DwarfRegNum<[113]>; + def W8 : Rd<16, "v17:16", [V16, V17]>, DwarfRegNum<[115]>; + def W9 : Rd<18, "v19:18", [V18, V19]>, DwarfRegNum<[117]>; + def W10 : Rd<20, "v21:20", [V20, V21]>, DwarfRegNum<[119]>; + def W11 : Rd<22, "v23:22", [V22, V23]>, DwarfRegNum<[121]>; + def W12 : Rd<24, "v25:24", [V24, V25]>, DwarfRegNum<[123]>; + def W13 : Rd<26, "v27:26", [V26, V27]>, DwarfRegNum<[125]>; + def W14 : Rd<28, "v29:28", [V28, V29]>, DwarfRegNum<[127]>; + def W15 : Rd<30, "v31:30", [V30, V31]>, DwarfRegNum<[129]>; + } + + // Vector Predicate registers. 
+ def Q0 : Rq<0, "q0">, DwarfRegNum<[131]>; + def Q1 : Rq<1, "q1">, DwarfRegNum<[132]>; + def Q2 : Rq<2, "q2">, DwarfRegNum<[133]>; + def Q3 : Rq<3, "q3">, DwarfRegNum<[134]>; + // Register classes. // // FIXME: the register order should be defined in terms of the preferred @@ -169,10 +206,34 @@ def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32, R10, R11, R29, R30, R31)> { } +// Registers are listed in reverse order for allocation preference reasons. +def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32, + (add R7, R6, R5, R4, R3, R2, R1, R0)> ; + def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64, (add (sequence "D%u", 0, 4), (sequence "D%u", 6, 13), D5, D14, D15)>; +def VectorRegs : RegisterClass<"Hexagon", [v64i8, v32i16, v16i32, v8i64], 512, + (add (sequence "V%u", 0, 31))>; + +def VecDblRegs : RegisterClass<"Hexagon", + [v128i8, v64i16, v32i32, v16i64], 1024, + (add (sequence "W%u", 0, 15))>; + +def VectorRegs128B : RegisterClass<"Hexagon", + [v128i8, v64i16, v32i32, v16i64], 1024, + (add (sequence "V%u", 0, 31))>; + +def VecDblRegs128B : RegisterClass<"Hexagon", + [v256i8,v128i16,v64i32,v32i64], 2048, + (add (sequence "W%u", 0, 15))>; + +def VecPredRegs : RegisterClass<"Hexagon", [v512i1], 512, + (add (sequence "Q%u", 0, 3))>; + +def VecPredRegs128B : RegisterClass<"Hexagon", [v1024i1], 1024, + (add (sequence "Q%u", 0, 3))>; def PredRegs : RegisterClass<"Hexagon", [i1, v2i1, v4i1, v8i1, v4i8, v2i16, i32], 32, diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp deleted file mode 100644 index 7069ad3..0000000 --- a/contrib/llvm/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp +++ /dev/null @@ -1,91 +0,0 @@ -//===- HexagonRemoveExtendArgs.cpp - Remove unnecessary argument sign extends // -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Pass that removes sign extends for function parameters. 
These parameters -// are already sign extended by the caller per Hexagon's ABI -// -//===----------------------------------------------------------------------===// - -#include "Hexagon.h" -#include "HexagonTargetMachine.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/StackProtector.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/Pass.h" -#include "llvm/Transforms/Scalar.h" - -using namespace llvm; - -namespace llvm { - FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM); - void initializeHexagonRemoveExtendArgsPass(PassRegistry&); -} - -namespace { - struct HexagonRemoveExtendArgs : public FunctionPass { - public: - static char ID; - HexagonRemoveExtendArgs() : FunctionPass(ID) { - initializeHexagonRemoveExtendArgsPass(*PassRegistry::getPassRegistry()); - } - bool runOnFunction(Function &F) override; - - const char *getPassName() const override { - return "Remove sign extends"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<MachineFunctionAnalysis>(); - AU.addPreserved<MachineFunctionAnalysis>(); - AU.addPreserved<StackProtector>(); - FunctionPass::getAnalysisUsage(AU); - } - }; -} - -char HexagonRemoveExtendArgs::ID = 0; - -INITIALIZE_PASS(HexagonRemoveExtendArgs, "reargs", - "Remove Sign and Zero Extends for Args", false, false) - -bool HexagonRemoveExtendArgs::runOnFunction(Function &F) { - unsigned Idx = 1; - for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE; - ++AI, ++Idx) { - if (F.getAttributes().hasAttribute(Idx, Attribute::SExt)) { - Argument* Arg = AI; - if (!isa<PointerType>(Arg->getType())) { - for (auto UI = Arg->user_begin(); UI != Arg->user_end();) { - if (isa<SExtInst>(*UI)) { - Instruction* I = cast<Instruction>(*UI); - SExtInst* SI = new SExtInst(Arg, I->getType()); - assert (EVT::getEVT(SI->getType()) == - (EVT::getEVT(I->getType()))); - ++UI; - I->replaceAllUsesWith(SI); - Instruction* First = F.getEntryBlock().begin(); - SI->insertBefore(First); - I->eraseFromParent(); - } else { - ++UI; - } - } - } - } - } - return true; -} - - - -FunctionPass* -llvm::createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM) { - return new HexagonRemoveExtendArgs(); -} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td index 528cafc..6e4987b 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSchedule.td @@ -13,6 +13,12 @@ include "HexagonScheduleV4.td" +// V55 Machine Info + +include "HexagonScheduleV55.td" + //===----------------------------------------------------------------------===// -// V4 Machine Info - +// V60 Machine Info - //===----------------------------------------------------------------------===// + +include "HexagonScheduleV60.td" + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td index a7d2d47..67af147 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td +++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV4.td @@ -35,10 +35,11 @@ def SLOT_ENDLOOP: FuncUnit; // Itinerary classes. def PSEUDO : InstrItinClass; -def PSEUDOM : InstrItinClass; +def PSEUDOM : InstrItinClass; // ALU64/M/S Instruction classes of V2 are collectively knownn as XTYPE in V4. 
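Aside (not part of the patch): the hunks above pull the new HexagonScheduleV55.td and HexagonScheduleV60.td itineraries into HexagonSchedule.td, but the processor-model wiring that actually selects them is not in this excerpt. A minimal sketch of how such a SchedMachineModel is typically attached to a CPU follows; the feature names in brackets are assumptions, not taken from this patch:

    // Illustration only -- not part of this patch. HexagonModelV55 and
    // HexagonModelV60 (defined in the new schedule files later in this diff)
    // would normally be referenced from ProcessorModel definitions in the
    // target's top-level .td file, roughly like this (ArchV55/ArchV60 assumed):
    def : ProcessorModel<"hexagonv55", HexagonModelV55, [ArchV55]>;
    def : ProcessorModel<"hexagonv60", HexagonModelV60, [ArchV60]>;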
def DUPLEX : InstrItinClass; def PREFIX : InstrItinClass; +def COMPOUND_CJ_ARCHDEPSLOT : InstrItinClass; def COMPOUND : InstrItinClass; def ALU32_2op_tc_1_SLOT0123 : InstrItinClass; @@ -58,6 +59,7 @@ def CR_tc_2early_SLOT3 : InstrItinClass; def CR_tc_3x_SLOT23 : InstrItinClass; def CR_tc_3x_SLOT3 : InstrItinClass; def J_tc_2early_SLOT23 : InstrItinClass; +def J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT : InstrItinClass; def J_tc_2early_SLOT2 : InstrItinClass; def LD_tc_ld_SLOT01 : InstrItinClass; def LD_tc_ld_SLOT0 : InstrItinClass; @@ -91,6 +93,7 @@ def V4LDST_tc_st_SLOT0 : InstrItinClass; def V4LDST_tc_st_SLOT01 : InstrItinClass; def J_tc_2early_SLOT0123 : InstrItinClass; def EXTENDER_tc_1_SLOT0123 : InstrItinClass; +def S_3op_tc_3stall_SLOT23 : InstrItinClass; def HexagonItinerariesV4 : diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td new file mode 100644 index 0000000..d9ad25d --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV55.td @@ -0,0 +1,170 @@ +//=-HexagonScheduleV4.td - HexagonV4 Scheduling Definitions --*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// There are four SLOTS (four parallel pipelines) in Hexagon V4 machine. +// This file describes that machine information. + +// +// |===========|==================================================| +// | PIPELINE | Instruction Classes | +// |===========|==================================================| +// | SLOT0 | LD ST ALU32 MEMOP NV SYSTEM | +// |-----------|--------------------------------------------------| +// | SLOT1 | LD ST ALU32 | +// |-----------|--------------------------------------------------| +// | SLOT2 | XTYPE ALU32 J JR | +// |-----------|--------------------------------------------------| +// | SLOT3 | XTYPE ALU32 J CR | +// |===========|==================================================| + +def CJ_tc_1_SLOT23 : InstrItinClass; +def CJ_tc_2early_SLOT23 : InstrItinClass; +def COPROC_VMEM_vtc_long_SLOT01 : InstrItinClass; +def COPROC_VX_vtc_long_SLOT23 : InstrItinClass; +def COPROC_VX_vtc_SLOT23 : InstrItinClass; +def J_tc_3stall_SLOT2 : InstrItinClass; +def MAPPING_tc_1_SLOT0123 : InstrItinClass; +def M_tc_3stall_SLOT23 : InstrItinClass; +def SUBINSN_tc_1_SLOT01 : InstrItinClass; +def SUBINSN_tc_2early_SLOT0 : InstrItinClass; +def SUBINSN_tc_2early_SLOT01 : InstrItinClass; +def SUBINSN_tc_3stall_SLOT0 : InstrItinClass; +def SUBINSN_tc_ld_SLOT0 : InstrItinClass; +def SUBINSN_tc_ld_SLOT01 : InstrItinClass; +def SUBINSN_tc_st_SLOT01 : InstrItinClass; + +def HexagonItinerariesV55 : + ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [ + // ALU32 + InstrItinData<ALU32_2op_tc_1_SLOT0123 , + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_2op_tc_2early_SLOT0123, + [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_3op_tc_1_SLOT0123 , + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_3op_tc_2_SLOT0123 , + [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_3op_tc_2early_SLOT0123, + [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_ADDI_tc_1_SLOT0123 , + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // ALU64 + InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + 
InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + + // CR -> System + InstrItinData<CR_tc_2_SLOT3 , [InstrStage<2, [SLOT3]>]>, + InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<2, [SLOT3]>]>, + InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<3, [SLOT3]>]>, + + // Jump (conditional/unconditional/return etc) + InstrItinData<CR_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<J_tc_2early_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>, + + // JR + InstrItinData<J_tc_2early_SLOT2 , [InstrStage<2, [SLOT2]>]>, + InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<3, [SLOT2]>]>, + + // Extender + InstrItinData<EXTENDER_tc_1_SLOT0123, + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // Load + InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + + // M + InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>, + + // Store + InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + + // Subinsn + InstrItinData<SUBINSN_tc_2early_SLOT0, [InstrStage<2, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_1_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_2early_SLOT01, + [InstrStage<2, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + + // S + InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + + // New Value Compare Jump + InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + + // Mem ops + InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<2, [SLOT0, SLOT1]>]>, + InstrItinData<V2LDST_tc_st_SLOT01 , 
[InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + + // Endloop + InstrItinData<J_tc_2early_SLOT0123, [InstrStage<2, [SLOT_ENDLOOP]>]>, + + // Vector + InstrItinData<COPROC_VMEM_vtc_long_SLOT01, + [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<COPROC_VX_vtc_long_SLOT23 , + [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<COPROC_VX_vtc_SLOT23 , + [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<MAPPING_tc_1_SLOT0123 , + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // Misc + InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>, + InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [SLOT2, SLOT3]>]> + + ]>; + +def HexagonModelV55 : SchedMachineModel { + // Max issue per cycle == bundle width. + let IssueWidth = 4; + let Itineraries = HexagonItinerariesV55; + let LoadLatency = 1; +} + +//===----------------------------------------------------------------------===// +// Hexagon V4 Resource Definitions - +//===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td new file mode 100644 index 0000000..2ccff82 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonScheduleV60.td @@ -0,0 +1,310 @@ +//=-HexagonScheduleV60.td - HexagonV60 Scheduling Definitions *- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// CVI pipes from the "Hexagon Multimedia Co-Processor Extensions Arch Spec". +def CVI_ST : FuncUnit; +def CVI_XLANE : FuncUnit; +def CVI_SHIFT : FuncUnit; +def CVI_MPY0 : FuncUnit; +def CVI_MPY1 : FuncUnit; +def CVI_LD : FuncUnit; + +// Combined functional units. +def CVI_XLSHF : FuncUnit; +def CVI_MPY01 : FuncUnit; +def CVI_ALL : FuncUnit; + +// Combined functional unit data. +def HexagonComboFuncsV60 : + ComboFuncUnits<[ + ComboFuncData<CVI_XLSHF , [CVI_XLANE, CVI_SHIFT]>, + ComboFuncData<CVI_MPY01 , [CVI_MPY0, CVI_MPY1]>, + ComboFuncData<CVI_ALL , [CVI_ST, CVI_XLANE, CVI_SHIFT, + CVI_MPY0, CVI_MPY1, CVI_LD]> + ]>; + +// Note: When adding additional vector scheduling classes, add the +// corresponding methods to the class HexagonInstrInfo. 
+def CVI_VA : InstrItinClass; +def CVI_VA_DV : InstrItinClass; +def CVI_VX_LONG : InstrItinClass; +def CVI_VX_LATE : InstrItinClass; +def CVI_VX : InstrItinClass; +def CVI_VX_DV_LONG : InstrItinClass; +def CVI_VX_DV : InstrItinClass; +def CVI_VX_DV_SLOT2 : InstrItinClass; +def CVI_VP : InstrItinClass; +def CVI_VP_LONG : InstrItinClass; +def CVI_VP_VS_EARLY : InstrItinClass; +def CVI_VP_VS_LONG_EARLY : InstrItinClass; +def CVI_VP_VS_LONG : InstrItinClass; +def CVI_VP_VS : InstrItinClass; +def CVI_VP_DV : InstrItinClass; +def CVI_VS : InstrItinClass; +def CVI_VINLANESAT : InstrItinClass; +def CVI_VM_LD : InstrItinClass; +def CVI_VM_TMP_LD : InstrItinClass; +def CVI_VM_CUR_LD : InstrItinClass; +def CVI_VM_VP_LDU : InstrItinClass; +def CVI_VM_ST : InstrItinClass; +def CVI_VM_NEW_ST : InstrItinClass; +def CVI_VM_STU : InstrItinClass; +def CVI_HIST : InstrItinClass; +def CVI_VA_EXT : InstrItinClass; + +// There are four SLOTS (four parallel pipelines) in Hexagon V60 machine. +// This file describes that machine information. +// +// |===========|==================================================| +// | PIPELINE | Instruction Classes | +// |===========|==================================================| +// | SLOT0 | LD ST ALU32 MEMOP NV SYSTEM | +// |-----------|--------------------------------------------------| +// | SLOT1 | LD ST ALU32 | +// |-----------|--------------------------------------------------| +// | SLOT2 | XTYPE ALU32 J JR | +// |-----------|--------------------------------------------------| +// | SLOT3 | XTYPE ALU32 J CR | +// |===========|==================================================| +// +// +// In addition to using the above SLOTS, there are also six vector pipelines +// in the CVI co-processor in the Hexagon V60 machine. +// +// |=========| |=========| |=========| |=========| |=========| |=========| +// SLOT | CVI_LD | |CVI_MPY3 | |CVI_MPY2 | |CVI_SHIFT| |CVI_XLANE| | CVI_ST | +// ==== |=========| |=========| |=========| |=========| |=========| |=========| +// S0-3 | | | CVI_VA | | CVI_VA | | CVI_VA | | CVI_VA | | | +// S2-3 | | | CVI_VX | | CVI_VX | | | | | | | +// S0-3 | | | | | | | | | CVI_VP | | | +// S0-3 | | | | | | | CVI_VS | | | | | +// S0-1 |(CVI_LD) | | CVI_LD | | CVI_LD | | CVI_LD | | CVI_LD | | | +// S0-1 |(C*TMP_LD) | | | | | | | | | | +// S01 |(C*_LDU) | | | | | | | | C*_LDU | | | +// S0 | | | CVI_ST | | CVI_ST | | CVI_ST | | CVI_ST | |(CVI_ST) | +// S0 | | | | | | | | | | |(C*TMP_ST) +// S01 | | | | | | | | | VSTU | |(C*_STU) | +// |=========| |=========| |=========| |=========| |=========| |=========| +// |=====================| |=====================| +// | CVI_MPY2 & CVI_MPY3 | |CVI_XLANE & CVI_SHIFT| +// |=====================| |=====================| +// S0-3 | CVI_VA_DV | | CVI_VA_DV | +// S0-3 | | | CVI_VP_DV | +// S2-3 | CVI_VX_DV | | | +// |=====================| |=====================| +// |=====================================================================| +// S0-3 | CVI_HIST Histogram | +// S0123| CVI_VA_EXT Extract | +// |=====================================================================| + +def HexagonItinerariesV60 : + ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP, + CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1, + CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL], [], [ + // ALU32 + InstrItinData<ALU32_2op_tc_1_SLOT0123 , + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_2op_tc_2early_SLOT0123, + [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_3op_tc_1_SLOT0123 , + [InstrStage<1, 
[SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_3op_tc_2_SLOT0123 , + [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_3op_tc_2early_SLOT0123, + [InstrStage<2, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<ALU32_ADDI_tc_1_SLOT0123 , + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // ALU64 + InstrItinData<ALU64_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<ALU64_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + + // CR -> System + InstrItinData<CR_tc_2_SLOT3 , [InstrStage<2, [SLOT3]>]>, + InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<2, [SLOT3]>]>, + InstrItinData<CR_tc_3x_SLOT3 , [InstrStage<3, [SLOT3]>]>, + + // Jump (conditional/unconditional/return etc) + InstrItinData<CR_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<CR_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<CJ_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<J_tc_2early_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>, + + // JR + InstrItinData<J_tc_2early_SLOT2 , [InstrStage<2, [SLOT2]>]>, + InstrItinData<J_tc_3stall_SLOT2 , [InstrStage<3, [SLOT2]>]>, + + // Extender + InstrItinData<EXTENDER_tc_1_SLOT0123, [InstrStage<1, + [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // Load + InstrItinData<LD_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>, + InstrItinData<LD_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + + // M + InstrItinData<M_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>, + InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>, + + // Store + InstrItinData<ST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<ST_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + InstrItinData<ST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + + // Subinsn + InstrItinData<SUBINSN_tc_2early_SLOT0, [InstrStage<2, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_ld_SLOT0 , [InstrStage<3, [SLOT0]>]>, + InstrItinData<SUBINSN_tc_1_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_2early_SLOT01, + [InstrStage<2, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<SUBINSN_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + + // S + InstrItinData<S_2op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + // The S_2op_tc_3x_SLOT23 slots are 4 cycles on v60. 
+ InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_1_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_2_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<2, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<S_3op_tc_3x_SLOT23 , [InstrStage<3, [SLOT2, SLOT3]>]>, + + // New Value Compare Jump + InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>, + + // Mem ops + InstrItinData<V2LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<2, [SLOT0, SLOT1]>]>, + InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + InstrItinData<V4LDST_tc_st_SLOT0 , [InstrStage<1, [SLOT0]>]>, + InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>]>, + + // Endloop + InstrItinData<J_tc_2early_SLOT0123, [InstrStage<2, [SLOT_ENDLOOP]>]>, + + // Vector + InstrItinData<COPROC_VMEM_vtc_long_SLOT01, + [InstrStage<3, [SLOT0, SLOT1]>]>, + InstrItinData<COPROC_VX_vtc_long_SLOT23 , + [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<COPROC_VX_vtc_SLOT23 , + [InstrStage<3, [SLOT2, SLOT3]>]>, + InstrItinData<MAPPING_tc_1_SLOT0123 , + [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + + // Duplex and Compound + InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>, + InstrItinData<COMPOUND_CJ_ARCHDEPSLOT , [InstrStage<1, [SLOT2, SLOT3]>]>, + InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>, + // Misc + InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>, + InstrItinData<PSEUDOM , [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [SLOT2, SLOT3]>]>, + + // Latest CVI spec definitions. 
+ InstrItinData<CVI_VA,[InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLANE,CVI_SHIFT, + CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VA_DV, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF, CVI_MPY01]>]>, + InstrItinData<CVI_VX_LONG, [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VX_LATE, [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VX,[InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VX_DV_LONG, + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>]>, + InstrItinData<CVI_VX_DV, + [InstrStage<1, [SLOT2, SLOT3], 0>, + InstrStage<1, [CVI_MPY01]>]>, + InstrItinData<CVI_VX_DV_SLOT2, + [InstrStage<1, [SLOT2], 0>, + InstrStage<1, [CVI_MPY01]>]>, + InstrItinData<CVI_VP, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>]>, + InstrItinData<CVI_VP_LONG, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLANE]>]>, + InstrItinData<CVI_VP_VS_EARLY, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VP_VS_LONG, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VP_VS, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VP_VS_LONG_EARLY, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VP_DV , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_XLSHF]>]>, + InstrItinData<CVI_VS, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>]>, + InstrItinData<CVI_VINLANESAT, + [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_SHIFT]>]>, + InstrItinData<CVI_VM_LD , [InstrStage<1, [SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE, CVI_SHIFT, + CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VM_TMP_LD,[InstrStage<1,[SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD]>]>, + InstrItinData<CVI_VM_CUR_LD,[InstrStage<1,[SLOT0, SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE, CVI_SHIFT, + CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VM_VP_LDU,[InstrStage<1,[SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_LD], 0>, + InstrStage<1, [CVI_XLANE]>]>, + InstrItinData<CVI_VM_ST , [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE, CVI_SHIFT, + CVI_MPY0, CVI_MPY1]>]>, + InstrItinData<CVI_VM_NEW_ST,[InstrStage<1,[SLOT0], 0>, + InstrStage<1, [CVI_ST]>]>, + InstrItinData<CVI_VM_STU , [InstrStage<1, [SLOT0], 0>, + InstrStage<1, [SLOT1], 0>, + InstrStage<1, [CVI_ST], 0>, + InstrStage<1, [CVI_XLANE]>]>, + InstrItinData<CVI_HIST , [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>, + InstrStage<1, [CVI_ALL]>]> + ]>; + +def HexagonModelV60 : SchedMachineModel { + // Max issue per cycle == bundle width. 
+ let IssueWidth = 4; + let Itineraries = HexagonItinerariesV60; + let LoadLatency = 1; +} + +//===----------------------------------------------------------------------===// +// Hexagon V60 Resource Definitions - +//===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp index 276cc69..239dbda 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp @@ -12,12 +12,11 @@ //===----------------------------------------------------------------------===// #include "HexagonTargetMachine.h" +#include "llvm/CodeGen/SelectionDAG.h" using namespace llvm; #define DEBUG_TYPE "hexagon-selectiondag-info" -bool llvm::flag_aligned_memcpy; - SDValue HexagonSelectionDAGInfo:: EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, @@ -25,15 +24,40 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { - flag_aligned_memcpy = false; - if ((Align & 0x3) == 0) { - ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); - if (ConstantSize) { - uint64_t SizeVal = ConstantSize->getZExtValue(); - if ((SizeVal > 32) && ((SizeVal % 8) == 0)) - flag_aligned_memcpy = true; - } - } - - return SDValue(); + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + if (AlwaysInline || (Align & 0x3) != 0 || !ConstantSize) + return SDValue(); + + uint64_t SizeVal = ConstantSize->getZExtValue(); + if (SizeVal < 32 || (SizeVal % 8) != 0) + return SDValue(); + + // Special case aligned memcpys with size >= 32 bytes and a multiple of 8. 
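For reference, a compact restatement (not part of the patch) of the guard the rewritten EmitTargetCodeForMemcpy applies before lowering to the special aligned-memcpy runtime routine named just below. The helper name is invented for illustration and it assumes the surrounding file's includes:

    // Illustration only: mirrors the early-return checks in the hunk above.
    // A memcpy is forwarded to the special runtime routine only when inline
    // expansion was not requested, the access is at least 4-byte aligned, and
    // the length is a known constant >= 32 bytes that is a multiple of 8.
    static bool useSpecialAlignedMemcpy(bool AlwaysInline, unsigned Align,
                                        const ConstantSDNode *ConstantSize) {
      if (AlwaysInline || (Align & 0x3) != 0 || !ConstantSize)
        return false;
      uint64_t SizeVal = ConstantSize->getZExtValue();
      return SizeVal >= 32 && (SizeVal % 8) == 0;
    }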
+ // + const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering(); + TargetLowering::ArgListTy Args; + TargetLowering::ArgListEntry Entry; + Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + Entry.Node = Dst; + Args.push_back(Entry); + Entry.Node = Src; + Args.push_back(Entry); + Entry.Node = Size; + Args.push_back(Entry); + + const char *SpecialMemcpyName = + "__hexagon_memcpy_likely_aligned_min32bytes_mult8bytes"; + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + .setChain(Chain) + .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY), + Type::getVoidTy(*DAG.getContext()), + DAG.getTargetExternalSymbol( + SpecialMemcpyName, TLI.getPointerTy(DAG.getDataLayout())), + std::move(Args), 0) + .setDiscardResult(); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + return CallResult.second; } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp index d3eb56f..10fe606 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp @@ -81,7 +81,7 @@ bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) { // Loop over all of the basic blocks for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end(); MBBb != MBBe; ++MBBb) { - MachineBasicBlock* MBB = MBBb; + MachineBasicBlock *MBB = &*MBBb; // Traverse the basic block MachineBasicBlock::iterator MII = MBB->begin(); MachineBasicBlock::iterator MIE = MBB->end (); diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp new file mode 100644 index 0000000..d4e95b0d --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp @@ -0,0 +1,1209 @@ +//===--- HexagonSplitDouble.cpp -------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "hsdr" + +#include "HexagonRegisterInfo.h" +#include "HexagonTargetMachine.h" + +#include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#include <map> +#include <set> +#include <vector> + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonSplitDoubleRegs(); + void initializeHexagonSplitDoubleRegsPass(PassRegistry&); +} + +namespace { + static cl::opt<int> MaxHSDR("max-hsdr", cl::Hidden, cl::init(-1), + cl::desc("Maximum number of split partitions")); + static cl::opt<bool> MemRefsFixed("hsdr-no-mem", cl::Hidden, cl::init(true), + cl::desc("Do not split loads or stores")); + + class HexagonSplitDoubleRegs : public MachineFunctionPass { + public: + static char ID; + HexagonSplitDoubleRegs() : MachineFunctionPass(ID), TRI(nullptr), + TII(nullptr) { + initializeHexagonSplitDoubleRegsPass(*PassRegistry::getPassRegistry()); + } + const char *getPassName() const override { + return "Hexagon Split Double Registers"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + bool runOnMachineFunction(MachineFunction &MF) override; + + private: + static const TargetRegisterClass *const DoubleRC; + + const HexagonRegisterInfo *TRI; + const HexagonInstrInfo *TII; + const MachineLoopInfo *MLI; + MachineRegisterInfo *MRI; + + typedef std::set<unsigned> USet; + typedef std::map<unsigned,USet> UUSetMap; + typedef std::pair<unsigned,unsigned> UUPair; + typedef std::map<unsigned,UUPair> UUPairMap; + typedef std::map<const MachineLoop*,USet> LoopRegMap; + + bool isInduction(unsigned Reg, LoopRegMap &IRM) const; + bool isVolatileInstr(const MachineInstr *MI) const; + bool isFixedInstr(const MachineInstr *MI) const; + void partitionRegisters(UUSetMap &P2Rs); + int32_t profit(const MachineInstr *MI) const; + bool isProfitable(const USet &Part, LoopRegMap &IRM) const; + + void collectIndRegsForLoop(const MachineLoop *L, USet &Rs); + void collectIndRegs(LoopRegMap &IRM); + + void createHalfInstr(unsigned Opc, MachineInstr *MI, + const UUPairMap &PairMap, unsigned SubR); + void splitMemRef(MachineInstr *MI, const UUPairMap &PairMap); + void splitImmediate(MachineInstr *MI, const UUPairMap &PairMap); + void splitCombine(MachineInstr *MI, const UUPairMap &PairMap); + void splitExt(MachineInstr *MI, const UUPairMap &PairMap); + void splitShift(MachineInstr *MI, const UUPairMap &PairMap); + void splitAslOr(MachineInstr *MI, const UUPairMap &PairMap); + bool splitInstr(MachineInstr *MI, const UUPairMap &PairMap); + void replaceSubregUses(MachineInstr *MI, const UUPairMap &PairMap); + void collapseRegPairs(MachineInstr *MI, const UUPairMap &PairMap); + bool splitPartition(const USet &Part); + + static int Counter; + static void dump_partition(raw_ostream&, const USet&, + const TargetRegisterInfo&); + }; + char HexagonSplitDoubleRegs::ID; + int HexagonSplitDoubleRegs::Counter = 0; + const TargetRegisterClass *const HexagonSplitDoubleRegs::DoubleRC + = &Hexagon::DoubleRegsRegClass; +} + 
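As the class above suggests, the new pass rewrites operations on 64-bit DoubleRegs virtual registers as pairs of 32-bit operations when its profit heuristic says the split pays off (for example, a 64-bit A2_andp is later expanded into two 32-bit A2_and instructions, one per half). How the pass is scheduled into the codegen pipeline is not shown in this file; a minimal sketch, assuming the usual TargetPassConfig hook, would be:

    // Illustration only -- the actual hook lives in HexagonTargetMachine.cpp,
    // which is not part of this excerpt; the hook name below is assumed.
    void HexagonPassConfig::addPreRegAlloc() {
      // createHexagonSplitDoubleRegs() is the factory declared above.
      addPass(createHexagonSplitDoubleRegs());
    }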
+INITIALIZE_PASS(HexagonSplitDoubleRegs, "hexagon-split-double", + "Hexagon Split Double Registers", false, false) + + +static inline uint32_t getRegState(const MachineOperand &R) { + assert(R.isReg()); + return getDefRegState(R.isDef()) | + getImplRegState(R.isImplicit()) | + getKillRegState(R.isKill()) | + getDeadRegState(R.isDead()) | + getUndefRegState(R.isUndef()) | + getInternalReadRegState(R.isInternalRead()) | + (R.isDebug() ? RegState::Debug : 0); +} + + +void HexagonSplitDoubleRegs::dump_partition(raw_ostream &os, + const USet &Part, const TargetRegisterInfo &TRI) { + dbgs() << '{'; + for (auto I : Part) + dbgs() << ' ' << PrintReg(I, &TRI); + dbgs() << " }"; +} + + +bool HexagonSplitDoubleRegs::isInduction(unsigned Reg, LoopRegMap &IRM) const { + for (auto I : IRM) { + const USet &Rs = I.second; + if (Rs.find(Reg) != Rs.end()) + return true; + } + return false; +} + + +bool HexagonSplitDoubleRegs::isVolatileInstr(const MachineInstr *MI) const { + for (auto &I : MI->memoperands()) + if (I->isVolatile()) + return true; + return false; +} + + +bool HexagonSplitDoubleRegs::isFixedInstr(const MachineInstr *MI) const { + if (MI->mayLoad() || MI->mayStore()) + if (MemRefsFixed || isVolatileInstr(MI)) + return true; + if (MI->isDebugValue()) + return false; + + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + return true; + + case TargetOpcode::PHI: + case TargetOpcode::COPY: + break; + + case Hexagon::L2_loadrd_io: + // Not handling stack stores (only reg-based addresses). + if (MI->getOperand(1).isReg()) + break; + return true; + case Hexagon::S2_storerd_io: + // Not handling stack stores (only reg-based addresses). + if (MI->getOperand(0).isReg()) + break; + return true; + case Hexagon::L2_loadrd_pi: + case Hexagon::S2_storerd_pi: + + case Hexagon::A2_tfrpi: + case Hexagon::A2_combineii: + case Hexagon::A4_combineir: + case Hexagon::A4_combineii: + case Hexagon::A4_combineri: + case Hexagon::A2_combinew: + case Hexagon::CONST64_Int_Real: + + case Hexagon::A2_sxtw: + + case Hexagon::A2_andp: + case Hexagon::A2_orp: + case Hexagon::A2_xorp: + case Hexagon::S2_asl_i_p_or: + case Hexagon::S2_asl_i_p: + case Hexagon::S2_asr_i_p: + case Hexagon::S2_lsr_i_p: + break; + } + + for (auto &Op : MI->operands()) { + if (!Op.isReg()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + return true; + } + return false; +} + + +void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) { + typedef std::map<unsigned,unsigned> UUMap; + typedef std::vector<unsigned> UVect; + + unsigned NumRegs = MRI->getNumVirtRegs(); + BitVector DoubleRegs(NumRegs); + for (unsigned i = 0; i < NumRegs; ++i) { + unsigned R = TargetRegisterInfo::index2VirtReg(i); + if (MRI->getRegClass(R) == DoubleRC) + DoubleRegs.set(i); + } + + BitVector FixedRegs(NumRegs); + for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { + unsigned R = TargetRegisterInfo::index2VirtReg(x); + MachineInstr *DefI = MRI->getVRegDef(R); + // In some cases a register may exist, but never be defined or used. + // It should never appear anywhere, but mark it as "fixed", just to be + // safe. 
+ if (!DefI || isFixedInstr(DefI)) + FixedRegs.set(x); + } + + UUSetMap AssocMap; + for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { + if (FixedRegs[x]) + continue; + unsigned R = TargetRegisterInfo::index2VirtReg(x); + DEBUG(dbgs() << PrintReg(R, TRI) << " ~~"); + USet &Asc = AssocMap[R]; + for (auto U = MRI->use_nodbg_begin(R), Z = MRI->use_nodbg_end(); + U != Z; ++U) { + MachineOperand &Op = *U; + MachineInstr *UseI = Op.getParent(); + if (isFixedInstr(UseI)) + continue; + for (unsigned i = 0, n = UseI->getNumOperands(); i < n; ++i) { + MachineOperand &MO = UseI->getOperand(i); + // Skip non-registers or registers with subregisters. + if (&MO == &Op || !MO.isReg() || MO.getSubReg()) + continue; + unsigned T = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(T)) { + FixedRegs.set(x); + continue; + } + if (MRI->getRegClass(T) != DoubleRC) + continue; + unsigned u = TargetRegisterInfo::virtReg2Index(T); + if (FixedRegs[u]) + continue; + DEBUG(dbgs() << ' ' << PrintReg(T, TRI)); + Asc.insert(T); + // Make it symmetric. + AssocMap[T].insert(R); + } + } + DEBUG(dbgs() << '\n'); + } + + UUMap R2P; + unsigned NextP = 1; + USet Visited; + for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) { + unsigned R = TargetRegisterInfo::index2VirtReg(x); + if (Visited.count(R)) + continue; + // Create a new partition for R. + unsigned ThisP = FixedRegs[x] ? 0 : NextP++; + UVect WorkQ; + WorkQ.push_back(R); + for (unsigned i = 0; i < WorkQ.size(); ++i) { + unsigned T = WorkQ[i]; + if (Visited.count(T)) + continue; + R2P[T] = ThisP; + Visited.insert(T); + // Add all registers associated with T. + USet &Asc = AssocMap[T]; + for (USet::iterator J = Asc.begin(), F = Asc.end(); J != F; ++J) + WorkQ.push_back(*J); + } + } + + for (auto I : R2P) + P2Rs[I.second].insert(I.first); +} + + +static inline int32_t profitImm(unsigned Lo, unsigned Hi) { + int32_t P = 0; + bool LoZ1 = false, HiZ1 = false; + if (Lo == 0 || Lo == 0xFFFFFFFF) + P += 10, LoZ1 = true; + if (Hi == 0 || Hi == 0xFFFFFFFF) + P += 10, HiZ1 = true; + if (!LoZ1 && !HiZ1 && Lo == Hi) + P += 3; + return P; +} + + +int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const { + unsigned ImmX = 0; + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case TargetOpcode::PHI: + for (const auto &Op : MI->operands()) + if (!Op.getSubReg()) + return 0; + return 10; + case TargetOpcode::COPY: + if (MI->getOperand(1).getSubReg() != 0) + return 10; + return 0; + + case Hexagon::L2_loadrd_io: + case Hexagon::S2_storerd_io: + return -1; + case Hexagon::L2_loadrd_pi: + case Hexagon::S2_storerd_pi: + return 2; + + case Hexagon::A2_tfrpi: + case Hexagon::CONST64_Int_Real: { + uint64_t D = MI->getOperand(1).getImm(); + unsigned Lo = D & 0xFFFFFFFFULL; + unsigned Hi = D >> 32; + return profitImm(Lo, Hi); + } + case Hexagon::A2_combineii: + case Hexagon::A4_combineii: + return profitImm(MI->getOperand(1).getImm(), + MI->getOperand(2).getImm()); + case Hexagon::A4_combineri: + ImmX++; + case Hexagon::A4_combineir: { + ImmX++; + int64_t V = MI->getOperand(ImmX).getImm(); + if (V == 0 || V == -1) + return 10; + // Fall through into A2_combinew. 
+ } + case Hexagon::A2_combinew: + return 2; + + case Hexagon::A2_sxtw: + return 3; + + case Hexagon::A2_andp: + case Hexagon::A2_orp: + case Hexagon::A2_xorp: + return 1; + + case Hexagon::S2_asl_i_p_or: { + unsigned S = MI->getOperand(3).getImm(); + if (S == 0 || S == 32) + return 10; + return -1; + } + case Hexagon::S2_asl_i_p: + case Hexagon::S2_asr_i_p: + case Hexagon::S2_lsr_i_p: + unsigned S = MI->getOperand(2).getImm(); + if (S == 0 || S == 32) + return 10; + if (S == 16) + return 5; + if (S == 48) + return 7; + return -10; + } + + return 0; +} + + +bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM) + const { + unsigned FixedNum = 0, SplitNum = 0, LoopPhiNum = 0; + int32_t TotalP = 0; + + for (unsigned DR : Part) { + MachineInstr *DefI = MRI->getVRegDef(DR); + int32_t P = profit(DefI); + if (P == INT_MIN) + return false; + TotalP += P; + // Reduce the profitability of splitting induction registers. + if (isInduction(DR, IRM)) + TotalP -= 30; + + for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end(); + U != W; ++U) { + MachineInstr *UseI = U->getParent(); + if (isFixedInstr(UseI)) { + FixedNum++; + // Calculate the cost of generating REG_SEQUENCE instructions. + for (auto &Op : UseI->operands()) { + if (Op.isReg() && Part.count(Op.getReg())) + if (Op.getSubReg()) + TotalP -= 2; + } + continue; + } + // If a register from this partition is used in a fixed instruction, + // and there is also a register in this partition that is used in + // a loop phi node, then decrease the splitting profit as this can + // confuse the modulo scheduler. + if (UseI->isPHI()) { + const MachineBasicBlock *PB = UseI->getParent(); + const MachineLoop *L = MLI->getLoopFor(PB); + if (L && L->getHeader() == PB) + LoopPhiNum++; + } + // Splittable instruction. + SplitNum++; + int32_t P = profit(UseI); + if (P == INT_MIN) + return false; + TotalP += P; + } + } + + if (FixedNum > 0 && LoopPhiNum > 0) + TotalP -= 20*LoopPhiNum; + + DEBUG(dbgs() << "Partition profit: " << TotalP << '\n'); + return TotalP > 0; +} + + +void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L, + USet &Rs) { + const MachineBasicBlock *HB = L->getHeader(); + const MachineBasicBlock *LB = L->getLoopLatch(); + if (!HB || !LB) + return; + + // Examine the latch branch. Expect it to be a conditional branch to + // the header (either "br-cond header" or "br-cond exit; br header"). + MachineBasicBlock *TB = 0, *FB = 0; + MachineBasicBlock *TmpLB = const_cast<MachineBasicBlock*>(LB); + SmallVector<MachineOperand,2> Cond; + bool BadLB = TII->AnalyzeBranch(*TmpLB, TB, FB, Cond, false); + // Only analyzable conditional branches. HII::AnalyzeBranch will put + // the branch opcode as the first element of Cond, and the predicate + // operand as the second. + if (BadLB || Cond.size() != 2) + return; + // Only simple jump-conditional (with or without negation). + if (!TII->PredOpcodeHasJMP_c(Cond[0].getImm())) + return; + // Must go to the header. + if (TB != HB && FB != HB) + return; + assert(Cond[1].isReg() && "Unexpected Cond vector from AnalyzeBranch"); + // Expect a predicate register. + unsigned PR = Cond[1].getReg(); + assert(MRI->getRegClass(PR) == &Hexagon::PredRegsRegClass); + + // Get the registers on which the loop controlling compare instruction + // depends. 
+ unsigned CmpR1 = 0, CmpR2 = 0; + const MachineInstr *CmpI = MRI->getVRegDef(PR); + while (CmpI->getOpcode() == Hexagon::C2_not) + CmpI = MRI->getVRegDef(CmpI->getOperand(1).getReg()); + + int Mask = 0, Val = 0; + bool OkCI = TII->analyzeCompare(CmpI, CmpR1, CmpR2, Mask, Val); + if (!OkCI) + return; + // Eliminate non-double input registers. + if (CmpR1 && MRI->getRegClass(CmpR1) != DoubleRC) + CmpR1 = 0; + if (CmpR2 && MRI->getRegClass(CmpR2) != DoubleRC) + CmpR2 = 0; + if (!CmpR1 && !CmpR2) + return; + + // Now examine the top of the loop: the phi nodes that could poten- + // tially define loop induction registers. The registers defined by + // such a phi node would be used in a 64-bit add, which then would + // be used in the loop compare instruction. + + // Get the set of all double registers defined by phi nodes in the + // loop header. + typedef std::vector<unsigned> UVect; + UVect DP; + for (auto &MI : *HB) { + if (!MI.isPHI()) + break; + const MachineOperand &MD = MI.getOperand(0); + unsigned R = MD.getReg(); + if (MRI->getRegClass(R) == DoubleRC) + DP.push_back(R); + } + if (DP.empty()) + return; + + auto NoIndOp = [this, CmpR1, CmpR2] (unsigned R) -> bool { + for (auto I = MRI->use_nodbg_begin(R), E = MRI->use_nodbg_end(); + I != E; ++I) { + const MachineInstr *UseI = I->getParent(); + if (UseI->getOpcode() != Hexagon::A2_addp) + continue; + // Get the output from the add. If it is one of the inputs to the + // loop-controlling compare instruction, then R is likely an induc- + // tion register. + unsigned T = UseI->getOperand(0).getReg(); + if (T == CmpR1 || T == CmpR2) + return false; + } + return true; + }; + UVect::iterator End = std::remove_if(DP.begin(), DP.end(), NoIndOp); + Rs.insert(DP.begin(), End); + Rs.insert(CmpR1); + Rs.insert(CmpR2); + + DEBUG({ + dbgs() << "For loop at BB#" << HB->getNumber() << " ind regs: "; + dump_partition(dbgs(), Rs, *TRI); + dbgs() << '\n'; + }); +} + + +void HexagonSplitDoubleRegs::collectIndRegs(LoopRegMap &IRM) { + typedef std::vector<MachineLoop*> LoopVector; + LoopVector WorkQ; + + for (auto I : *MLI) + WorkQ.push_back(I); + for (unsigned i = 0; i < WorkQ.size(); ++i) { + for (auto I : *WorkQ[i]) + WorkQ.push_back(I); + } + + USet Rs; + for (unsigned i = 0, n = WorkQ.size(); i < n; ++i) { + MachineLoop *L = WorkQ[i]; + Rs.clear(); + collectIndRegsForLoop(L, Rs); + if (!Rs.empty()) + IRM.insert(std::make_pair(L, Rs)); + } +} + + +void HexagonSplitDoubleRegs::createHalfInstr(unsigned Opc, MachineInstr *MI, + const UUPairMap &PairMap, unsigned SubR) { + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + MachineInstr *NewI = BuildMI(B, MI, DL, TII->get(Opc)); + + for (auto &Op : MI->operands()) { + if (!Op.isReg()) { + NewI->addOperand(Op); + continue; + } + // For register operands, set the subregister. + unsigned R = Op.getReg(); + unsigned SR = Op.getSubReg(); + bool isVirtReg = TargetRegisterInfo::isVirtualRegister(R); + bool isKill = Op.isKill(); + if (isVirtReg && MRI->getRegClass(R) == DoubleRC) { + isKill = false; + UUPairMap::const_iterator F = PairMap.find(R); + if (F == PairMap.end()) { + SR = SubR; + } else { + const UUPair &P = F->second; + R = (SubR == Hexagon::subreg_loreg) ? 
P.first : P.second; + SR = 0; + } + } + auto CO = MachineOperand::CreateReg(R, Op.isDef(), Op.isImplicit(), isKill, + Op.isDead(), Op.isUndef(), Op.isEarlyClobber(), SR, Op.isDebug(), + Op.isInternalRead()); + NewI->addOperand(CO); + } +} + + +void HexagonSplitDoubleRegs::splitMemRef(MachineInstr *MI, + const UUPairMap &PairMap) { + bool Load = MI->mayLoad(); + unsigned OrigOpc = MI->getOpcode(); + bool PostInc = (OrigOpc == Hexagon::L2_loadrd_pi || + OrigOpc == Hexagon::S2_storerd_pi); + MachineInstr *LowI, *HighI; + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + // Index of the base-address-register operand. + unsigned AdrX = PostInc ? (Load ? 2 : 1) + : (Load ? 1 : 0); + MachineOperand &AdrOp = MI->getOperand(AdrX); + unsigned RSA = getRegState(AdrOp); + MachineOperand &ValOp = Load ? MI->getOperand(0) + : (PostInc ? MI->getOperand(3) + : MI->getOperand(2)); + UUPairMap::const_iterator F = PairMap.find(ValOp.getReg()); + assert(F != PairMap.end()); + + if (Load) { + const UUPair &P = F->second; + int64_t Off = PostInc ? 0 : MI->getOperand(2).getImm(); + LowI = BuildMI(B, MI, DL, TII->get(Hexagon::L2_loadri_io), P.first) + .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg()) + .addImm(Off); + HighI = BuildMI(B, MI, DL, TII->get(Hexagon::L2_loadri_io), P.second) + .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg()) + .addImm(Off+4); + } else { + const UUPair &P = F->second; + int64_t Off = PostInc ? 0 : MI->getOperand(1).getImm(); + LowI = BuildMI(B, MI, DL, TII->get(Hexagon::S2_storeri_io)) + .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg()) + .addImm(Off) + .addReg(P.first); + HighI = BuildMI(B, MI, DL, TII->get(Hexagon::S2_storeri_io)) + .addReg(AdrOp.getReg(), RSA & ~RegState::Kill, AdrOp.getSubReg()) + .addImm(Off+4) + .addReg(P.second); + } + + if (PostInc) { + // Create the increment of the address register. + int64_t Inc = Load ? MI->getOperand(3).getImm() + : MI->getOperand(2).getImm(); + MachineOperand &UpdOp = Load ? MI->getOperand(1) : MI->getOperand(0); + const TargetRegisterClass *RC = MRI->getRegClass(UpdOp.getReg()); + unsigned NewR = MRI->createVirtualRegister(RC); + assert(!UpdOp.getSubReg() && "Def operand with subreg"); + BuildMI(B, MI, DL, TII->get(Hexagon::A2_addi), NewR) + .addReg(AdrOp.getReg(), RSA) + .addImm(Inc); + MRI->replaceRegWith(UpdOp.getReg(), NewR); + // The original instruction will be deleted later. + } + + // Generate a new pair of memory-operands. + MachineFunction &MF = *B.getParent(); + for (auto &MO : MI->memoperands()) { + const MachinePointerInfo &Ptr = MO->getPointerInfo(); + unsigned F = MO->getFlags(); + int A = MO->getAlignment(); + + auto *Tmp1 = MF.getMachineMemOperand(Ptr, F, 4/*size*/, A); + LowI->addMemOperand(MF, Tmp1); + auto *Tmp2 = MF.getMachineMemOperand(Ptr, F, 4/*size*/, std::min(A, 4)); + HighI->addMemOperand(MF, Tmp2); + } +} + + +void HexagonSplitDoubleRegs::splitImmediate(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + assert(Op0.isReg() && Op1.isImm()); + uint64_t V = Op1.getImm(); + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + + // The operand to A2_tfrsi can only have 32 significant bits. 
Immediate + // values in MachineOperand are stored as 64-bit integers, and so the + // value -1 may be represented either as 64-bit -1, or 4294967295. Both + // will have the 32 higher bits truncated in the end, but -1 will remain + // as -1, while the latter may appear to be a large unsigned value + // requiring a constant extender. The casting to int32_t will select the + // former representation. (The same reasoning applies to all 32-bit + // values.) + BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.first) + .addImm(int32_t(V & 0xFFFFFFFFULL)); + BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.second) + .addImm(int32_t(V >> 32)); +} + + +void HexagonSplitDoubleRegs::splitCombine(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + assert(Op0.isReg()); + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + + if (Op1.isImm()) { + BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.second) + .addImm(Op1.getImm()); + } else if (Op1.isReg()) { + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.second) + .addReg(Op1.getReg(), getRegState(Op1), Op1.getSubReg()); + } else + llvm_unreachable("Unexpected operand"); + + if (Op2.isImm()) { + BuildMI(B, MI, DL, TII->get(Hexagon::A2_tfrsi), P.first) + .addImm(Op2.getImm()); + } else if (Op2.isReg()) { + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.first) + .addReg(Op2.getReg(), getRegState(Op2), Op2.getSubReg()); + } else + llvm_unreachable("Unexpected operand"); +} + + +void HexagonSplitDoubleRegs::splitExt(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + assert(Op0.isReg() && Op1.isReg()); + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + unsigned RS = getRegState(Op1); + + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), P.first) + .addReg(Op1.getReg(), RS & ~RegState::Kill, Op1.getSubReg()); + BuildMI(B, MI, DL, TII->get(Hexagon::S2_asr_i_r), P.second) + .addReg(Op1.getReg(), RS, Op1.getSubReg()) + .addImm(31); +} + + +void HexagonSplitDoubleRegs::splitShift(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + assert(Op0.isReg() && Op1.isReg() && Op2.isImm()); + int64_t Sh64 = Op2.getImm(); + assert(Sh64 >= 0 && Sh64 < 64); + unsigned S = Sh64; + + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + unsigned LoR = P.first; + unsigned HiR = P.second; + using namespace Hexagon; + + unsigned Opc = MI->getOpcode(); + bool Right = (Opc == S2_lsr_i_p || Opc == S2_asr_i_p); + bool Left = !Right; + bool Signed = (Opc == S2_asr_i_p); + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned RS = getRegState(Op1); + unsigned ShiftOpc = Left ? S2_asl_i_r + : (Signed ? S2_asr_i_r : S2_lsr_i_r); + unsigned LoSR = subreg_loreg; + unsigned HiSR = subreg_hireg; + + if (S == 0) { + // No shift, subregister copy. 
+ BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR); + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), HiR) + .addReg(Op1.getReg(), RS, HiSR); + } else if (S < 32) { + const TargetRegisterClass *IntRC = &IntRegsRegClass; + unsigned TmpR = MRI->createVirtualRegister(IntRC); + // Expansion: + // Shift left: DR = shl R, #s + // LoR = shl R.lo, #s + // TmpR = extractu R.lo, #s, #32-s + // HiR = or (TmpR, asl(R.hi, #s)) + // Shift right: DR = shr R, #s + // HiR = shr R.hi, #s + // TmpR = shr R.lo, #s + // LoR = insert TmpR, R.hi, #s, #32-s + + // Shift left: + // LoR = shl R.lo, #s + // Shift right: + // TmpR = shr R.lo, #s + + // Make a special case for A2_aslh and A2_asrh (they are predicable as + // opposed to S2_asl_i_r/S2_asr_i_r). + if (S == 16 && Left) + BuildMI(B, MI, DL, TII->get(A2_aslh), LoR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR); + else if (S == 16 && Signed) + BuildMI(B, MI, DL, TII->get(A2_asrh), TmpR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR); + else + BuildMI(B, MI, DL, TII->get(ShiftOpc), (Left ? LoR : TmpR)) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR) + .addImm(S); + + if (Left) { + // TmpR = extractu R.lo, #s, #32-s + BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR) + .addImm(S) + .addImm(32-S); + // HiR = or (TmpR, asl(R.hi, #s)) + BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR) + .addReg(TmpR) + .addReg(Op1.getReg(), RS, HiSR) + .addImm(S); + } else { + // HiR = shr R.hi, #s + BuildMI(B, MI, DL, TII->get(ShiftOpc), HiR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, HiSR) + .addImm(S); + // LoR = insert TmpR, R.hi, #s, #32-s + BuildMI(B, MI, DL, TII->get(S2_insert), LoR) + .addReg(TmpR) + .addReg(Op1.getReg(), RS, HiSR) + .addImm(S) + .addImm(32-S); + } + } else if (S == 32) { + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), (Left ? HiR : LoR)) + .addReg(Op1.getReg(), RS & ~RegState::Kill, (Left ? LoSR : HiSR)); + if (!Signed) + BuildMI(B, MI, DL, TII->get(A2_tfrsi), (Left ? LoR : HiR)) + .addImm(0); + else // Must be right shift. + BuildMI(B, MI, DL, TII->get(S2_asr_i_r), HiR) + .addReg(Op1.getReg(), RS, HiSR) + .addImm(31); + } else if (S < 64) { + S -= 32; + if (S == 16 && Left) + BuildMI(B, MI, DL, TII->get(A2_aslh), HiR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, LoSR); + else if (S == 16 && Signed) + BuildMI(B, MI, DL, TII->get(A2_asrh), LoR) + .addReg(Op1.getReg(), RS & ~RegState::Kill, HiSR); + else + BuildMI(B, MI, DL, TII->get(ShiftOpc), (Left ? HiR : LoR)) + .addReg(Op1.getReg(), RS & ~RegState::Kill, (Left ? LoSR : HiSR)) + .addImm(S); + + if (Signed) + BuildMI(B, MI, DL, TII->get(S2_asr_i_r), HiR) + .addReg(Op1.getReg(), RS, HiSR) + .addImm(31); + else + BuildMI(B, MI, DL, TII->get(A2_tfrsi), (Left ? 
LoR : HiR)) + .addImm(0); + } +} + + +void HexagonSplitDoubleRegs::splitAslOr(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineOperand &Op0 = MI->getOperand(0); + MachineOperand &Op1 = MI->getOperand(1); + MachineOperand &Op2 = MI->getOperand(2); + MachineOperand &Op3 = MI->getOperand(3); + assert(Op0.isReg() && Op1.isReg() && Op2.isReg() && Op3.isImm()); + int64_t Sh64 = Op3.getImm(); + assert(Sh64 >= 0 && Sh64 < 64); + unsigned S = Sh64; + + UUPairMap::const_iterator F = PairMap.find(Op0.getReg()); + assert(F != PairMap.end()); + const UUPair &P = F->second; + unsigned LoR = P.first; + unsigned HiR = P.second; + using namespace Hexagon; + + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + unsigned RS1 = getRegState(Op1); + unsigned RS2 = getRegState(Op2); + const TargetRegisterClass *IntRC = &IntRegsRegClass; + + unsigned LoSR = subreg_loreg; + unsigned HiSR = subreg_hireg; + + // Op0 = S2_asl_i_p_or Op1, Op2, Op3 + // means: Op0 = or (Op1, asl(Op2, Op3)) + + // Expansion of + // DR = or (R1, asl(R2, #s)) + // + // LoR = or (R1.lo, asl(R2.lo, #s)) + // Tmp1 = extractu R2.lo, #s, #32-s + // Tmp2 = or R1.hi, Tmp1 + // HiR = or (Tmp2, asl(R2.hi, #s)) + + if (S == 0) { + // DR = or (R1, asl(R2, #0)) + // -> or (R1, R2) + // i.e. LoR = or R1.lo, R2.lo + // HiR = or R1.hi, R2.hi + BuildMI(B, MI, DL, TII->get(A2_or), LoR) + .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR) + .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR); + BuildMI(B, MI, DL, TII->get(A2_or), HiR) + .addReg(Op1.getReg(), RS1, HiSR) + .addReg(Op2.getReg(), RS2, HiSR); + } else if (S < 32) { + BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), LoR) + .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR) + .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR) + .addImm(S); + unsigned TmpR1 = MRI->createVirtualRegister(IntRC); + BuildMI(B, MI, DL, TII->get(S2_extractu), TmpR1) + .addReg(Op2.getReg(), RS2 & ~RegState::Kill, LoSR) + .addImm(S) + .addImm(32-S); + unsigned TmpR2 = MRI->createVirtualRegister(IntRC); + BuildMI(B, MI, DL, TII->get(A2_or), TmpR2) + .addReg(Op1.getReg(), RS1, HiSR) + .addReg(TmpR1); + BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR) + .addReg(TmpR2) + .addReg(Op2.getReg(), RS2, HiSR) + .addImm(S); + } else if (S == 32) { + // DR = or (R1, asl(R2, #32)) + // -> or R1, R2.lo + // LoR = R1.lo + // HiR = or R1.hi, R2.lo + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR) + .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR); + BuildMI(B, MI, DL, TII->get(A2_or), HiR) + .addReg(Op1.getReg(), RS1, HiSR) + .addReg(Op2.getReg(), RS2, LoSR); + } else if (S < 64) { + // DR = or (R1, asl(R2, #s)) + // + // LoR = R1:lo + // HiR = or (R1:hi, asl(R2:lo, #s-32)) + S -= 32; + BuildMI(B, MI, DL, TII->get(TargetOpcode::COPY), LoR) + .addReg(Op1.getReg(), RS1 & ~RegState::Kill, LoSR); + BuildMI(B, MI, DL, TII->get(S2_asl_i_r_or), HiR) + .addReg(Op1.getReg(), RS1, HiSR) + .addReg(Op2.getReg(), RS2, LoSR) + .addImm(S); + } +} + + +bool HexagonSplitDoubleRegs::splitInstr(MachineInstr *MI, + const UUPairMap &PairMap) { + DEBUG(dbgs() << "Splitting: " << *MI); + bool Split = false; + unsigned Opc = MI->getOpcode(); + using namespace Hexagon; + + switch (Opc) { + case TargetOpcode::PHI: + case TargetOpcode::COPY: { + unsigned DstR = MI->getOperand(0).getReg(); + if (MRI->getRegClass(DstR) == DoubleRC) { + createHalfInstr(Opc, MI, PairMap, subreg_loreg); + createHalfInstr(Opc, MI, PairMap, subreg_hireg); + Split = true; + } + break; + } + case A2_andp: + createHalfInstr(A2_and, MI, PairMap, 
subreg_loreg); + createHalfInstr(A2_and, MI, PairMap, subreg_hireg); + Split = true; + break; + case A2_orp: + createHalfInstr(A2_or, MI, PairMap, subreg_loreg); + createHalfInstr(A2_or, MI, PairMap, subreg_hireg); + Split = true; + break; + case A2_xorp: + createHalfInstr(A2_xor, MI, PairMap, subreg_loreg); + createHalfInstr(A2_xor, MI, PairMap, subreg_hireg); + Split = true; + break; + + case L2_loadrd_io: + case L2_loadrd_pi: + case S2_storerd_io: + case S2_storerd_pi: + splitMemRef(MI, PairMap); + Split = true; + break; + + case A2_tfrpi: + case CONST64_Int_Real: + splitImmediate(MI, PairMap); + Split = true; + break; + + case A2_combineii: + case A4_combineir: + case A4_combineii: + case A4_combineri: + case A2_combinew: + splitCombine(MI, PairMap); + Split = true; + break; + + case A2_sxtw: + splitExt(MI, PairMap); + Split = true; + break; + + case S2_asl_i_p: + case S2_asr_i_p: + case S2_lsr_i_p: + splitShift(MI, PairMap); + Split = true; + break; + + case S2_asl_i_p_or: + splitAslOr(MI, PairMap); + Split = true; + break; + + default: + llvm_unreachable("Instruction not splitable"); + return false; + } + + return Split; +} + + +void HexagonSplitDoubleRegs::replaceSubregUses(MachineInstr *MI, + const UUPairMap &PairMap) { + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isUse() || !Op.getSubReg()) + continue; + unsigned R = Op.getReg(); + UUPairMap::const_iterator F = PairMap.find(R); + if (F == PairMap.end()) + continue; + const UUPair &P = F->second; + switch (Op.getSubReg()) { + case Hexagon::subreg_loreg: + Op.setReg(P.first); + break; + case Hexagon::subreg_hireg: + Op.setReg(P.second); + break; + } + Op.setSubReg(0); + } +} + + +void HexagonSplitDoubleRegs::collapseRegPairs(MachineInstr *MI, + const UUPairMap &PairMap) { + MachineBasicBlock &B = *MI->getParent(); + DebugLoc DL = MI->getDebugLoc(); + + for (auto &Op : MI->operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + unsigned R = Op.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(R)) + continue; + if (MRI->getRegClass(R) != DoubleRC || Op.getSubReg()) + continue; + UUPairMap::const_iterator F = PairMap.find(R); + if (F == PairMap.end()) + continue; + const UUPair &Pr = F->second; + unsigned NewDR = MRI->createVirtualRegister(DoubleRC); + BuildMI(B, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), NewDR) + .addReg(Pr.first) + .addImm(Hexagon::subreg_loreg) + .addReg(Pr.second) + .addImm(Hexagon::subreg_hireg); + Op.setReg(NewDR); + } +} + + +bool HexagonSplitDoubleRegs::splitPartition(const USet &Part) { + const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass; + typedef std::set<MachineInstr*> MISet; + bool Changed = false; + + DEBUG(dbgs() << "Splitting partition: "; dump_partition(dbgs(), Part, *TRI); + dbgs() << '\n'); + + UUPairMap PairMap; + + MISet SplitIns; + for (unsigned DR : Part) { + MachineInstr *DefI = MRI->getVRegDef(DR); + SplitIns.insert(DefI); + + // Collect all instructions, including fixed ones. We won't split them, + // but we need to visit them again to insert the REG_SEQUENCE instructions. 
+ for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end(); + U != W; ++U) + SplitIns.insert(U->getParent()); + + unsigned LoR = MRI->createVirtualRegister(IntRC); + unsigned HiR = MRI->createVirtualRegister(IntRC); + DEBUG(dbgs() << "Created mapping: " << PrintReg(DR, TRI) << " -> " + << PrintReg(HiR, TRI) << ':' << PrintReg(LoR, TRI) << '\n'); + PairMap.insert(std::make_pair(DR, UUPair(LoR, HiR))); + } + + MISet Erase; + for (auto MI : SplitIns) { + if (isFixedInstr(MI)) { + collapseRegPairs(MI, PairMap); + } else { + bool Done = splitInstr(MI, PairMap); + if (Done) + Erase.insert(MI); + Changed |= Done; + } + } + + for (unsigned DR : Part) { + // Before erasing "double" instructions, revisit all uses of the double + // registers in this partition, and replace all uses of them with subre- + // gisters, with the corresponding single registers. + MISet Uses; + for (auto U = MRI->use_nodbg_begin(DR), W = MRI->use_nodbg_end(); + U != W; ++U) + Uses.insert(U->getParent()); + for (auto M : Uses) + replaceSubregUses(M, PairMap); + } + + for (auto MI : Erase) { + MachineBasicBlock *B = MI->getParent(); + B->erase(MI); + } + + return Changed; +} + + +bool HexagonSplitDoubleRegs::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "Splitting double registers in function: " + << MF.getName() << '\n'); + + auto &ST = MF.getSubtarget<HexagonSubtarget>(); + TRI = ST.getRegisterInfo(); + TII = ST.getInstrInfo(); + MRI = &MF.getRegInfo(); + MLI = &getAnalysis<MachineLoopInfo>(); + + UUSetMap P2Rs; + LoopRegMap IRM; + + collectIndRegs(IRM); + partitionRegisters(P2Rs); + + DEBUG({ + dbgs() << "Register partitioning: (partition #0 is fixed)\n"; + for (UUSetMap::iterator I = P2Rs.begin(), E = P2Rs.end(); I != E; ++I) { + dbgs() << '#' << I->first << " -> "; + dump_partition(dbgs(), I->second, *TRI); + dbgs() << '\n'; + } + }); + + bool Changed = false; + int Limit = MaxHSDR; + + for (UUSetMap::iterator I = P2Rs.begin(), E = P2Rs.end(); I != E; ++I) { + if (I->first == 0) + continue; + if (Limit >= 0 && Counter >= Limit) + break; + USet &Part = I->second; + DEBUG(dbgs() << "Calculating profit for partition #" << I->first << '\n'); + if (!isProfitable(Part, IRM)) + continue; + Counter++; + Changed |= splitPartition(Part); + } + + return Changed; +} + +FunctionPass *llvm::createHexagonSplitDoubleRegs() { + return new HexagonSplitDoubleRegs(); +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp new file mode 100644 index 0000000..b5339ff --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp @@ -0,0 +1,616 @@ +//===--- HexagonStoreWidening.cpp------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Replace sequences of "narrow" stores to adjacent memory locations with +// a fewer "wide" stores that have the same effect. +// For example, replace: +// S4_storeirb_io %vreg100, 0, 0 ; store-immediate-byte +// S4_storeirb_io %vreg100, 1, 0 ; store-immediate-byte +// with +// S4_storeirh_io %vreg100, 0, 0 ; store-immediate-halfword +// The above is the general idea. The actual cases handled by the code +// may be a bit more complex. +// The purpose of this pass is to reduce the number of outstanding stores, +// or as one could say, "reduce store queue pressure". 
Also, wide stores +// mean fewer stores, and since there are only two memory instructions allowed +// per packet, it also means fewer packets, and ultimately fewer cycles. +//===---------------------------------------------------------------------===// + +#define DEBUG_TYPE "hexagon-widen-stores" + +#include "HexagonTargetMachine.h" + +#include "llvm/PassSupport.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +#include <algorithm> + + +using namespace llvm; + +namespace llvm { + FunctionPass *createHexagonStoreWidening(); + void initializeHexagonStoreWideningPass(PassRegistry&); +} + +namespace { + struct HexagonStoreWidening : public MachineFunctionPass { + const HexagonInstrInfo *TII; + const HexagonRegisterInfo *TRI; + const MachineRegisterInfo *MRI; + AliasAnalysis *AA; + MachineFunction *MF; + + public: + static char ID; + HexagonStoreWidening() : MachineFunctionPass(ID) { + initializeHexagonStoreWideningPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "Hexagon Store Widening"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + static bool handledStoreType(const MachineInstr *MI); + + private: + static const int MaxWideSize = 4; + + typedef std::vector<MachineInstr*> InstrGroup; + typedef std::vector<InstrGroup> InstrGroupList; + + bool instrAliased(InstrGroup &Stores, const MachineMemOperand &MMO); + bool instrAliased(InstrGroup &Stores, const MachineInstr *MI); + void createStoreGroup(MachineInstr *BaseStore, InstrGroup::iterator Begin, + InstrGroup::iterator End, InstrGroup &Group); + void createStoreGroups(MachineBasicBlock &MBB, + InstrGroupList &StoreGroups); + bool processBasicBlock(MachineBasicBlock &MBB); + bool processStoreGroup(InstrGroup &Group); + bool selectStores(InstrGroup::iterator Begin, InstrGroup::iterator End, + InstrGroup &OG, unsigned &TotalSize, unsigned MaxSize); + bool createWideStores(InstrGroup &OG, InstrGroup &NG, unsigned TotalSize); + bool replaceStores(InstrGroup &OG, InstrGroup &NG); + bool storesAreAdjacent(const MachineInstr *S1, const MachineInstr *S2); + }; + +} // namespace + + +namespace { + +// Some local helper functions... 
+unsigned getBaseAddressRegister(const MachineInstr *MI) { + const MachineOperand &MO = MI->getOperand(0); + assert(MO.isReg() && "Expecting register operand"); + return MO.getReg(); +} + +int64_t getStoreOffset(const MachineInstr *MI) { + unsigned OpC = MI->getOpcode(); + assert(HexagonStoreWidening::handledStoreType(MI) && "Unhandled opcode"); + + switch (OpC) { + case Hexagon::S4_storeirb_io: + case Hexagon::S4_storeirh_io: + case Hexagon::S4_storeiri_io: { + const MachineOperand &MO = MI->getOperand(1); + assert(MO.isImm() && "Expecting immediate offset"); + return MO.getImm(); + } + } + dbgs() << *MI; + llvm_unreachable("Store offset calculation missing for a handled opcode"); + return 0; +} + +const MachineMemOperand &getStoreTarget(const MachineInstr *MI) { + assert(!MI->memoperands_empty() && "Expecting memory operands"); + return **MI->memoperands_begin(); +} + +} // namespace + + +char HexagonStoreWidening::ID = 0; + +INITIALIZE_PASS_BEGIN(HexagonStoreWidening, "hexagon-widen-stores", + "Hexason Store Widening", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(HexagonStoreWidening, "hexagon-widen-stores", + "Hexagon Store Widening", false, false) + + +// Filtering function: any stores whose opcodes are not "approved" of by +// this function will not be subjected to widening. +inline bool HexagonStoreWidening::handledStoreType(const MachineInstr *MI) { + // For now, only handle stores of immediate values. + // Also, reject stores to stack slots. + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::S4_storeirb_io: + case Hexagon::S4_storeirh_io: + case Hexagon::S4_storeiri_io: + // Base address must be a register. (Implement FI later.) + return MI->getOperand(0).isReg(); + default: + return false; + } +} + + +// Check if the machine memory operand MMO is aliased with any of the +// stores in the store group Stores. +bool HexagonStoreWidening::instrAliased(InstrGroup &Stores, + const MachineMemOperand &MMO) { + if (!MMO.getValue()) + return true; + + MemoryLocation L(MMO.getValue(), MMO.getSize(), MMO.getAAInfo()); + + for (auto SI : Stores) { + const MachineMemOperand &SMO = getStoreTarget(SI); + if (!SMO.getValue()) + return true; + + MemoryLocation SL(SMO.getValue(), SMO.getSize(), SMO.getAAInfo()); + if (AA->alias(L, SL)) + return true; + } + + return false; +} + + +// Check if the machine instruction MI accesses any storage aliased with +// any store in the group Stores. +bool HexagonStoreWidening::instrAliased(InstrGroup &Stores, + const MachineInstr *MI) { + for (auto &I : MI->memoperands()) + if (instrAliased(Stores, *I)) + return true; + return false; +} + + +// Inspect a machine basic block, and generate store groups out of stores +// encountered in the block. +// +// A store group is a group of stores that use the same base register, +// and which can be reordered within that group without altering the +// semantics of the program. A single store group could be widened as +// a whole, if there existed a single store instruction with the same +// semantics as the entire group. In many cases, a single store group +// may need more than one wide store. +void HexagonStoreWidening::createStoreGroups(MachineBasicBlock &MBB, + InstrGroupList &StoreGroups) { + InstrGroup AllInsns; + + // Copy all instruction pointers from the basic block to a temporary + // list. This will allow operating on the list, and modifying its + // elements without affecting the basic block. 
+ for (auto &I : MBB) + AllInsns.push_back(&I); + + // Traverse all instructions in the AllInsns list, and if we encounter + // a store, then try to create a store group starting at that instruction + // i.e. a sequence of independent stores that can be widened. + for (auto I = AllInsns.begin(), E = AllInsns.end(); I != E; ++I) { + MachineInstr *MI = *I; + // Skip null pointers (processed instructions). + if (!MI || !handledStoreType(MI)) + continue; + + // Found a store. Try to create a store group. + InstrGroup G; + createStoreGroup(MI, I+1, E, G); + if (G.size() > 1) + StoreGroups.push_back(G); + } +} + + +// Create a single store group. The stores need to be independent between +// themselves, and also there cannot be other instructions between them +// that could read or modify storage being stored into. +void HexagonStoreWidening::createStoreGroup(MachineInstr *BaseStore, + InstrGroup::iterator Begin, InstrGroup::iterator End, InstrGroup &Group) { + assert(handledStoreType(BaseStore) && "Unexpected instruction"); + unsigned BaseReg = getBaseAddressRegister(BaseStore); + InstrGroup Other; + + Group.push_back(BaseStore); + + for (auto I = Begin; I != End; ++I) { + MachineInstr *MI = *I; + if (!MI) + continue; + + if (handledStoreType(MI)) { + // If this store instruction is aliased with anything already in the + // group, terminate the group now. + if (instrAliased(Group, getStoreTarget(MI))) + return; + // If this store is aliased to any of the memory instructions we have + // seen so far (that are not a part of this group), terminate the group. + if (instrAliased(Other, getStoreTarget(MI))) + return; + + unsigned BR = getBaseAddressRegister(MI); + if (BR == BaseReg) { + Group.push_back(MI); + *I = 0; + continue; + } + } + + // Assume calls are aliased to everything. + if (MI->isCall() || MI->hasUnmodeledSideEffects()) + return; + + if (MI->mayLoad() || MI->mayStore()) { + if (MI->hasOrderedMemoryRef() || instrAliased(Group, MI)) + return; + Other.push_back(MI); + } + } // for +} + + +// Check if store instructions S1 and S2 are adjacent. More precisely, +// S2 has to access memory immediately following that accessed by S1. +bool HexagonStoreWidening::storesAreAdjacent(const MachineInstr *S1, + const MachineInstr *S2) { + if (!handledStoreType(S1) || !handledStoreType(S2)) + return false; + + const MachineMemOperand &S1MO = getStoreTarget(S1); + + // Currently only handling immediate stores. + int Off1 = S1->getOperand(1).getImm(); + int Off2 = S2->getOperand(1).getImm(); + + return (Off1 >= 0) ? Off1+S1MO.getSize() == unsigned(Off2) + : int(Off1+S1MO.getSize()) == Off2; +} + + +/// Given a sequence of adjacent stores, and a maximum size of a single wide +/// store, pick a group of stores that can be replaced by a single store +/// of size not exceeding MaxSize. The selected sequence will be recorded +/// in OG ("old group" of instructions). +/// OG should be empty on entry, and should be left empty if the function +/// fails. 
+bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin, + InstrGroup::iterator End, InstrGroup &OG, unsigned &TotalSize, + unsigned MaxSize) { + assert(Begin != End && "No instructions to analyze"); + assert(OG.empty() && "Old group not empty on entry"); + + if (std::distance(Begin, End) <= 1) + return false; + + MachineInstr *FirstMI = *Begin; + assert(!FirstMI->memoperands_empty() && "Expecting some memory operands"); + const MachineMemOperand &FirstMMO = getStoreTarget(FirstMI); + unsigned Alignment = FirstMMO.getAlignment(); + unsigned SizeAccum = FirstMMO.getSize(); + unsigned FirstOffset = getStoreOffset(FirstMI); + + // The initial value of SizeAccum should always be a power of 2. + assert(isPowerOf2_32(SizeAccum) && "First store size not a power of 2"); + + // If the size of the first store equals to or exceeds the limit, do nothing. + if (SizeAccum >= MaxSize) + return false; + + // If the size of the first store is greater than or equal to the address + // stored to, then the store cannot be made any wider. + if (SizeAccum >= Alignment) + return false; + + // The offset of a store will put restrictions on how wide the store can be. + // Offsets in stores of size 2^n bytes need to have the n lowest bits be 0. + // If the first store already exhausts the offset limits, quit. Test this + // by checking if the next wider size would exceed the limit. + if ((2*SizeAccum-1) & FirstOffset) + return false; + + OG.push_back(FirstMI); + MachineInstr *S1 = FirstMI, *S2 = *(Begin+1); + InstrGroup::iterator I = Begin+1; + + // Pow2Num will be the largest number of elements in OG such that the sum + // of sizes of stores 0...Pow2Num-1 will be a power of 2. + unsigned Pow2Num = 1; + unsigned Pow2Size = SizeAccum; + + // Be greedy: keep accumulating stores as long as they are to adjacent + // memory locations, and as long as the total number of bytes stored + // does not exceed the limit (MaxSize). + // Keep track of when the total size covered is a power of 2, since + // this is a size a single store can cover. + while (I != End) { + S2 = *I; + // Stores are sorted, so if S1 and S2 are not adjacent, there won't be + // any other store to fill the "hole". + if (!storesAreAdjacent(S1, S2)) + break; + + unsigned S2Size = getStoreTarget(S2).getSize(); + if (SizeAccum + S2Size > std::min(MaxSize, Alignment)) + break; + + OG.push_back(S2); + SizeAccum += S2Size; + if (isPowerOf2_32(SizeAccum)) { + Pow2Num = OG.size(); + Pow2Size = SizeAccum; + } + if ((2*Pow2Size-1) & FirstOffset) + break; + + S1 = S2; + ++I; + } + + // The stores don't add up to anything that can be widened. Clean up. + if (Pow2Num <= 1) { + OG.clear(); + return false; + } + + // Only leave the stored being widened. + OG.resize(Pow2Num); + TotalSize = Pow2Size; + return true; +} + + +/// Given an "old group" OG of stores, create a "new group" NG of instructions +/// to replace them. Ideally, NG would only have a single instruction in it, +/// but that may only be possible for store-immediate. +bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG, + unsigned TotalSize) { + // XXX Current limitations: + // - only expect stores of immediate values in OG, + // - only handle a TotalSize of up to 4. + + if (TotalSize > 4) + return false; + + unsigned Acc = 0; // Value accumulator. + unsigned Shift = 0; + + for (InstrGroup::iterator I = OG.begin(), E = OG.end(); I != E; ++I) { + MachineInstr *MI = *I; + const MachineMemOperand &MMO = getStoreTarget(MI); + MachineOperand &SO = MI->getOperand(2); // Source. 
+ assert(SO.isImm() && "Expecting an immediate operand"); + + unsigned NBits = MMO.getSize()*8; + unsigned Mask = (0xFFFFFFFFU >> (32-NBits)); + unsigned Val = (SO.getImm() & Mask) << Shift; + Acc |= Val; + Shift += NBits; + } + + + MachineInstr *FirstSt = OG.front(); + DebugLoc DL = OG.back()->getDebugLoc(); + const MachineMemOperand &OldM = getStoreTarget(FirstSt); + MachineMemOperand *NewM = + MF->getMachineMemOperand(OldM.getPointerInfo(), OldM.getFlags(), + TotalSize, OldM.getAlignment(), + OldM.getAAInfo()); + + if (Acc < 0x10000) { + // Create mem[hw] = #Acc + unsigned WOpc = (TotalSize == 2) ? Hexagon::S4_storeirh_io : + (TotalSize == 4) ? Hexagon::S4_storeiri_io : 0; + assert(WOpc && "Unexpected size"); + + int Val = (TotalSize == 2) ? int16_t(Acc) : int(Acc); + const MCInstrDesc &StD = TII->get(WOpc); + MachineOperand &MR = FirstSt->getOperand(0); + int64_t Off = FirstSt->getOperand(1).getImm(); + MachineInstr *StI = BuildMI(*MF, DL, StD) + .addReg(MR.getReg(), getKillRegState(MR.isKill())) + .addImm(Off) + .addImm(Val); + StI->addMemOperand(*MF, NewM); + NG.push_back(StI); + } else { + // Create vreg = A2_tfrsi #Acc; mem[hw] = vreg + const MCInstrDesc &TfrD = TII->get(Hexagon::A2_tfrsi); + const TargetRegisterClass *RC = TII->getRegClass(TfrD, 0, TRI, *MF); + unsigned VReg = MF->getRegInfo().createVirtualRegister(RC); + MachineInstr *TfrI = BuildMI(*MF, DL, TfrD, VReg) + .addImm(int(Acc)); + NG.push_back(TfrI); + + unsigned WOpc = (TotalSize == 2) ? Hexagon::S2_storerh_io : + (TotalSize == 4) ? Hexagon::S2_storeri_io : 0; + assert(WOpc && "Unexpected size"); + + const MCInstrDesc &StD = TII->get(WOpc); + MachineOperand &MR = FirstSt->getOperand(0); + int64_t Off = FirstSt->getOperand(1).getImm(); + MachineInstr *StI = BuildMI(*MF, DL, StD) + .addReg(MR.getReg(), getKillRegState(MR.isKill())) + .addImm(Off) + .addReg(VReg, RegState::Kill); + StI->addMemOperand(*MF, NewM); + NG.push_back(StI); + } + + return true; +} + + +// Replace instructions from the old group OG with instructions from the +// new group NG. Conceptually, remove all instructions in OG, and then +// insert all instructions in NG, starting at where the first instruction +// from OG was (in the order in which they appeared in the basic block). +// (The ordering in OG does not have to match the order in the basic block.) +bool HexagonStoreWidening::replaceStores(InstrGroup &OG, InstrGroup &NG) { + DEBUG({ + dbgs() << "Replacing:\n"; + for (auto I : OG) + dbgs() << " " << *I; + dbgs() << "with\n"; + for (auto I : NG) + dbgs() << " " << *I; + }); + + MachineBasicBlock *MBB = OG.back()->getParent(); + MachineBasicBlock::iterator InsertAt = MBB->end(); + + // Need to establish the insertion point. The best one is right before + // the first store in the OG, but in the order in which the stores occur + // in the program list. Since the ordering in OG does not correspond + // to the order in the program list, we need to do some work to find + // the insertion point. + + // Create a set of all instructions in OG (for quick lookup). + SmallPtrSet<MachineInstr*, 4> InstrSet; + for (auto I : OG) + InstrSet.insert(I); + + // Traverse the block, until we hit an instruction from OG. + for (auto &I : *MBB) { + if (InstrSet.count(&I)) { + InsertAt = I; + break; + } + } + + assert((InsertAt != MBB->end()) && "Cannot locate any store from the group"); + + bool AtBBStart = false; + + // InsertAt points at the first instruction that will be removed. 
We need + // to move it out of the way, so it remains valid after removing all the + // old stores, and so we are able to recover it back to the proper insertion + // position. + if (InsertAt != MBB->begin()) + --InsertAt; + else + AtBBStart = true; + + for (auto I : OG) + I->eraseFromParent(); + + if (!AtBBStart) + ++InsertAt; + else + InsertAt = MBB->begin(); + + for (auto I : NG) + MBB->insert(InsertAt, I); + + return true; +} + + +// Break up the group into smaller groups, each of which can be replaced by +// a single wide store. Widen each such smaller group and replace the old +// instructions with the widened ones. +bool HexagonStoreWidening::processStoreGroup(InstrGroup &Group) { + bool Changed = false; + InstrGroup::iterator I = Group.begin(), E = Group.end(); + InstrGroup OG, NG; // Old and new groups. + unsigned CollectedSize; + + while (I != E) { + OG.clear(); + NG.clear(); + + bool Succ = selectStores(I++, E, OG, CollectedSize, MaxWideSize) && + createWideStores(OG, NG, CollectedSize) && + replaceStores(OG, NG); + if (!Succ) + continue; + + assert(OG.size() > 1 && "Created invalid group"); + assert(distance(I, E)+1 >= int(OG.size()) && "Too many elements"); + I += OG.size()-1; + + Changed = true; + } + + return Changed; +} + + +// Process a single basic block: create the store groups, and replace them +// with the widened stores, if possible. Processing of each basic block +// is independent from processing of any other basic block. This transfor- +// mation could be stopped after having processed any basic block without +// any ill effects (other than not having performed widening in the unpro- +// cessed blocks). Also, the basic blocks can be processed in any order. +bool HexagonStoreWidening::processBasicBlock(MachineBasicBlock &MBB) { + InstrGroupList SGs; + bool Changed = false; + + createStoreGroups(MBB, SGs); + + auto Less = [] (const MachineInstr *A, const MachineInstr *B) -> bool { + return getStoreOffset(A) < getStoreOffset(B); + }; + for (auto &G : SGs) { + assert(G.size() > 1 && "Store group with fewer than 2 elements"); + std::sort(G.begin(), G.end(), Less); + + Changed |= processStoreGroup(G); + } + + return Changed; +} + + +bool HexagonStoreWidening::runOnMachineFunction(MachineFunction &MFn) { + MF = &MFn; + auto &ST = MFn.getSubtarget<HexagonSubtarget>(); + TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + MRI = &MFn.getRegInfo(); + AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + + bool Changed = false; + + for (auto &B : MFn) + Changed |= processBasicBlock(B); + + return Changed; +} + + +FunctionPass *llvm::createHexagonStoreWidening() { + return new HexagonStoreWidening(); +} + diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index cd482b3..aa0efd4 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -16,6 +16,8 @@ #include "HexagonRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include <map> + using namespace llvm; #define DEBUG_TYPE "hexagon-subtarget" @@ -24,49 +26,65 @@ using namespace llvm; #define GET_SUBTARGETINFO_TARGET_DESC #include "HexagonGenSubtargetInfo.inc" -static cl::opt<bool> -EnableV3("enable-hexagon-v3", cl::Hidden, - cl::desc("Enable Hexagon V3 instructions.")); - -static cl::opt<bool> -EnableMemOps( - "enable-hexagon-memops", - cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true), - cl::desc( - "Generate V4 MEMOP in 
code generation for Hexagon target")); - -static cl::opt<bool> -DisableMemOps( - "disable-hexagon-memops", - cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false), - cl::desc( - "Do not generate V4 MEMOP in code generation for Hexagon target")); - -static cl::opt<bool> -EnableIEEERndNear( - "enable-hexagon-ieee-rnd-near", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Generate non-chopped conversion from fp to int.")); +static cl::opt<bool> EnableMemOps("enable-hexagon-memops", + cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(true), + cl::desc("Generate V4 MEMOP in code generation for Hexagon target")); + +static cl::opt<bool> DisableMemOps("disable-hexagon-memops", + cl::Hidden, cl::ZeroOrMore, cl::ValueDisallowed, cl::init(false), + cl::desc("Do not generate V4 MEMOP in code generation for Hexagon target")); + +static cl::opt<bool> EnableIEEERndNear("enable-hexagon-ieee-rnd-near", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Generate non-chopped conversion from fp to int.")); + +static cl::opt<bool> EnableBSBSched("enable-bsb-sched", + cl::Hidden, cl::ZeroOrMore, cl::init(true)); + +static cl::opt<bool> EnableHexagonHVXDouble("enable-hexagon-hvx-double", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Enable Hexagon Double Vector eXtensions")); + +static cl::opt<bool> EnableHexagonHVX("enable-hexagon-hvx", + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Enable Hexagon Vector eXtensions")); static cl::opt<bool> DisableHexagonMISched("disable-hexagon-misched", - cl::Hidden, cl::ZeroOrMore, cl::init(false), - cl::desc("Disable Hexagon MI Scheduling")); + cl::Hidden, cl::ZeroOrMore, cl::init(false), + cl::desc("Disable Hexagon MI Scheduling")); + +void HexagonSubtarget::initializeEnvironment() { + UseMemOps = false; + ModeIEEERndNear = false; + UseBSBScheduling = false; +} HexagonSubtarget & HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { - // If the programmer has not specified a Hexagon version, default to -mv4. - if (CPUString.empty()) - CPUString = "hexagonv4"; - - if (CPUString == "hexagonv4") { - HexagonArchVersion = V4; - } else if (CPUString == "hexagonv5") { - HexagonArchVersion = V5; - } else { + CPUString = HEXAGON_MC::selectHexagonCPU(getTargetTriple(), CPU); + + static std::map<StringRef, HexagonArchEnum> CpuTable { + { "hexagonv4", V4 }, + { "hexagonv5", V5 }, + { "hexagonv55", V55 }, + { "hexagonv60", V60 }, + }; + + auto foundIt = CpuTable.find(CPUString); + if (foundIt != CpuTable.end()) + HexagonArchVersion = foundIt->second; + else llvm_unreachable("Unrecognized Hexagon processor version"); - } + UseHVXOps = false; + UseHVXDblOps = false; ParseSubtargetFeatures(CPUString, FS); + + if (EnableHexagonHVX.getPosition()) + UseHVXOps = EnableHexagonHVX; + if (EnableHexagonHVXDouble.getPosition()) + UseHVXDblOps = EnableHexagonHVXDouble; + return *this; } @@ -76,6 +94,8 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), FrameLowering() { + initializeEnvironment(); + // Initialize scheduling itinerary for the specified CPU. InstrItins = getInstrItineraryForCPU(CPUString); @@ -91,6 +111,8 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU, ModeIEEERndNear = true; else ModeIEEERndNear = false; + + UseBSBScheduling = hasV60TOps() && EnableBSBSched; } // Pin the vtable to this file. 
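The HexagonSubtarget.cpp hunk above replaces the old if/else chain over CPU strings with a table lookup and lets -enable-hexagon-hvx / -enable-hexagon-hvx-double force the HVX feature bits after ParseSubtargetFeatures runs. The standalone sketch below only illustrates that table-driven selection pattern; the names selectArch and ArchVersion are hypothetical stand-ins and are not part of the LLVM API shown in the diff.

#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

// Hypothetical stand-ins for HexagonSubtarget::HexagonArchEnum and the
// CPU-selection logic in initializeSubtargetDependencies().
enum class ArchVersion { V4, V5, V55, V60 };

static ArchVersion selectArch(std::string CPU) {
  // Default to the lowest supported version when no CPU is given,
  // mirroring the "hexagonv4" fallback in the real code.
  if (CPU.empty())
    CPU = "hexagonv4";
  static const std::map<std::string, ArchVersion> CpuTable = {
      {"hexagonv4",  ArchVersion::V4},
      {"hexagonv5",  ArchVersion::V5},
      {"hexagonv55", ArchVersion::V55},
      {"hexagonv60", ArchVersion::V60},
  };
  auto It = CpuTable.find(CPU);
  if (It == CpuTable.end())
    throw std::runtime_error("Unrecognized Hexagon processor version");
  return It->second;
}

int main() {
  // Feature bits start cleared and are only forced on when the user passed
  // the corresponding option, in the spirit of EnableHexagonHVX.getPosition().
  bool UseHVXOps = false, UseHVXDblOps = false;
  bool HVXOptionGiven = true;      // pretend -enable-hexagon-hvx was passed
  bool HVXDblOptionGiven = false;  // -enable-hexagon-hvx-double not passed
  if (HVXOptionGiven)
    UseHVXOps = true;
  if (HVXDblOptionGiven)
    UseHVXDblOps = true;

  ArchVersion AV = selectArch("hexagonv60");
  std::cout << "arch enum value: " << static_cast<int>(AV)
            << ", HVX: " << UseHVXOps
            << ", HVX double: " << (UseHVXOps && UseHVXDblOps) << '\n';
  return 0;
}

Keeping the CPU-to-version mapping in a table means adding a new architecture revision (as V55 and V60 are added here) is a one-entry change rather than another branch in an if/else chain.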
diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 34cdad7..c7ae139 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -34,15 +34,19 @@ namespace llvm { class HexagonSubtarget : public HexagonGenSubtargetInfo { virtual void anchor(); - bool UseMemOps; + bool UseMemOps, UseHVXOps, UseHVXDblOps; bool ModeIEEERndNear; public: enum HexagonArchEnum { - V4, V5 + V4, V5, V55, V60 }; HexagonArchEnum HexagonArchVersion; + /// True if the target should use Back-Skip-Back scheduling. This is the + /// default for V60. + bool UseBSBScheduling; + private: std::string CPUString; HexagonInstrInfo InstrInfo; @@ -50,6 +54,7 @@ private: HexagonSelectionDAGInfo TSInfo; HexagonFrameLowering FrameLowering; InstrItineraryData InstrItins; + void initializeEnvironment(); public: HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS, @@ -84,7 +89,16 @@ public: bool useMemOps() const { return UseMemOps; } bool hasV5TOps() const { return getHexagonArchVersion() >= V5; } bool hasV5TOpsOnly() const { return getHexagonArchVersion() == V5; } + bool hasV55TOps() const { return getHexagonArchVersion() >= V55; } + bool hasV55TOpsOnly() const { return getHexagonArchVersion() == V55; } + bool hasV60TOps() const { return getHexagonArchVersion() >= V60; } + bool hasV60TOpsOnly() const { return getHexagonArchVersion() == V60; } bool modeIEEERndNear() const { return ModeIEEERndNear; } + bool useHVXOps() const { return UseHVXOps; } + bool useHVXDblOps() const { return UseHVXOps && UseHVXDblOps; } + bool useHVXSglOps() const { return UseHVXOps && !UseHVXDblOps; } + + bool useBSBScheduling() const { return UseBSBScheduling; } bool enableMachineScheduler() const override; // Always use the TargetLowering default scheduler. 
// FIXME: This will use the vliw scheduler which is probably just hurting @@ -98,7 +112,7 @@ public: return Hexagon_SMALL_DATA_THRESHOLD; } const HexagonArchEnum &getHexagonArchVersion() const { - return HexagonArchVersion; + return HexagonArchVersion; } }; diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp index b504429..9dccd69 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -16,12 +16,12 @@ #include "HexagonISelLowering.h" #include "HexagonMachineScheduler.h" #include "HexagonTargetObjectFile.h" +#include "HexagonTargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Scalar.h" using namespace llvm; @@ -33,10 +33,16 @@ static cl::opt<bool> DisableHexagonCFGOpt("disable-hexagon-cfgopt", cl::Hidden, cl::ZeroOrMore, cl::init(false), cl::desc("Disable Hexagon CFG Optimization")); +static cl::opt<bool> DisableStoreWidening("disable-store-widen", + cl::Hidden, cl::init(false), cl::desc("Disable store widening")); + static cl::opt<bool> EnableExpandCondsets("hexagon-expand-condsets", cl::init(true), cl::Hidden, cl::ZeroOrMore, cl::desc("Early expansion of MUX")); +static cl::opt<bool> EnableEarlyIf("hexagon-eif", cl::init(true), cl::Hidden, + cl::ZeroOrMore, cl::desc("Enable early if-conversion")); + static cl::opt<bool> EnableGenInsert("hexagon-insert", cl::init(true), cl::Hidden, cl::desc("Generate \"insert\" instructions")); @@ -46,10 +52,22 @@ static cl::opt<bool> EnableCommGEP("hexagon-commgep", cl::init(true), static cl::opt<bool> EnableGenExtract("hexagon-extract", cl::init(true), cl::Hidden, cl::desc("Generate \"extract\" instructions")); +static cl::opt<bool> EnableGenMux("hexagon-mux", cl::init(true), cl::Hidden, + cl::desc("Enable converting conditional transfers into MUX instructions")); + static cl::opt<bool> EnableGenPred("hexagon-gen-pred", cl::init(true), cl::Hidden, cl::desc("Enable conversion of arithmetic operations to " "predicate instructions")); +static cl::opt<bool> DisableHSDR("disable-hsdr", cl::init(false), cl::Hidden, + cl::desc("Disable splitting double registers")); + +static cl::opt<bool> EnableBitSimplify("hexagon-bit", cl::init(true), + cl::Hidden, cl::desc("Bit simplification")); + +static cl::opt<bool> EnableLoopResched("hexagon-loop-resched", cl::init(true), + cl::Hidden, cl::desc("Loop rescheduling")); + /// HexagonTargetMachineModule - Note that this is used on hosts that /// cannot link in a library unless there are references into the /// library. 
In particular, it seems that it is not possible to get @@ -72,23 +90,30 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler", createVLIWMachineSched); namespace llvm { + FunctionPass *createHexagonBitSimplify(); + FunctionPass *createHexagonCallFrameInformation(); FunctionPass *createHexagonCFGOptimizer(); FunctionPass *createHexagonCommonGEP(); FunctionPass *createHexagonCopyToCombine(); + FunctionPass *createHexagonEarlyIfConversion(); FunctionPass *createHexagonExpandCondsets(); FunctionPass *createHexagonExpandPredSpillCode(); FunctionPass *createHexagonFixupHwLoops(); FunctionPass *createHexagonGenExtract(); FunctionPass *createHexagonGenInsert(); + FunctionPass *createHexagonGenMux(); FunctionPass *createHexagonGenPredicate(); FunctionPass *createHexagonHardwareLoops(); FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM, CodeGenOpt::Level OptLevel); + FunctionPass *createHexagonLoopRescheduling(); FunctionPass *createHexagonNewValueJump(); + FunctionPass *createHexagonOptimizeSZextends(); FunctionPass *createHexagonPacketizer(); FunctionPass *createHexagonPeephole(); - FunctionPass *createHexagonRemoveExtendArgs(const HexagonTargetMachine &TM); FunctionPass *createHexagonSplitConst32AndConst64(); + FunctionPass *createHexagonSplitDoubleRegs(); + FunctionPass *createHexagonStoreWidening(); } // end namespace llvm; /// HexagonTargetMachine ctor - Create an ILP32 architecture model. @@ -101,13 +126,46 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, "e-m:e-p:32:32-i1:32-i64:64-a:0-n32", TT, CPU, FS, - Options, RM, CM, OL), - TLOF(make_unique<HexagonTargetObjectFile>()), - Subtarget(TT, CPU, FS, *this) { - initAsmInfo(); + : LLVMTargetMachine(T, "e-m:e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-" + "i1:8:8-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-" + "n16:32", TT, CPU, FS, Options, RM, CM, OL), + TLOF(make_unique<HexagonTargetObjectFile>()) { + initAsmInfo(); +} + +const HexagonSubtarget * +HexagonTargetMachine::getSubtargetImpl(const Function &F) const { + AttributeSet FnAttrs = F.getAttributes(); + Attribute CPUAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu"); + Attribute FSAttr = + FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features"); + + std::string CPU = !CPUAttr.hasAttribute(Attribute::None) + ? CPUAttr.getValueAsString().str() + : TargetCPU; + std::string FS = !FSAttr.hasAttribute(Attribute::None) + ? FSAttr.getValueAsString().str() + : TargetFS; + + auto &I = SubtargetMap[CPU + FS]; + if (!I) { + // This needs to be done before we create a new subtarget since any + // creation will depend on the TM and the code generation flags on the + // function that reside in TargetOptions. 
+ resetTargetOptions(F); + I = llvm::make_unique<HexagonSubtarget>(TargetTriple, CPU, FS, *this); + } + return I.get(); +} + +TargetIRAnalysis HexagonTargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(HexagonTTIImpl(this, F)); + }); } + HexagonTargetMachine::~HexagonTargetMachine() {} namespace { @@ -166,7 +224,7 @@ bool HexagonPassConfig::addInstSelector() { bool NoOpt = (getOptLevel() == CodeGenOpt::None); if (!NoOpt) - addPass(createHexagonRemoveExtendArgs(TM)); + addPass(createHexagonOptimizeSZextends()); addPass(createHexagonISelDag(TM, getOptLevel())); @@ -174,19 +232,33 @@ bool HexagonPassConfig::addInstSelector() { // Create logical operations on predicate registers. if (EnableGenPred) addPass(createHexagonGenPredicate(), false); + // Rotate loops to expose bit-simplification opportunities. + if (EnableLoopResched) + addPass(createHexagonLoopRescheduling(), false); + // Split double registers. + if (!DisableHSDR) + addPass(createHexagonSplitDoubleRegs()); + // Bit simplification. + if (EnableBitSimplify) + addPass(createHexagonBitSimplify(), false); addPass(createHexagonPeephole()); printAndVerify("After hexagon peephole pass"); if (EnableGenInsert) addPass(createHexagonGenInsert(), false); + if (EnableEarlyIf) + addPass(createHexagonEarlyIfConversion(), false); } return false; } void HexagonPassConfig::addPreRegAlloc() { - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) { + if (!DisableStoreWidening) + addPass(createHexagonStoreWidening(), false); if (!DisableHardwareLoops) addPass(createHexagonHardwareLoops(), false); + } } void HexagonPassConfig::addPostRegAlloc() { @@ -215,6 +287,13 @@ void HexagonPassConfig::addPreEmitPass() { if (!NoOpt) { if (!DisableHardwareLoops) addPass(createHexagonFixupHwLoops(), false); + // Generate MUX from pairs of conditional transfers. + if (EnableGenMux) + addPass(createHexagonGenMux(), false); + addPass(createHexagonPacketizer(), false); } + + // Add CFI instructions if necessary. 
+ addPass(createHexagonCallFrameInformation(), false); } diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h index 115eadb..968814b 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetMachine.h @@ -16,6 +16,7 @@ #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" +#include "HexagonTargetObjectFile.h" #include "llvm/Target/TargetMachine.h" namespace llvm { @@ -24,7 +25,7 @@ class Module; class HexagonTargetMachine : public LLVMTargetMachine { std::unique_ptr<TargetLoweringObjectFile> TLOF; - HexagonSubtarget Subtarget; + mutable StringMap<std::unique_ptr<HexagonSubtarget>> SubtargetMap; public: HexagonTargetMachine(const Target &T, const Triple &TT, StringRef CPU, @@ -32,20 +33,18 @@ public: Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL); ~HexagonTargetMachine() override; - const HexagonSubtarget *getSubtargetImpl(const Function &) const override { - return &Subtarget; - } + const HexagonSubtarget *getSubtargetImpl(const Function &F) const override; + static unsigned getModuleMatchQuality(const Module &M); TargetPassConfig *createPassConfig(PassManagerBase &PM) override; + TargetIRAnalysis getTargetIRAnalysis() override; - TargetLoweringObjectFile *getObjFileLowering() const override { - return TLOF.get(); + HexagonTargetObjectFile *getObjFileLowering() const override { + return static_cast<HexagonTargetObjectFile*>(TLOF.get()); } }; -extern bool flag_aligned_memcpy; - } // end namespace llvm #endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index 4ea0e0d..ccca620 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -73,9 +73,10 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM, if (!GVA) return false; - if (Kind.isBSS() || Kind.isDataNoRel() || Kind.isCommon()) { + if (Kind.isBSS() || Kind.isData() || Kind.isCommon()) { Type *Ty = GV->getType()->getElementType(); - return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty)); + return IsInSmallSection( + GV->getParent()->getDataLayout().getTypeAllocSize(Ty)); } return false; @@ -89,7 +90,7 @@ HexagonTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, // Handle Small Section classification here. if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind)) return SmallBSSSection; - if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind)) + if (Kind.isData() && IsGlobalInSmallSection(GV, TM, Kind)) return SmallDataSection; // Otherwise, we work the same as ELF. diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp new file mode 100644 index 0000000..a05443e --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -0,0 +1,38 @@ +//===-- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// Hexagon target machine. 
It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#include "HexagonTargetTransformInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "hexagontti" + +TargetTransformInfo::PopcntSupportKind +HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { + // Return Fast Hardware support as every input < 64 bits will be promoted + // to 64 bits. + return TargetTransformInfo::PSK_FastHardware; +} + +// The Hexagon target can unroll loops with run-time trip counts. +void HexagonTTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { + UP.Runtime = UP.Partial = true; +} + +unsigned HexagonTTIImpl::getNumberOfRegisters(bool vector) const { + return vector ? 0 : 32; +} diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h new file mode 100644 index 0000000..71ae17a --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -0,0 +1,70 @@ +//===-- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +/// This file implements a TargetTransformInfo analysis pass specific to the +/// Hexagon target machine. It uses the target's detailed information to provide +/// more precise answers to certain TTI queries, while letting the target +/// independent and default TTI implementations handle the rest. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_HEXAGON_HEXAGONTARGETTRANSFORMINFO_H + +#include "Hexagon.h" +#include "HexagonTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class HexagonTTIImpl : public BasicTTIImplBase<HexagonTTIImpl> { + typedef BasicTTIImplBase<HexagonTTIImpl> BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const HexagonSubtarget *ST; + const HexagonTargetLowering *TLI; + + const HexagonSubtarget *getST() const { return ST; } + const HexagonTargetLowering *getTLI() const { return TLI; } + +public: + explicit HexagonTTIImpl(const HexagonTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. + HexagonTTIImpl(const HexagonTTIImpl &Arg) + : BaseT(static_cast<const BaseT &>(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + HexagonTTIImpl(HexagonTTIImpl &&Arg) + : BaseT(std::move(static_cast<BaseT &>(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} + + /// \name Scalar TTI Implementations + /// @{ + + TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const; + + // The Hexagon target can unroll loops with run-time trip counts. 
+ void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + + /// @} + + /// \name Vector TTI Implementations + /// @{ + + unsigned getNumberOfRegisters(bool vector) const; + + /// @} +}; + +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index b91a3f6..8185054 100644 --- a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -16,35 +16,19 @@ // prune the dependence. // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/DFAPacketizer.h" -#include "Hexagon.h" -#include "HexagonMachineFunctionInfo.h" #include "HexagonRegisterInfo.h" #include "HexagonSubtarget.h" #include "HexagonTargetMachine.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/LatencyPriorityQueue.h" +#include "HexagonVLIWPacketizer.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/ScheduleDAG.h" -#include "llvm/CodeGen/ScheduleDAGInstrs.h" -#include "llvm/CodeGen/ScheduleHazardRecognizer.h" -#include "llvm/CodeGen/SchedulerRegistry.h" -#include "llvm/MC/MCInstrItineraries.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <map> #include <vector> @@ -52,9 +36,22 @@ using namespace llvm; #define DEBUG_TYPE "packets" +static cl::opt<bool> DisablePacketizer("disable-packetizer", cl::Hidden, + cl::ZeroOrMore, cl::init(false), + cl::desc("Disable Hexagon packetizer pass")); + static cl::opt<bool> PacketizeVolatiles("hexagon-packetize-volatiles", - cl::ZeroOrMore, cl::Hidden, cl::init(true), - cl::desc("Allow non-solo packetization of volatile memory references")); + cl::ZeroOrMore, cl::Hidden, cl::init(true), + cl::desc("Allow non-solo packetization of volatile memory references")); + +static cl::opt<bool> EnableGenAllInsnClass("enable-gen-insn", cl::init(false), + cl::Hidden, cl::ZeroOrMore, cl::desc("Generate all instruction with TC")); + +static cl::opt<bool> DisableVecDblNVStores("disable-vecdbl-nv-stores", + cl::init(false), cl::Hidden, cl::ZeroOrMore, + cl::desc("Disable vector double new-value-stores")); + +extern cl::opt<bool> ScheduleInlineAsm; namespace llvm { FunctionPass *createHexagonPacketizer(); @@ -64,7 +61,6 @@ namespace llvm { namespace { class HexagonPacketizer : public MachineFunctionPass { - public: static char ID; HexagonPacketizer() : MachineFunctionPass(ID) { @@ -73,103 +69,25 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<MachineDominatorTree>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<MachineBranchProbabilityInfo>(); - AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineDominatorTree>(); AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineDominatorTree>(); AU.addPreserved<MachineLoopInfo>(); 
MachineFunctionPass::getAnalysisUsage(AU); } - const char *getPassName() const override { return "Hexagon Packetizer"; } - bool runOnMachineFunction(MachineFunction &Fn) override; - }; - char HexagonPacketizer::ID = 0; - - class HexagonPacketizerList : public VLIWPacketizerList { private: - - // Has the instruction been promoted to a dot-new instruction. - bool PromotedToDotNew; - - // Has the instruction been glued to allocframe. - bool GlueAllocframeStore; - - // Has the feeder instruction been glued to new value jump. - bool GlueToNewValueJump; - - // Check if there is a dependence between some instruction already in this - // packet and this instruction. - bool Dependence; - - // Only check for dependence if there are resources available to - // schedule this instruction. - bool FoundSequentialDependence; - - /// \brief A handle to the branch probability pass. - const MachineBranchProbabilityInfo *MBPI; - - // Track MIs with ignored dependece. - std::vector<MachineInstr*> IgnoreDepMIs; - - public: - // Ctor. - HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, - const MachineBranchProbabilityInfo *MBPI); - - // initPacketizerState - initialize some internal flags. - void initPacketizerState() override; - - // ignorePseudoInstruction - Ignore bundling of pseudo instructions. - bool ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) override; - - // isSoloInstruction - return true if instruction MI can not be packetized - // with any other instruction, which means that MI itself is a packet. - bool isSoloInstruction(MachineInstr *MI) override; - - // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ - // together. - bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override; - - // isLegalToPruneDependencies - Is it legal to prune dependece between SUI - // and SUJ. 
- bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override; - - MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override; - private: - bool IsCallDependent(MachineInstr* MI, SDep::Kind DepType, unsigned DepReg); - bool PromoteToDotNew(MachineInstr* MI, SDep::Kind DepType, - MachineBasicBlock::iterator &MII, - const TargetRegisterClass* RC); - bool CanPromoteToDotNew(MachineInstr *MI, SUnit *PacketSU, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit, - MachineBasicBlock::iterator &MII, - const TargetRegisterClass *RC); - bool - CanPromoteToNewValue(MachineInstr *MI, SUnit *PacketSU, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit, - MachineBasicBlock::iterator &MII); - bool CanPromoteToNewValueStore( - MachineInstr *MI, MachineInstr *PacketMI, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit); - bool DemoteToDotOld(MachineInstr *MI); - bool ArePredicatesComplements( - MachineInstr *MI1, MachineInstr *MI2, - const std::map<MachineInstr *, SUnit *> &MIToSUnit); - bool RestrictingDepExistInPacket(MachineInstr *, unsigned, - const std::map<MachineInstr *, SUnit *> &); - bool isNewifiable(MachineInstr* MI); - bool isCondInst(MachineInstr* MI); - bool tryAllocateResourcesForConstExt(MachineInstr* MI); - bool canReserveResourcesForConstExt(MachineInstr *MI); - void reserveResourcesForConstExt(MachineInstr* MI); - bool isNewValueInst(MachineInstr* MI); + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; }; + + char HexagonPacketizer::ID = 0; } INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer", @@ -177,26 +95,93 @@ INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer", INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(HexagonPacketizer, "packets", "Hexagon Packetizer", false, false) -// HexagonPacketizerList Ctor. -HexagonPacketizerList::HexagonPacketizerList( - MachineFunction &MF, MachineLoopInfo &MLI, - const MachineBranchProbabilityInfo *MBPI) - : VLIWPacketizerList(MF, MLI, true) { - this->MBPI = MBPI; +HexagonPacketizerList::HexagonPacketizerList(MachineFunction &MF, + MachineLoopInfo &MLI, AliasAnalysis *AA, + const MachineBranchProbabilityInfo *MBPI) + : VLIWPacketizerList(MF, MLI, AA), MBPI(MBPI), MLI(&MLI) { + HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); } -bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) { - const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo(); - MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); - const MachineBranchProbabilityInfo *MBPI = - &getAnalysis<MachineBranchProbabilityInfo>(); +// Check if FirstI modifies a register that SecondI reads. 
+static bool hasWriteToReadDep(const MachineInstr *FirstI, + const MachineInstr *SecondI, const TargetRegisterInfo *TRI) { + for (auto &MO : FirstI->operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned R = MO.getReg(); + if (SecondI->readsRegister(R, TRI)) + return true; + } + return false; +} + + +static MachineBasicBlock::iterator moveInstrOut(MachineInstr *MI, + MachineBasicBlock::iterator BundleIt, bool Before) { + MachineBasicBlock::instr_iterator InsertPt; + if (Before) + InsertPt = BundleIt.getInstrIterator(); + else + InsertPt = std::next(BundleIt).getInstrIterator(); + + MachineBasicBlock &B = *MI->getParent(); + // The instruction should at least be bundled with the preceding instruction + // (there will always be one, i.e. BUNDLE, if nothing else). + assert(MI->isBundledWithPred()); + if (MI->isBundledWithSucc()) { + MI->clearFlag(MachineInstr::BundledSucc); + MI->clearFlag(MachineInstr::BundledPred); + } else { + // If it's not bundled with the successor (i.e. it is the last one + // in the bundle), then we can simply unbundle it from the predecessor, + // which will take care of updating the predecessor's flag. + MI->unbundleFromPred(); + } + B.splice(InsertPt, &B, MI); + + // Get the size of the bundle without asserting. + MachineBasicBlock::const_instr_iterator I(BundleIt); + MachineBasicBlock::const_instr_iterator E = B.instr_end(); + unsigned Size = 0; + for (++I; I != E && I->isBundledWithPred(); ++I) + ++Size; + + // If there are still two or more instructions, then there is nothing + // else to be done. + if (Size > 1) + return BundleIt; + + // Otherwise, extract the single instruction out and delete the bundle. + MachineBasicBlock::iterator NextIt = std::next(BundleIt); + MachineInstr *SingleI = BundleIt->getNextNode(); + SingleI->unbundleFromPred(); + assert(!SingleI->isBundledWithSucc()); + BundleIt->eraseFromParent(); + return NextIt; +} + + +bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) { + if (DisablePacketizer) + return false; + + HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo(); + HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo(); + auto &MLI = getAnalysis<MachineLoopInfo>(); + auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + auto *MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); + + if (EnableGenAllInsnClass) + HII->genAllInsnTimingClasses(MF); + // Instantiate the packetizer. - HexagonPacketizerList Packetizer(Fn, MLI, MBPI); + HexagonPacketizerList Packetizer(MF, MLI, AA, MBPI); // DFA state table should not be empty. assert(Packetizer.getResourceTracker() && "Empty DFA table!"); @@ -211,162 +196,107 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &Fn) { // dependence between Insn 0 and Insn 2. This can lead to incorrect // packetization // - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - MachineBasicBlock::iterator End = MBB->end(); - MachineBasicBlock::iterator MI = MBB->begin(); + for (auto &MB : MF) { + auto End = MB.end(); + auto MI = MB.begin(); while (MI != End) { + auto NextI = std::next(MI); if (MI->isKill()) { - MachineBasicBlock::iterator DeleteMI = MI; - ++MI; - MBB->erase(DeleteMI); - End = MBB->end(); - continue; + MB.erase(MI); + End = MB.end(); } - ++MI; + MI = NextI; } } // Loop over all of the basic blocks. - for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end(); - MBB != MBBe; ++MBB) { - // Find scheduling regions and schedule / packetize each region. 
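An aside on hasWriteToReadDep and moveInstrOut above: the only information the placement decision needs is whether the instruction being pulled out of a bundle writes a register that a remaining bundle member reads. Below is a minimal standalone sketch of that test using toy register sets rather than the LLVM MachineInstr API; Instr, writesWhatOtherReads and the register numbers are illustrative only.

// Standalone sketch (not LLVM API): decide whether an instruction pulled out
// of a bundle must be placed after it, i.e. whether it writes a register that
// some instruction still inside the bundle reads.
#include <algorithm>
#include <iostream>
#include <set>
#include <vector>

struct Instr {
  std::set<int> Defs; // registers written
  std::set<int> Uses; // registers read
};

// Mirrors the idea of hasWriteToReadDep: true if First defines a register
// that Second reads.
static bool writesWhatOtherReads(const Instr &First, const Instr &Second) {
  for (int R : First.Defs)
    if (Second.Uses.count(R))
      return true;
  return false;
}

int main() {
  Instr Asm{{5}, {}};      // hypothetical inline asm defining r5
  std::vector<Instr> Bundle;
  Bundle.push_back({{1}, {2}});
  Bundle.push_back({{3}, {5}}); // this member reads r5
  bool After = std::any_of(Bundle.begin(), Bundle.end(), [&](const Instr &I) {
    return writesWhatOtherReads(Asm, I);
  });
  std::cout << (After ? "place after bundle\n" : "place before bundle\n");
}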
- unsigned RemainingCount = MBB->size(); - for(MachineBasicBlock::iterator RegionEnd = MBB->end(); - RegionEnd != MBB->begin();) { - // The next region starts above the previous region. Look backward in the - // instruction stream until we find the nearest boundary. - MachineBasicBlock::iterator I = RegionEnd; - for(;I != MBB->begin(); --I, --RemainingCount) { - if (TII->isSchedulingBoundary(std::prev(I), MBB, Fn)) - break; - } - I = MBB->begin(); - - // Skip empty scheduling regions. - if (I == RegionEnd) { - RegionEnd = std::prev(RegionEnd); - --RemainingCount; - continue; - } - // Skip regions with one instruction. - if (I == std::prev(RegionEnd)) { - RegionEnd = std::prev(RegionEnd); - continue; - } - - Packetizer.PacketizeMIs(MBB, I, RegionEnd); - RegionEnd = I; + for (auto &MB : MF) { + auto Begin = MB.begin(), End = MB.end(); + while (Begin != End) { + // First the first non-boundary starting from the end of the last + // scheduling region. + MachineBasicBlock::iterator RB = Begin; + while (RB != End && HII->isSchedulingBoundary(RB, &MB, MF)) + ++RB; + // First the first boundary starting from the beginning of the new + // region. + MachineBasicBlock::iterator RE = RB; + while (RE != End && !HII->isSchedulingBoundary(RE, &MB, MF)) + ++RE; + // Add the scheduling boundary if it's not block end. + if (RE != End) + ++RE; + // If RB == End, then RE == End. + if (RB != End) + Packetizer.PacketizeMIs(&MB, RB, RE); + + Begin = RE; } } + Packetizer.unpacketizeSoloInstrs(MF); return true; } -static bool IsIndirectCall(MachineInstr* MI) { - return MI->getOpcode() == Hexagon::J2_callr; +// Reserve resources for a constant extender. Trigger an assertion if the +// reservation fails. +void HexagonPacketizerList::reserveResourcesForConstExt() { + if (!tryAllocateResourcesForConstExt(true)) + llvm_unreachable("Resources not available"); } -// Reserve resources for constant extender. Trigure an assertion if -// reservation fail. -void HexagonPacketizerList::reserveResourcesForConstExt(MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - MachineInstr *PseudoMI = MF.CreateMachineInstr(QII->get(Hexagon::A4_ext), - MI->getDebugLoc()); - - if (ResourceTracker->canReserveResources(PseudoMI)) { - ResourceTracker->reserveResources(PseudoMI); - MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); - } else { - MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); - llvm_unreachable("can not reserve resources for constant extender."); - } - return; +bool HexagonPacketizerList::canReserveResourcesForConstExt() { + return tryAllocateResourcesForConstExt(false); } -bool HexagonPacketizerList::canReserveResourcesForConstExt(MachineInstr *MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - assert((QII->isExtended(MI) || QII->isConstExtended(MI)) && - "Should only be called for constant extended instructions"); - MachineInstr *PseudoMI = MF.CreateMachineInstr(QII->get(Hexagon::A4_ext), - MI->getDebugLoc()); - bool CanReserve = ResourceTracker->canReserveResources(PseudoMI); - MF.DeleteMachineInstr(PseudoMI); - return CanReserve; +// Allocate resources (i.e. 4 bytes) for constant extender. If succeeded, +// return true, otherwise, return false. 
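The rewritten region scan in runOnMachineFunction above replaces the backward walk over RegionEnd with a forward two-pointer scan: skip leading scheduling boundaries, extend the region up to and including the next boundary, packetize that region, repeat. Below is a minimal standalone sketch of the same scan over toy data; isBoundary and the instruction strings are stand-ins, not LLVM calls.

// Standalone sketch (toy data, not LLVM API) of the region scan used above:
// skip leading scheduling boundaries, then extend the region up to and
// including the next boundary, and hand each region to the packetizer.
#include <iostream>
#include <string>
#include <vector>

static bool isBoundary(const std::string &I) {
  return I == "call" || I == "barrier"; // stand-ins for scheduling boundaries
}

int main() {
  std::vector<std::string> Block = {"add", "mul", "call", "ld",
                                    "st",  "barrier", "sub"};
  auto Begin = Block.begin(), End = Block.end();
  while (Begin != End) {
    // Find the first non-boundary starting from the end of the last region.
    auto RB = Begin;
    while (RB != End && isBoundary(*RB))
      ++RB;
    // Find the first boundary starting from the beginning of the new region.
    auto RE = RB;
    while (RE != End && !isBoundary(*RE))
      ++RE;
    // Include the boundary itself if we are not at the block end.
    if (RE != End)
      ++RE;
    if (RB != End) {
      std::cout << "region:";
      for (auto I = RB; I != RE; ++I)
        std::cout << ' ' << *I;
      std::cout << '\n';
    }
    Begin = RE;
  }
}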
+bool HexagonPacketizerList::tryAllocateResourcesForConstExt(bool Reserve) { + auto *ExtMI = MF.CreateMachineInstr(HII->get(Hexagon::A4_ext), DebugLoc()); + bool Avail = ResourceTracker->canReserveResources(ExtMI); + if (Reserve && Avail) + ResourceTracker->reserveResources(ExtMI); + MF.DeleteMachineInstr(ExtMI); + return Avail; } -// Allocate resources (i.e. 4 bytes) for constant extender. If succeed, return -// true, otherwise, return false. -bool HexagonPacketizerList::tryAllocateResourcesForConstExt(MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - MachineInstr *PseudoMI = MF.CreateMachineInstr(QII->get(Hexagon::A4_ext), - MI->getDebugLoc()); - if (ResourceTracker->canReserveResources(PseudoMI)) { - ResourceTracker->reserveResources(PseudoMI); - MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); +bool HexagonPacketizerList::isCallDependent(const MachineInstr* MI, + SDep::Kind DepType, unsigned DepReg) { + // Check for LR dependence. + if (DepReg == HRI->getRARegister()) return true; - } else { - MI->getParent()->getParent()->DeleteMachineInstr(PseudoMI); - return false; - } -} - - -bool HexagonPacketizerList::IsCallDependent(MachineInstr* MI, - SDep::Kind DepType, - unsigned DepReg) { - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - const HexagonRegisterInfo *QRI = - (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo(); - - // Check for lr dependence - if (DepReg == QRI->getRARegister()) { - return true; - } - if (QII->isDeallocRet(MI)) { - if (DepReg == QRI->getFrameRegister() || - DepReg == QRI->getStackRegister()) + if (HII->isDeallocRet(MI)) + if (DepReg == HRI->getFrameRegister() || DepReg == HRI->getStackRegister()) return true; - } - // Check if this is a predicate dependence - const TargetRegisterClass* RC = QRI->getMinimalPhysRegClass(DepReg); - if (RC == &Hexagon::PredRegsRegClass) { + // Check if this is a predicate dependence. + const TargetRegisterClass* RC = HRI->getMinimalPhysRegClass(DepReg); + if (RC == &Hexagon::PredRegsRegClass) return true; - } - // - // Lastly check for an operand used in an indirect call - // If we had an attribute for checking if an instruction is an indirect call, - // then we could have avoided this relatively brittle implementation of - // IsIndirectCall() - // - // Assumes that the first operand of the CALLr is the function address - // - if (IsIndirectCall(MI) && (DepType == SDep::Data)) { + // Assumes that the first operand of the CALLr is the function address. 
+ if (HII->isIndirectCall(MI) && (DepType == SDep::Data)) { MachineOperand MO = MI->getOperand(0); - if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) { + if (MO.isReg() && MO.isUse() && (MO.getReg() == DepReg)) return true; - } } return false; } -static bool IsRegDependence(const SDep::Kind DepType) { - return (DepType == SDep::Data || DepType == SDep::Anti || - DepType == SDep::Output); +static bool isRegDependence(const SDep::Kind DepType) { + return DepType == SDep::Data || DepType == SDep::Anti || + DepType == SDep::Output; } -static bool IsDirectJump(MachineInstr* MI) { - return (MI->getOpcode() == Hexagon::J2_jump); +static bool isDirectJump(const MachineInstr* MI) { + return MI->getOpcode() == Hexagon::J2_jump; } -static bool IsSchedBarrier(MachineInstr* MI) { +static bool isSchedBarrier(const MachineInstr* MI) { switch (MI->getOpcode()) { case Hexagon::Y2_barrier: return true; @@ -374,76 +304,127 @@ static bool IsSchedBarrier(MachineInstr* MI) { return false; } -static bool IsControlFlow(MachineInstr* MI) { +static bool isControlFlow(const MachineInstr* MI) { return (MI->getDesc().isTerminator() || MI->getDesc().isCall()); } -static bool IsLoopN(MachineInstr *MI) { - return (MI->getOpcode() == Hexagon::J2_loop0i || - MI->getOpcode() == Hexagon::J2_loop0r); -} -/// DoesModifyCalleeSavedReg - Returns true if the instruction modifies a -/// callee-saved register. -static bool DoesModifyCalleeSavedReg(MachineInstr *MI, +/// Returns true if the instruction modifies a callee-saved register. +static bool doesModifyCalleeSavedReg(const MachineInstr *MI, const TargetRegisterInfo *TRI) { - for (const MCPhysReg *CSR = - TRI->getCalleeSavedRegs(MI->getParent()->getParent()); - *CSR; ++CSR) { - unsigned CalleeSavedReg = *CSR; - if (MI->modifiesRegister(CalleeSavedReg, TRI)) + const MachineFunction &MF = *MI->getParent()->getParent(); + for (auto *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR) + if (MI->modifiesRegister(*CSR, TRI)) return true; - } return false; } -// Returns true if an instruction can be promoted to .new predicate -// or new-value store. -bool HexagonPacketizerList::isNewifiable(MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - return isCondInst(MI) || QII->mayBeNewStore(MI); +// TODO: MI->isIndirectBranch() and IsRegisterJump(MI) +// Returns true if an instruction can be promoted to .new predicate or +// new-value store. +bool HexagonPacketizerList::isNewifiable(const MachineInstr* MI) { + return HII->isCondInst(MI) || MI->isReturn() || HII->mayBeNewStore(MI); } -bool HexagonPacketizerList::isCondInst (MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - const MCInstrDesc& TID = MI->getDesc(); - // bug 5670: until that is fixed, - // this portion is disabled. - if ( TID.isConditionalBranch() // && !IsRegisterJump(MI)) || - || QII->isConditionalTransfer(MI) - || QII->isConditionalALU32(MI) - || QII->isConditionalLoad(MI) - || QII->isConditionalStore(MI)) { - return true; +// Promote an instructiont to its .cur form. +// At this time, we have already made a call to canPromoteToDotCur and made +// sure that it can *indeed* be promoted. 
+bool HexagonPacketizerList::promoteToDotCur(MachineInstr* MI, + SDep::Kind DepType, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC) { + assert(DepType == SDep::Data); + int CurOpcode = HII->getDotCurOp(MI); + MI->setDesc(HII->get(CurOpcode)); + return true; +} + +void HexagonPacketizerList::cleanUpDotCur() { + MachineInstr *MI = NULL; + for (auto BI : CurrentPacketMIs) { + DEBUG(dbgs() << "Cleanup packet has "; BI->dump();); + if (BI->getOpcode() == Hexagon::V6_vL32b_cur_ai) { + MI = BI; + continue; + } + if (MI) { + for (auto &MO : BI->operands()) + if (MO.isReg() && MO.getReg() == MI->getOperand(0).getReg()) + return; + } } - return false; + if (!MI) + return; + // We did not find a use of the CUR, so de-cur it. + MI->setDesc(HII->get(Hexagon::V6_vL32b_ai)); + DEBUG(dbgs() << "Demoted CUR "; MI->dump();); } +// Check to see if an instruction can be dot cur. +bool HexagonPacketizerList::canPromoteToDotCur(const MachineInstr *MI, + const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII, + const TargetRegisterClass *RC) { + if (!HII->isV60VectorInstruction(MI)) + return false; + if (!HII->isV60VectorInstruction(MII)) + return false; -// Promote an instructiont to its .new form. -// At this time, we have already made a call to CanPromoteToDotNew -// and made sure that it can *indeed* be promoted. -bool HexagonPacketizerList::PromoteToDotNew(MachineInstr* MI, - SDep::Kind DepType, MachineBasicBlock::iterator &MII, - const TargetRegisterClass* RC) { + // Already a dot new instruction. + if (HII->isDotCurInst(MI) && !HII->mayBeCurLoad(MI)) + return false; - assert (DepType == SDep::Data); - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + if (!HII->mayBeCurLoad(MI)) + return false; + + // The "cur value" cannot come from inline asm. + if (PacketSU->getInstr()->isInlineAsm()) + return false; + + // Make sure candidate instruction uses cur. + DEBUG(dbgs() << "Can we DOT Cur Vector MI\n"; + MI->dump(); + dbgs() << "in packet\n";); + MachineInstr *MJ = MII; + DEBUG(dbgs() << "Checking CUR against "; MJ->dump();); + unsigned DestReg = MI->getOperand(0).getReg(); + bool FoundMatch = false; + for (auto &MO : MJ->operands()) + if (MO.isReg() && MO.getReg() == DestReg) + FoundMatch = true; + if (!FoundMatch) + return false; + + // Check for existing uses of a vector register within the packet which + // would be affected by converting a vector load into .cur formt. + for (auto BI : CurrentPacketMIs) { + DEBUG(dbgs() << "packet has "; BI->dump();); + if (BI->readsRegister(DepReg, MF.getSubtarget().getRegisterInfo())) + return false; + } + + DEBUG(dbgs() << "Can Dot CUR MI\n"; MI->dump();); + // We can convert the opcode into a .cur. + return true; +} +// Promote an instruction to its .new form. At this time, we have already +// made a call to canPromoteToDotNew and made sure that it can *indeed* be +// promoted. 
+bool HexagonPacketizerList::promoteToDotNew(MachineInstr* MI, + SDep::Kind DepType, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC) { + assert (DepType == SDep::Data); int NewOpcode; if (RC == &Hexagon::PredRegsRegClass) - NewOpcode = QII->GetDotNewPredOp(MI, MBPI); + NewOpcode = HII->getDotNewPredOp(MI, MBPI); else - NewOpcode = QII->GetDotNewOp(MI); - MI->setDesc(QII->get(NewOpcode)); - + NewOpcode = HII->getDotNewOp(MI); + MI->setDesc(HII->get(NewOpcode)); return true; } -bool HexagonPacketizerList::DemoteToDotOld(MachineInstr* MI) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - int NewOpcode = QII->GetDotOldOp(MI->getOpcode()); - MI->setDesc(QII->get(NewOpcode)); +bool HexagonPacketizerList::demoteToDotOld(MachineInstr* MI) { + int NewOpcode = HII->getDotOldOp(MI->getOpcode()); + MI->setDesc(HII->get(NewOpcode)); return true; } @@ -455,175 +436,173 @@ enum PredicateKind { /// Returns true if an instruction is predicated on p0 and false if it's /// predicated on !p0. -static PredicateKind getPredicateSense(MachineInstr* MI, - const HexagonInstrInfo *QII) { - if (!QII->isPredicated(MI)) +static PredicateKind getPredicateSense(const MachineInstr *MI, + const HexagonInstrInfo *HII) { + if (!HII->isPredicated(MI)) return PK_Unknown; - - if (QII->isPredicatedTrue(MI)) + if (HII->isPredicatedTrue(MI)) return PK_True; - return PK_False; } -static MachineOperand& GetPostIncrementOperand(MachineInstr *MI, - const HexagonInstrInfo *QII) { - assert(QII->isPostIncrement(MI) && "Not a post increment operation."); +static const MachineOperand &getPostIncrementOperand(const MachineInstr *MI, + const HexagonInstrInfo *HII) { + assert(HII->isPostIncrement(MI) && "Not a post increment operation."); #ifndef NDEBUG // Post Increment means duplicates. Use dense map to find duplicates in the // list. Caution: Densemap initializes with the minimum of 64 buckets, // whereas there are at most 5 operands in the post increment. - DenseMap<unsigned, unsigned> DefRegsSet; - for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) - if (MI->getOperand(opNum).isReg() && - MI->getOperand(opNum).isDef()) { - DefRegsSet[MI->getOperand(opNum).getReg()] = 1; - } - - for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) - if (MI->getOperand(opNum).isReg() && - MI->getOperand(opNum).isUse()) { - if (DefRegsSet[MI->getOperand(opNum).getReg()]) { - return MI->getOperand(opNum); - } - } + DenseSet<unsigned> DefRegsSet; + for (auto &MO : MI->operands()) + if (MO.isReg() && MO.isDef()) + DefRegsSet.insert(MO.getReg()); + + for (auto &MO : MI->operands()) + if (MO.isReg() && MO.isUse() && DefRegsSet.count(MO.getReg())) + return MO; #else - if (MI->getDesc().mayLoad()) { + if (MI->mayLoad()) { + const MachineOperand &Op1 = MI->getOperand(1); // The 2nd operand is always the post increment operand in load. - assert(MI->getOperand(1).isReg() && - "Post increment operand has be to a register."); - return (MI->getOperand(1)); + assert(Op1.isReg() && "Post increment operand has be to a register."); + return Op1; } if (MI->getDesc().mayStore()) { + const MachineOperand &Op0 = MI->getOperand(0); // The 1st operand is always the post increment operand in store. - assert(MI->getOperand(0).isReg() && - "Post increment operand has be to a register."); - return (MI->getOperand(0)); + assert(Op0.isReg() && "Post increment operand has be to a register."); + return Op0; } #endif // we should never come here. 
llvm_unreachable("mayLoad or mayStore not set for Post Increment operation"); } -// get the value being stored -static MachineOperand& GetStoreValueOperand(MachineInstr *MI) { +// Get the value being stored. +static const MachineOperand& getStoreValueOperand(const MachineInstr *MI) { // value being stored is always the last operand. - return (MI->getOperand(MI->getNumOperands()-1)); + return MI->getOperand(MI->getNumOperands()-1); +} + +static bool isLoadAbsSet(const MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::L4_loadrd_ap: + case Hexagon::L4_loadrb_ap: + case Hexagon::L4_loadrh_ap: + case Hexagon::L4_loadrub_ap: + case Hexagon::L4_loadruh_ap: + case Hexagon::L4_loadri_ap: + return true; + } + return false; } -// can be new value store? +static const MachineOperand &getAbsSetOperand(const MachineInstr *MI) { + assert(isLoadAbsSet(MI)); + return MI->getOperand(1); +} + + +// Can be new value store? // Following restrictions are to be respected in convert a store into // a new value store. // 1. If an instruction uses auto-increment, its address register cannot // be a new-value register. Arch Spec 5.4.2.1 -// 2. If an instruction uses absolute-set addressing mode, -// its address register cannot be a new-value register. -// Arch Spec 5.4.2.1.TODO: This is not enabled as -// as absolute-set address mode patters are not implemented. +// 2. If an instruction uses absolute-set addressing mode, its address +// register cannot be a new-value register. Arch Spec 5.4.2.1. // 3. If an instruction produces a 64-bit result, its registers cannot be used // as new-value registers. Arch Spec 5.4.2.2. -// 4. If the instruction that sets a new-value register is conditional, then +// 4. If the instruction that sets the new-value register is conditional, then // the instruction that uses the new-value register must also be conditional, // and both must always have their predicates evaluate identically. // Arch Spec 5.4.2.3. -// 5. There is an implied restriction of a packet can not have another store, -// if there is a new value store in the packet. Corollary, if there is +// 5. There is an implied restriction that a packet cannot have another store, +// if there is a new value store in the packet. Corollary: if there is // already a store in a packet, there can not be a new value store. // Arch Spec: 3.4.4.2 -bool HexagonPacketizerList::CanPromoteToNewValueStore( - MachineInstr *MI, MachineInstr *PacketMI, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; +bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr *MI, + const MachineInstr *PacketMI, unsigned DepReg) { // Make sure we are looking at the store, that can be promoted. - if (!QII->mayBeNewStore(MI)) + if (!HII->mayBeNewStore(MI)) return false; - // Make sure there is dependency and can be new value'ed - if (GetStoreValueOperand(MI).isReg() && - GetStoreValueOperand(MI).getReg() != DepReg) + // Make sure there is dependency and can be new value'd. 
+ const MachineOperand &Val = getStoreValueOperand(MI); + if (Val.isReg() && Val.getReg() != DepReg) return false; - const HexagonRegisterInfo *QRI = - (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo(); const MCInstrDesc& MCID = PacketMI->getDesc(); - // first operand is always the result - - const TargetRegisterClass* PacketRC = QII->getRegClass(MCID, 0, QRI, MF); - - // if there is already an store in the packet, no can do new value store - // Arch Spec 3.4.4.2. - for (std::vector<MachineInstr*>::iterator VI = CurrentPacketMIs.begin(), - VE = CurrentPacketMIs.end(); - (VI != VE); ++VI) { - SUnit *PacketSU = MIToSUnit.find(*VI)->second; - if (PacketSU->getInstr()->getDesc().mayStore() || - // if we have mayStore = 1 set on ALLOCFRAME and DEALLOCFRAME, - // then we don't need this - PacketSU->getInstr()->getOpcode() == Hexagon::S2_allocframe || - PacketSU->getInstr()->getOpcode() == Hexagon::L2_deallocframe) - return false; - } - if (PacketRC == &Hexagon::DoubleRegsRegClass) { - // new value store constraint: double regs can not feed into new value store - // arch spec section: 5.4.2.2 + // First operand is always the result. + const TargetRegisterClass *PacketRC = HII->getRegClass(MCID, 0, HRI, MF); + // Double regs can not feed into new value store: PRM section: 5.4.2.2. + if (PacketRC == &Hexagon::DoubleRegsRegClass) return false; + + // New-value stores are of class NV (slot 0), dual stores require class ST + // in slot 0 (PRM 5.5). + for (auto I : CurrentPacketMIs) { + SUnit *PacketSU = MIToSUnit.find(I)->second; + if (PacketSU->getInstr()->mayStore()) + return false; } // Make sure it's NOT the post increment register that we are going to // new value. - if (QII->isPostIncrement(MI) && - MI->getDesc().mayStore() && - GetPostIncrementOperand(MI, QII).getReg() == DepReg) { + if (HII->isPostIncrement(MI) && + getPostIncrementOperand(MI, HII).getReg() == DepReg) { return false; } - if (QII->isPostIncrement(PacketMI) && - PacketMI->getDesc().mayLoad() && - GetPostIncrementOperand(PacketMI, QII).getReg() == DepReg) { - // if source is post_inc, or absolute-set addressing, - // it can not feed into new value store - // r3 = memw(r2++#4) - // memw(r30 + #-1404) = r2.new -> can not be new value store - // arch spec section: 5.4.2.1 + if (HII->isPostIncrement(PacketMI) && PacketMI->mayLoad() && + getPostIncrementOperand(PacketMI, HII).getReg() == DepReg) { + // If source is post_inc, or absolute-set addressing, it can not feed + // into new value store + // r3 = memw(r2++#4) + // memw(r30 + #-1404) = r2.new -> can not be new value store + // arch spec section: 5.4.2.1. return false; } + if (isLoadAbsSet(PacketMI) && getAbsSetOperand(PacketMI).getReg() == DepReg) + return false; + // If the source that feeds the store is predicated, new value store must // also be predicated. - if (QII->isPredicated(PacketMI)) { - if (!QII->isPredicated(MI)) + if (HII->isPredicated(PacketMI)) { + if (!HII->isPredicated(MI)) return false; // Check to make sure that they both will have their predicates - // evaluate identically + // evaluate identically. 
unsigned predRegNumSrc = 0; unsigned predRegNumDst = 0; const TargetRegisterClass* predRegClass = nullptr; - // Get predicate register used in the source instruction - for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) { - if ( PacketMI->getOperand(opNum).isReg()) - predRegNumSrc = PacketMI->getOperand(opNum).getReg(); - predRegClass = QRI->getMinimalPhysRegClass(predRegNumSrc); - if (predRegClass == &Hexagon::PredRegsRegClass) { + // Get predicate register used in the source instruction. + for (auto &MO : PacketMI->operands()) { + if (!MO.isReg()) + continue; + predRegNumSrc = MO.getReg(); + predRegClass = HRI->getMinimalPhysRegClass(predRegNumSrc); + if (predRegClass == &Hexagon::PredRegsRegClass) break; - } } - assert ((predRegClass == &Hexagon::PredRegsRegClass ) && - ("predicate register not found in a predicated PacketMI instruction")); - - // Get predicate register used in new-value store instruction - for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) { - if ( MI->getOperand(opNum).isReg()) - predRegNumDst = MI->getOperand(opNum).getReg(); - predRegClass = QRI->getMinimalPhysRegClass(predRegNumDst); - if (predRegClass == &Hexagon::PredRegsRegClass) { + assert((predRegClass == &Hexagon::PredRegsRegClass) && + "predicate register not found in a predicated PacketMI instruction"); + + // Get predicate register used in new-value store instruction. + for (auto &MO : MI->operands()) { + if (!MO.isReg()) + continue; + predRegNumDst = MO.getReg(); + predRegClass = HRI->getMinimalPhysRegClass(predRegNumDst); + if (predRegClass == &Hexagon::PredRegsRegClass) break; - } } - assert ((predRegClass == &Hexagon::PredRegsRegClass ) && - ("predicate register not found in a predicated MI instruction")); + assert((predRegClass == &Hexagon::PredRegsRegClass) && + "predicate register not found in a predicated MI instruction"); // New-value register producer and user (store) need to satisfy these // constraints: @@ -632,13 +611,11 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( // should also be .new predicated and if producer is not .new predicated // then store should not be .new predicated. // 3) Both new-value register producer and user should have same predicate - // sense, i.e, either both should be negated or both should be none negated. - - if (( predRegNumDst != predRegNumSrc) || - QII->isDotNewInst(PacketMI) != QII->isDotNewInst(MI) || - getPredicateSense(MI, QII) != getPredicateSense(PacketMI, QII)) { + // sense, i.e, either both should be negated or both should be non-negated. + if (predRegNumDst != predRegNumSrc || + HII->isDotNewInst(PacketMI) != HII->isDotNewInst(MI) || + getPredicateSense(MI, HII) != getPredicateSense(PacketMI, HII)) return false; - } } // Make sure that other than the new-value register no other store instruction @@ -649,81 +626,77 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( // including PacketMI. Howerver, we need to perform the check for the // remaining instructions in the packet. - std::vector<MachineInstr*>::iterator VI; - std::vector<MachineInstr*>::iterator VE; unsigned StartCheck = 0; - for (VI=CurrentPacketMIs.begin(), VE = CurrentPacketMIs.end(); - (VI != VE); ++VI) { - SUnit *TempSU = MIToSUnit.find(*VI)->second; + for (auto I : CurrentPacketMIs) { + SUnit *TempSU = MIToSUnit.find(I)->second; MachineInstr* TempMI = TempSU->getInstr(); // Following condition is true for all the instructions until PacketMI is // reached (StartCheck is set to 0 before the for loop). 
// StartCheck flag is 1 for all the instructions after PacketMI. - if (TempMI != PacketMI && !StartCheck) // start processing only after - continue; // encountering PacketMI + if (TempMI != PacketMI && !StartCheck) // Start processing only after + continue; // encountering PacketMI. StartCheck = 1; - if (TempMI == PacketMI) // We don't want to check PacketMI for dependence + if (TempMI == PacketMI) // We don't want to check PacketMI for dependence. continue; - for(unsigned opNum = 0; opNum < MI->getNumOperands(); opNum++) { - if (MI->getOperand(opNum).isReg() && - TempSU->getInstr()->modifiesRegister(MI->getOperand(opNum).getReg(), - QRI)) + for (auto &MO : MI->operands()) + if (MO.isReg() && TempSU->getInstr()->modifiesRegister(MO.getReg(), HRI)) return false; - } } // Make sure that for non-POST_INC stores: // 1. The only use of reg is DepReg and no other registers. // This handles V4 base+index registers. // The following store can not be dot new. - // Eg. r0 = add(r0, #3)a + // Eg. r0 = add(r0, #3) // memw(r1+r0<<#2) = r0 - if (!QII->isPostIncrement(MI) && - GetStoreValueOperand(MI).isReg() && - GetStoreValueOperand(MI).getReg() == DepReg) { - for(unsigned opNum = 0; opNum < MI->getNumOperands()-1; opNum++) { - if (MI->getOperand(opNum).isReg() && - MI->getOperand(opNum).getReg() == DepReg) { - return false; - } - } - // 2. If data definition is because of implicit definition of the register, - // do not newify the store. Eg. - // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def> - // STrih_indexed %R8, 2, %R12<kill>; mem:ST2[%scevgep343] - for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) { - if (PacketMI->getOperand(opNum).isReg() && - PacketMI->getOperand(opNum).getReg() == DepReg && - PacketMI->getOperand(opNum).isDef() && - PacketMI->getOperand(opNum).isImplicit()) { + if (!HII->isPostIncrement(MI)) { + for (unsigned opNum = 0; opNum < MI->getNumOperands()-1; opNum++) { + const MachineOperand &MO = MI->getOperand(opNum); + if (MO.isReg() && MO.getReg() == DepReg) return false; - } } } + // If data definition is because of implicit definition of the register, + // do not newify the store. Eg. + // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def> + // S2_storerh_io %R8, 2, %R12<kill>; mem:ST2[%scevgep343] + for (auto &MO : PacketMI->operands()) { + if (!MO.isReg() || !MO.isDef() || !MO.isImplicit()) + continue; + unsigned R = MO.getReg(); + if (R == DepReg || HRI->isSuperRegister(DepReg, R)) + return false; + } + + // Handle imp-use of super reg case. There is a target independent side + // change that should prevent this situation but I am handling it for + // just-in-case. For example, we cannot newify R2 in the following case: + // %R3<def> = A2_tfrsi 0; + // S2_storeri_io %R0<kill>, 0, %R2<kill>, %D1<imp-use,kill>; + for (auto &MO : MI->operands()) { + if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == DepReg) + return false; + } + // Can be dot new store. return true; } -// can this MI to promoted to either -// new value store or new value jump -bool HexagonPacketizerList::CanPromoteToNewValue( - MachineInstr *MI, SUnit *PacketSU, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit, - MachineBasicBlock::iterator &MII) { - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - if (!QII->mayBeNewStore(MI)) +// Can this MI to promoted to either new value store or new value jump. 
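One of the checks above is easy to miss: for a non-post-increment store, the candidate register may only feed the stored value; if it also participates in the address, as in memw(r1+r0<<#2) = r0, the store cannot be newified. Below is a standalone sketch of just that rule with a toy store representation; ToyStore and canNewifyOn are illustrative names, not part of the patch.

// Standalone sketch (toy representation, not LLVM API) of the address-reuse
// rule: a store can only be newified on DepReg if DepReg feeds the stored
// value and nothing else.
#include <iostream>
#include <vector>

struct ToyStore {
  std::vector<int> AddrRegs; // registers used to form the address
  int ValueReg;              // register holding the stored value
};

static bool canNewifyOn(const ToyStore &S, int DepReg) {
  if (S.ValueReg != DepReg)
    return false;
  for (int R : S.AddrRegs)
    if (R == DepReg)
      return false; // DepReg also feeds the address: not allowed
  return true;
}

int main() {
  ToyStore Indexed{{1, 0}, 0}; // memw(r1+r0<<#2) = r0
  ToyStore Plain{{30}, 2};     // memw(r30+#-1404) = r2
  std::cout << canNewifyOn(Indexed, 0) << '\n'; // 0: r0 is part of the address
  std::cout << canNewifyOn(Plain, 2) << '\n';   // 1: r2 only feeds the value
}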
+bool HexagonPacketizerList::canPromoteToNewValue(const MachineInstr *MI, + const SUnit *PacketSU, unsigned DepReg, + MachineBasicBlock::iterator &MII) { + if (!HII->mayBeNewStore(MI)) return false; - MachineInstr *PacketMI = PacketSU->getInstr(); - // Check to see the store can be new value'ed. - if (CanPromoteToNewValueStore(MI, PacketMI, DepReg, MIToSUnit)) + MachineInstr *PacketMI = PacketSU->getInstr(); + if (canPromoteToNewValueStore(MI, PacketMI, DepReg)) return true; // Check to see the compare/jump can be new value'ed. @@ -731,93 +704,110 @@ bool HexagonPacketizerList::CanPromoteToNewValue( return false; } +static bool isImplicitDependency(const MachineInstr *I, unsigned DepReg) { + for (auto &MO : I->operands()) + if (MO.isReg() && MO.isDef() && (MO.getReg() == DepReg) && MO.isImplicit()) + return true; + return false; +} + // Check to see if an instruction can be dot new // There are three kinds. // 1. dot new on predicate - V2/V3/V4 // 2. dot new on stores NV/ST - V4 // 3. dot new on jump NV/J - V4 -- This is generated in a pass. -bool HexagonPacketizerList::CanPromoteToDotNew( - MachineInstr *MI, SUnit *PacketSU, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit, - MachineBasicBlock::iterator &MII, const TargetRegisterClass *RC) { - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; +bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr *MI, + const SUnit *PacketSU, unsigned DepReg, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC) { // Already a dot new instruction. - if (QII->isDotNewInst(MI) && !QII->mayBeNewStore(MI)) + if (HII->isDotNewInst(MI) && !HII->mayBeNewStore(MI)) return false; if (!isNewifiable(MI)) return false; + const MachineInstr *PI = PacketSU->getInstr(); + + // The "new value" cannot come from inline asm. + if (PI->isInlineAsm()) + return false; + + // IMPLICIT_DEFs won't materialize as real instructions, so .new makes no + // sense. + if (PI->isImplicitDef()) + return false; + + // If dependency is trough an implicitly defined register, we should not + // newify the use. + if (isImplicitDependency(PI, DepReg)) + return false; + + const MCInstrDesc& MCID = PI->getDesc(); + const TargetRegisterClass *VecRC = HII->getRegClass(MCID, 0, HRI, MF); + if (DisableVecDblNVStores && VecRC == &Hexagon::VecDblRegsRegClass) + return false; + // predicate .new - if (RC == &Hexagon::PredRegsRegClass && isCondInst(MI)) - return true; - else if (RC != &Hexagon::PredRegsRegClass && - !QII->mayBeNewStore(MI)) // MI is not a new-value store + // bug 5670: until that is fixed + // TODO: MI->isIndirectBranch() and IsRegisterJump(MI) + if (RC == &Hexagon::PredRegsRegClass) + if (HII->isCondInst(MI) || MI->isReturn()) + return HII->predCanBeUsedAsDotNew(PI, DepReg); + + if (RC != &Hexagon::PredRegsRegClass && !HII->mayBeNewStore(MI)) + return false; + + // Create a dot new machine instruction to see if resources can be + // allocated. If not, bail out now. + int NewOpcode = HII->getDotNewOp(MI); + const MCInstrDesc &D = HII->get(NewOpcode); + MachineInstr *NewMI = MF.CreateMachineInstr(D, DebugLoc()); + bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI); + MF.DeleteMachineInstr(NewMI); + if (!ResourcesAvailable) + return false; + + // New Value Store only. New Value Jump generated as a separate pass. + if (!canPromoteToNewValue(MI, PacketSU, DepReg, MII)) return false; - else { - // Create a dot new machine instruction to see if resources can be - // allocated. If not, bail out now. 
- int NewOpcode = QII->GetDotNewOp(MI); - const MCInstrDesc &desc = QII->get(NewOpcode); - DebugLoc dl; - MachineInstr *NewMI = - MI->getParent()->getParent()->CreateMachineInstr(desc, dl); - bool ResourcesAvailable = ResourceTracker->canReserveResources(NewMI); - MI->getParent()->getParent()->DeleteMachineInstr(NewMI); - - if (!ResourcesAvailable) - return false; - // new value store only - // new new value jump generated as a passes - if (!CanPromoteToNewValue(MI, PacketSU, DepReg, MIToSUnit, MII)) { - return false; - } - } return true; } -// Go through the packet instructions and search for anti dependency -// between them and DepReg from MI -// Consider this case: +// Go through the packet instructions and search for an anti dependency between +// them and DepReg from MI. Consider this case: // Trying to add // a) %R1<def> = TFRI_cdNotPt %P3, 2 // to this packet: // { -// b) %P0<def> = OR_pp %P3<kill>, %P0<kill> -// c) %P3<def> = TFR_PdRs %R23 -// d) %R1<def> = TFRI_cdnPt %P3, 4 +// b) %P0<def> = C2_or %P3<kill>, %P0<kill> +// c) %P3<def> = C2_tfrrp %R23 +// d) %R1<def> = C2_cmovenewit %P3, 4 // } // The P3 from a) and d) will be complements after // a)'s P3 is converted to .new form -// Anti Dep between c) and b) is irrelevant for this case -bool HexagonPacketizerList::RestrictingDepExistInPacket( - MachineInstr *MI, unsigned DepReg, - const std::map<MachineInstr *, SUnit *> &MIToSUnit) { - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; +// Anti-dep between c) and b) is irrelevant for this case +bool HexagonPacketizerList::restrictingDepExistInPacket(MachineInstr* MI, + unsigned DepReg) { SUnit *PacketSUDep = MIToSUnit.find(MI)->second; - for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(), - VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) { - + for (auto I : CurrentPacketMIs) { // We only care for dependencies to predicated instructions - if(!QII->isPredicated(*VIN)) continue; + if (!HII->isPredicated(I)) + continue; // Scheduling Unit for current insn in the packet - SUnit *PacketSU = MIToSUnit.find(*VIN)->second; + SUnit *PacketSU = MIToSUnit.find(I)->second; - // Look at dependencies between current members of the packet - // and predicate defining instruction MI. - // Make sure that dependency is on the exact register - // we care about. + // Look at dependencies between current members of the packet and + // predicate defining instruction MI. Make sure that dependency is + // on the exact register we care about. if (PacketSU->isSucc(PacketSUDep)) { for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) { - if ((PacketSU->Succs[i].getSUnit() == PacketSUDep) && - (PacketSU->Succs[i].getKind() == SDep::Anti) && - (PacketSU->Succs[i].getReg() == DepReg)) { + auto &Dep = PacketSU->Succs[i]; + if (Dep.getSUnit() == PacketSUDep && Dep.getKind() == SDep::Anti && + Dep.getReg() == DepReg) return true; - } } } } @@ -831,276 +821,362 @@ static unsigned getPredicatedRegister(MachineInstr *MI, const HexagonInstrInfo *QII) { /// We use the following rule: The first predicate register that is a use is /// the predicate register of a predicated instruction. 
- assert(QII->isPredicated(MI) && "Must be predicated instruction"); - for (MachineInstr::mop_iterator OI = MI->operands_begin(), - OE = MI->operands_end(); OI != OE; ++OI) { - MachineOperand &Op = *OI; + for (auto &Op : MI->operands()) { if (Op.isReg() && Op.getReg() && Op.isUse() && Hexagon::PredRegsRegClass.contains(Op.getReg())) return Op.getReg(); } llvm_unreachable("Unknown instruction operand layout"); - return 0; } // Given two predicated instructions, this function detects whether -// the predicates are complements -bool HexagonPacketizerList::ArePredicatesComplements( - MachineInstr *MI1, MachineInstr *MI2, - const std::map<MachineInstr *, SUnit *> &MIToSUnit) { - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - +// the predicates are complements. +bool HexagonPacketizerList::arePredicatesComplements(MachineInstr *MI1, + MachineInstr *MI2) { // If we don't know the predicate sense of the instructions bail out early, we // need it later. - if (getPredicateSense(MI1, QII) == PK_Unknown || - getPredicateSense(MI2, QII) == PK_Unknown) + if (getPredicateSense(MI1, HII) == PK_Unknown || + getPredicateSense(MI2, HII) == PK_Unknown) return false; - // Scheduling unit for candidate - SUnit *SU = MIToSUnit.find(MI1)->second; + // Scheduling unit for candidate. + SUnit *SU = MIToSUnit[MI1]; // One corner case deals with the following scenario: // Trying to add - // a) %R24<def> = TFR_cPt %P0, %R25 + // a) %R24<def> = A2_tfrt %P0, %R25 // to this packet: - // // { - // b) %R25<def> = TFR_cNotPt %P0, %R24 - // c) %P0<def> = CMPEQri %R26, 1 + // b) %R25<def> = A2_tfrf %P0, %R24 + // c) %P0<def> = C2_cmpeqi %R26, 1 // } // - // On general check a) and b) are complements, but - // presence of c) will convert a) to .new form, and - // then it is not a complement - // We attempt to detect it by analyzing existing - // dependencies in the packet + // On general check a) and b) are complements, but presence of c) will + // convert a) to .new form, and then it is not a complement. + // We attempt to detect it by analyzing existing dependencies in the packet. // Analyze relationships between all existing members of the packet. - // Look for Anti dependecy on the same predicate reg - // as used in the candidate - for (std::vector<MachineInstr*>::iterator VIN = CurrentPacketMIs.begin(), - VEN = CurrentPacketMIs.end(); (VIN != VEN); ++VIN) { - - // Scheduling Unit for current insn in the packet - SUnit *PacketSU = MIToSUnit.find(*VIN)->second; + // Look for Anti dependecy on the same predicate reg as used in the + // candidate. + for (auto I : CurrentPacketMIs) { + // Scheduling Unit for current insn in the packet. + SUnit *PacketSU = MIToSUnit.find(I)->second; // If this instruction in the packet is succeeded by the candidate... if (PacketSU->isSucc(SU)) { for (unsigned i = 0; i < PacketSU->Succs.size(); ++i) { - // The corner case exist when there is true data - // dependency between candidate and one of current - // packet members, this dep is on predicate reg, and - // there already exist anti dep on the same pred in + auto Dep = PacketSU->Succs[i]; + // The corner case exist when there is true data dependency between + // candidate and one of current packet members, this dep is on + // predicate reg, and there already exist anti dep on the same pred in // the packet. 
- if (PacketSU->Succs[i].getSUnit() == SU && - PacketSU->Succs[i].getKind() == SDep::Data && - Hexagon::PredRegsRegClass.contains( - PacketSU->Succs[i].getReg()) && - // Here I know that *VIN is predicate setting instruction - // with true data dep to candidate on the register - // we care about - c) in the above example. - // Now I need to see if there is an anti dependency - // from c) to any other instruction in the - // same packet on the pred reg of interest - RestrictingDepExistInPacket(*VIN,PacketSU->Succs[i].getReg(), - MIToSUnit)) { - return false; + if (Dep.getSUnit() == SU && Dep.getKind() == SDep::Data && + Hexagon::PredRegsRegClass.contains(Dep.getReg())) { + // Here I know that I is predicate setting instruction with true + // data dep to candidate on the register we care about - c) in the + // above example. Now I need to see if there is an anti dependency + // from c) to any other instruction in the same packet on the pred + // reg of interest. + if (restrictingDepExistInPacket(I, Dep.getReg())) + return false; } } } } - // If the above case does not apply, check regular - // complement condition. - // Check that the predicate register is the same and - // that the predicate sense is different - // We also need to differentiate .old vs. .new: - // !p0 is not complimentary to p0.new - unsigned PReg1 = getPredicatedRegister(MI1, QII); - unsigned PReg2 = getPredicatedRegister(MI2, QII); - return ((PReg1 == PReg2) && - Hexagon::PredRegsRegClass.contains(PReg1) && - Hexagon::PredRegsRegClass.contains(PReg2) && - (getPredicateSense(MI1, QII) != getPredicateSense(MI2, QII)) && - (QII->isDotNewInst(MI1) == QII->isDotNewInst(MI2))); + // If the above case does not apply, check regular complement condition. + // Check that the predicate register is the same and that the predicate + // sense is different We also need to differentiate .old vs. .new: !p0 + // is not complementary to p0.new. + unsigned PReg1 = getPredicatedRegister(MI1, HII); + unsigned PReg2 = getPredicatedRegister(MI2, HII); + return PReg1 == PReg2 && + Hexagon::PredRegsRegClass.contains(PReg1) && + Hexagon::PredRegsRegClass.contains(PReg2) && + getPredicateSense(MI1, HII) != getPredicateSense(MI2, HII) && + HII->isDotNewInst(MI1) == HII->isDotNewInst(MI2); } -// initPacketizerState - Initialize packetizer flags +// Initialize packetizer flags. void HexagonPacketizerList::initPacketizerState() { - Dependence = false; PromotedToDotNew = false; GlueToNewValueJump = false; GlueAllocframeStore = false; FoundSequentialDependence = false; - - return; } -// ignorePseudoInstruction - Ignore bundling of pseudo instructions. -bool HexagonPacketizerList::ignorePseudoInstruction(MachineInstr *MI, - MachineBasicBlock *MBB) { +// Ignore bundling of pseudo instructions. +bool HexagonPacketizerList::ignorePseudoInstruction(const MachineInstr *MI, + const MachineBasicBlock*) { if (MI->isDebugValue()) return true; if (MI->isCFIInstruction()) return false; - // We must print out inline assembly + // We must print out inline assembly. if (MI->isInlineAsm()) return false; - // We check if MI has any functional units mapped to it. - // If it doesn't, we ignore the instruction. + if (MI->isImplicitDef()) + return false; + + // We check if MI has any functional units mapped to it. If it doesn't, + // we ignore the instruction. 
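The regular complement condition at the end of arePredicatesComplements boils down to three comparisons: same predicate register, opposite sense, and matching .new/.old form. Below is a standalone sketch of that final test with a toy predicate descriptor; PredInfo and areComplements are illustrative, not LLVM types.

// Standalone sketch (toy types, not LLVM API) of the final complement test:
// two predicated instructions are complements only if they use the same
// predicate register, with opposite sense, and agree on .new vs .old.
#include <iostream>

struct PredInfo {
  unsigned PReg;  // predicate register number
  bool SenseTrue; // predicated on p (true) or !p (false)
  bool DotNew;    // uses the .new form of the predicate
};

static bool areComplements(const PredInfo &A, const PredInfo &B) {
  return A.PReg == B.PReg && A.SenseTrue != B.SenseTrue &&
         A.DotNew == B.DotNew;
}

int main() {
  PredInfo P0True{0, true, false}, P0False{0, false, false};
  PredInfo P0FalseNew{0, false, true};
  std::cout << areComplements(P0True, P0False) << '\n';    // 1: p0 vs !p0
  std::cout << areComplements(P0True, P0FalseNew) << '\n'; // 0: !p0.new differs
}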
const MCInstrDesc& TID = MI->getDesc(); - unsigned SchedClass = TID.getSchedClass(); - const InstrStage* IS = - ResourceTracker->getInstrItins()->beginStage(SchedClass); + auto *IS = ResourceTracker->getInstrItins()->beginStage(TID.getSchedClass()); unsigned FuncUnits = IS->getUnits(); return !FuncUnits; } -// isSoloInstruction: - Returns true for instructions that must be -// scheduled in their own packet. -bool HexagonPacketizerList::isSoloInstruction(MachineInstr *MI) { +bool HexagonPacketizerList::isSoloInstruction(const MachineInstr *MI) { if (MI->isEHLabel() || MI->isCFIInstruction()) return true; - if (MI->isInlineAsm()) + // Consider inline asm to not be a solo instruction by default. + // Inline asm will be put in a packet temporarily, but then it will be + // removed, and placed outside of the packet (before or after, depending + // on dependencies). This is to reduce the impact of inline asm as a + // "packet splitting" instruction. + if (MI->isInlineAsm() && !ScheduleInlineAsm) return true; // From Hexagon V4 Programmer's Reference Manual 3.4.4 Grouping constraints: // trap, pause, barrier, icinva, isync, and syncht are solo instructions. // They must not be grouped with other instructions in a packet. - if (IsSchedBarrier(MI)) + if (isSchedBarrier(MI)) + return true; + + if (HII->isSolo(MI)) + return true; + + if (MI->getOpcode() == Hexagon::A2_nop) return true; return false; } -// isLegalToPacketizeTogether: -// SUI is the current instruction that is out side of the current packet. -// SUJ is the current instruction inside the current packet against which that -// SUI will be packetized. -bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { - MachineInstr *I = SUI->getInstr(); - MachineInstr *J = SUJ->getInstr(); - assert(I && J && "Unable to packetize null instruction!"); - const MCInstrDesc &MCIDI = I->getDesc(); - const MCInstrDesc &MCIDJ = J->getDesc(); +// Quick check if instructions MI and MJ cannot coexist in the same packet. +// Limit the tests to be "one-way", e.g. "if MI->isBranch and MJ->isInlineAsm", +// but not the symmetric case: "if MJ->isBranch and MI->isInlineAsm". +// For full test call this function twice: +// cannotCoexistAsymm(MI, MJ) || cannotCoexistAsymm(MJ, MI) +// Doing the test only one way saves the amount of code in this function, +// since every test would need to be repeated with the MI and MJ reversed. +static bool cannotCoexistAsymm(const MachineInstr *MI, const MachineInstr *MJ, + const HexagonInstrInfo &HII) { + const MachineFunction *MF = MI->getParent()->getParent(); + if (MF->getSubtarget<HexagonSubtarget>().hasV60TOpsOnly() && + HII.isHVXMemWithAIndirect(MI, MJ)) + return true; - MachineBasicBlock::iterator II = I; + // An inline asm cannot be together with a branch, because we may not be + // able to remove the asm out after packetizing (i.e. if the asm must be + // moved past the bundle). Similarly, two asms cannot be together to avoid + // complications when determining their relative order outside of a bundle. + if (MI->isInlineAsm()) + return MJ->isInlineAsm() || MJ->isBranch() || MJ->isBarrier() || + MJ->isCall() || MJ->isTerminator(); - const unsigned FrameSize = MF.getFrameInfo()->getStackSize(); - const HexagonRegisterInfo *QRI = - (const HexagonRegisterInfo *)MF.getSubtarget().getRegisterInfo(); - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; + // "False" really means that the quick check failed to determine if + // I and J cannot coexist. 
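cannotCoexistAsymm above deliberately encodes each exclusion rule one-way; the symmetric answer is obtained by calling it twice with the operands swapped, which is exactly what the cannotCoexist wrapper does. Below is a standalone sketch of the same pattern with toy instruction kinds; Kind and the single rule shown are illustrative only.

// Standalone sketch (toy instruction kinds, not LLVM API) of the asymmetric
// test pattern: write each exclusion rule one-way, then obtain the symmetric
// check by calling the helper twice with the operands swapped.
#include <iostream>

enum class Kind { InlineAsm, Branch, Store, Alu };

static bool cannotCoexistAsymm(Kind A, Kind B) {
  // One-way rule: inline asm cannot sit in a packet with a branch or
  // with another inline asm.
  if (A == Kind::InlineAsm)
    return B == Kind::Branch || B == Kind::InlineAsm;
  return false;
}

static bool cannotCoexist(Kind A, Kind B) {
  return cannotCoexistAsymm(A, B) || cannotCoexistAsymm(B, A);
}

int main() {
  std::cout << cannotCoexist(Kind::Branch, Kind::InlineAsm) << '\n'; // 1
  std::cout << cannotCoexist(Kind::Alu, Kind::Store) << '\n';        // 0
}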
+ return false; +} - // Inline asm cannot go in the packet. - if (I->getOpcode() == Hexagon::INLINEASM) - llvm_unreachable("Should not meet inline asm here!"); - if (isSoloInstruction(I)) - llvm_unreachable("Should not meet solo instr here!"); +// Full, symmetric check. +bool HexagonPacketizerList::cannotCoexist(const MachineInstr *MI, + const MachineInstr *MJ) { + return cannotCoexistAsymm(MI, MJ, *HII) || cannotCoexistAsymm(MJ, MI, *HII); +} - // A save callee-save register function call can only be in a packet - // with instructions that don't write to the callee-save registers. - if ((QII->isSaveCalleeSavedRegsCall(I) && - DoesModifyCalleeSavedReg(J, QRI)) || - (QII->isSaveCalleeSavedRegsCall(J) && - DoesModifyCalleeSavedReg(I, QRI))) { - Dependence = true; - return false; +void HexagonPacketizerList::unpacketizeSoloInstrs(MachineFunction &MF) { + for (auto &B : MF) { + MachineBasicBlock::iterator BundleIt; + MachineBasicBlock::instr_iterator NextI; + for (auto I = B.instr_begin(), E = B.instr_end(); I != E; I = NextI) { + NextI = std::next(I); + MachineInstr *MI = &*I; + if (MI->isBundle()) + BundleIt = I; + if (!MI->isInsideBundle()) + continue; + + // Decide on where to insert the instruction that we are pulling out. + // Debug instructions always go before the bundle, but the placement of + // INLINE_ASM depends on potential dependencies. By default, try to + // put it before the bundle, but if the asm writes to a register that + // other instructions in the bundle read, then we need to place it + // after the bundle (to preserve the bundle semantics). + bool InsertBeforeBundle; + if (MI->isInlineAsm()) + InsertBeforeBundle = !hasWriteToReadDep(MI, BundleIt, HRI); + else if (MI->isDebugValue()) + InsertBeforeBundle = true; + else + continue; + + BundleIt = moveInstrOut(MI, BundleIt, InsertBeforeBundle); + } } +} - // Two control flow instructions cannot go in the same packet. - if (IsControlFlow(I) && IsControlFlow(J)) { - Dependence = true; - return false; +// Check if a given instruction is of class "system". +static bool isSystemInstr(const MachineInstr *MI) { + unsigned Opc = MI->getOpcode(); + switch (Opc) { + case Hexagon::Y2_barrier: + case Hexagon::Y2_dcfetchbo: + return true; } + return false; +} - // A LoopN instruction cannot appear in the same packet as a jump or call. - if (IsLoopN(I) && - (IsDirectJump(J) || MCIDJ.isCall() || QII->isDeallocRet(J))) { - Dependence = true; +bool HexagonPacketizerList::hasDeadDependence(const MachineInstr *I, + const MachineInstr *J) { + // The dependence graph may not include edges between dead definitions, + // so without extra checks, we could end up packetizing two instruction + // defining the same (dead) register. 
+ if (I->isCall() || J->isCall()) return false; - } - if (IsLoopN(J) && - (IsDirectJump(I) || MCIDI.isCall() || QII->isDeallocRet(I))) { - Dependence = true; + if (HII->isPredicated(I) || HII->isPredicated(J)) return false; + + BitVector DeadDefs(Hexagon::NUM_TARGET_REGS); + for (auto &MO : I->operands()) { + if (!MO.isReg() || !MO.isDef() || !MO.isDead()) + continue; + DeadDefs[MO.getReg()] = true; } + for (auto &MO : J->operands()) { + if (!MO.isReg() || !MO.isDef() || !MO.isDead()) + continue; + unsigned R = MO.getReg(); + if (R != Hexagon::USR_OVF && DeadDefs[R]) + return true; + } + return false; +} + +bool HexagonPacketizerList::hasControlDependence(const MachineInstr *I, + const MachineInstr *J) { + // A save callee-save register function call can only be in a packet + // with instructions that don't write to the callee-save registers. + if ((HII->isSaveCalleeSavedRegsCall(I) && + doesModifyCalleeSavedReg(J, HRI)) || + (HII->isSaveCalleeSavedRegsCall(J) && + doesModifyCalleeSavedReg(I, HRI))) + return true; + + // Two control flow instructions cannot go in the same packet. + if (isControlFlow(I) && isControlFlow(J)) + return true; + + // \ref-manual (7.3.4) A loop setup packet in loopN or spNloop0 cannot + // contain a speculative indirect jump, + // a new-value compare jump or a dealloc_return. + auto isBadForLoopN = [this] (const MachineInstr *MI) -> bool { + if (MI->isCall() || HII->isDeallocRet(MI) || HII->isNewValueJump(MI)) + return true; + if (HII->isPredicated(MI) && HII->isPredicatedNew(MI) && HII->isJumpR(MI)) + return true; + return false; + }; + + if (HII->isLoopN(I) && isBadForLoopN(J)) + return true; + if (HII->isLoopN(J) && isBadForLoopN(I)) + return true; + // dealloc_return cannot appear in the same packet as a conditional or // unconditional jump. - if (QII->isDeallocRet(I) && - (MCIDJ.isBranch() || MCIDJ.isCall() || MCIDJ.isBarrier())) { - Dependence = true; - return false; + return HII->isDeallocRet(I) && + (J->isBranch() || J->isCall() || J->isBarrier()); +} + +bool HexagonPacketizerList::hasV4SpecificDependence(const MachineInstr *I, + const MachineInstr *J) { + bool SysI = isSystemInstr(I), SysJ = isSystemInstr(J); + bool StoreI = I->mayStore(), StoreJ = J->mayStore(); + if ((SysI && StoreJ) || (SysJ && StoreI)) + return true; + + if (StoreI && StoreJ) { + if (HII->isNewValueInst(J) || HII->isMemOp(J) || HII->isMemOp(I)) + return true; + } else { + // A memop cannot be in the same packet with another memop or a store. + // Two stores can be together, but here I and J cannot both be stores. + bool MopStI = HII->isMemOp(I) || StoreI; + bool MopStJ = HII->isMemOp(J) || StoreJ; + if (MopStI && MopStJ) + return true; } + return (StoreJ && HII->isDeallocRet(I)) || (StoreI && HII->isDeallocRet(J)); +} - // V4 allows dual store. But does not allow second store, if the - // first store is not in SLOT0. New value store, new value jump, - // dealloc_return and memop always take SLOT0. - // Arch spec 3.4.4.2 - if (MCIDI.mayStore() && MCIDJ.mayStore() && - (QII->isNewValueInst(J) || QII->isMemOp(J) || QII->isMemOp(I))) { - Dependence = true; +// SUI is the current instruction that is out side of the current packet. +// SUJ is the current instruction inside the current packet against which that +// SUI will be packetized. 
+bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { + MachineInstr *I = SUI->getInstr(); + MachineInstr *J = SUJ->getInstr(); + assert(I && J && "Unable to packetize null instruction!"); + + // Clear IgnoreDepMIs when Packet starts. + if (CurrentPacketMIs.size() == 1) + IgnoreDepMIs.clear(); + + MachineBasicBlock::iterator II = I; + const unsigned FrameSize = MF.getFrameInfo()->getStackSize(); + + // Solo instructions cannot go in the packet. + assert(!isSoloInstruction(I) && "Unexpected solo instr!"); + + if (cannotCoexist(I, J)) return false; - } - if ((QII->isMemOp(J) && MCIDI.mayStore()) - || (MCIDJ.mayStore() && QII->isMemOp(I)) - || (QII->isMemOp(J) && QII->isMemOp(I))) { - Dependence = true; + Dependence = hasDeadDependence(I, J) || hasControlDependence(I, J); + if (Dependence) return false; - } - //if dealloc_return - if (MCIDJ.mayStore() && QII->isDeallocRet(I)) { - Dependence = true; + // V4 allows dual stores. It does not allow second store, if the first + // store is not in SLOT0. New value store, new value jump, dealloc_return + // and memop always take SLOT0. Arch spec 3.4.4.2. + Dependence = hasV4SpecificDependence(I, J); + if (Dependence) return false; - } // If an instruction feeds new value jump, glue it. MachineBasicBlock::iterator NextMII = I; ++NextMII; - if (NextMII != I->getParent()->end() && QII->isNewValueJump(NextMII)) { + if (NextMII != I->getParent()->end() && HII->isNewValueJump(NextMII)) { MachineInstr *NextMI = NextMII; bool secondRegMatch = false; - bool maintainNewValueJump = false; + const MachineOperand &NOp0 = NextMI->getOperand(0); + const MachineOperand &NOp1 = NextMI->getOperand(1); - if (NextMI->getOperand(1).isReg() && - I->getOperand(0).getReg() == NextMI->getOperand(1).getReg()) { + if (NOp1.isReg() && I->getOperand(0).getReg() == NOp1.getReg()) secondRegMatch = true; - maintainNewValueJump = true; - } - - if (!secondRegMatch && - I->getOperand(0).getReg() == NextMI->getOperand(0).getReg()) { - maintainNewValueJump = true; - } - for (std::vector<MachineInstr*>::iterator - VI = CurrentPacketMIs.begin(), - VE = CurrentPacketMIs.end(); - (VI != VE && maintainNewValueJump); ++VI) { - SUnit *PacketSU = MIToSUnit.find(*VI)->second; - - // NVJ can not be part of the dual jump - Arch Spec: section 7.8 - if (PacketSU->getInstr()->getDesc().isCall()) { + for (auto I : CurrentPacketMIs) { + SUnit *PacketSU = MIToSUnit.find(I)->second; + MachineInstr *PI = PacketSU->getInstr(); + // NVJ can not be part of the dual jump - Arch Spec: section 7.8. + if (PI->isCall()) { Dependence = true; break; } - // Validate + // Validate: // 1. Packet does not have a store in it. // 2. If the first operand of the nvj is newified, and the second // operand is also a reg, it (second reg) is not defined in @@ -1108,302 +1184,413 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) { // 3. If the second operand of the nvj is newified, (which means // first operand is also a reg), first reg is not defined in // the same packet. - if (PacketSU->getInstr()->getDesc().mayStore() || - PacketSU->getInstr()->getOpcode() == Hexagon::S2_allocframe || - // Check #2. - (!secondRegMatch && NextMI->getOperand(1).isReg() && - PacketSU->getInstr()->modifiesRegister( - NextMI->getOperand(1).getReg(), QRI)) || - // Check #3. 
- (secondRegMatch && - PacketSU->getInstr()->modifiesRegister( - NextMI->getOperand(0).getReg(), QRI))) { + if (PI->getOpcode() == Hexagon::S2_allocframe || PI->mayStore() || + HII->isLoopN(PI)) { + Dependence = true; + break; + } + // Check #2/#3. + const MachineOperand &OpR = secondRegMatch ? NOp0 : NOp1; + if (OpR.isReg() && PI->modifiesRegister(OpR.getReg(), HRI)) { Dependence = true; break; } } - if (!Dependence) - GlueToNewValueJump = true; - else + + if (Dependence) return false; + GlueToNewValueJump = true; } - if (SUJ->isSucc(SUI)) { - for (unsigned i = 0; - (i < SUJ->Succs.size()) && !FoundSequentialDependence; - ++i) { + // There no dependency between a prolog instruction and its successor. + if (!SUJ->isSucc(SUI)) + return true; - if (SUJ->Succs[i].getSUnit() != SUI) { - continue; - } + for (unsigned i = 0; i < SUJ->Succs.size(); ++i) { + if (FoundSequentialDependence) + break; - SDep::Kind DepType = SUJ->Succs[i].getKind(); + if (SUJ->Succs[i].getSUnit() != SUI) + continue; - // For direct calls: - // Ignore register dependences for call instructions for - // packetization purposes except for those due to r31 and - // predicate registers. - // - // For indirect calls: - // Same as direct calls + check for true dependences to the register - // used in the indirect call. - // - // We completely ignore Order dependences for call instructions - // - // For returns: - // Ignore register dependences for return instructions like jumpr, - // dealloc return unless we have dependencies on the explicit uses - // of the registers used by jumpr (like r31) or dealloc return - // (like r29 or r30). - // - // TODO: Currently, jumpr is handling only return of r31. So, the - // following logic (specificaly IsCallDependent) is working fine. - // We need to enable jumpr for register other than r31 and then, - // we need to rework the last part, where it handles indirect call - // of that (IsCallDependent) function. Bug 6216 is opened for this. - // - unsigned DepReg = 0; - const TargetRegisterClass* RC = nullptr; - if (DepType == SDep::Data) { - DepReg = SUJ->Succs[i].getReg(); - RC = QRI->getMinimalPhysRegClass(DepReg); - } - if ((MCIDI.isCall() || MCIDI.isReturn()) && - (!IsRegDependence(DepType) || - !IsCallDependent(I, DepType, SUJ->Succs[i].getReg()))) { - /* do nothing */ - } + SDep::Kind DepType = SUJ->Succs[i].getKind(); + // For direct calls: + // Ignore register dependences for call instructions for packetization + // purposes except for those due to r31 and predicate registers. + // + // For indirect calls: + // Same as direct calls + check for true dependences to the register + // used in the indirect call. + // + // We completely ignore Order dependences for call instructions. + // + // For returns: + // Ignore register dependences for return instructions like jumpr, + // dealloc return unless we have dependencies on the explicit uses + // of the registers used by jumpr (like r31) or dealloc return + // (like r29 or r30). + // + // TODO: Currently, jumpr is handling only return of r31. So, the + // following logic (specificaly isCallDependent) is working fine. + // We need to enable jumpr for register other than r31 and then, + // we need to rework the last part, where it handles indirect call + // of that (isCallDependent) function. Bug 6216 is opened for this. 
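[Editorial aside] The long comment above compresses a fairly subtle rule: for calls and returns, only a handful of register dependences actually matter for packetization. The following rough sketch illustrates just that filter; the register numbers, the predicate-register test, and the omission of the indirect-call and dealloc_return (r29/r30) cases are simplifying assumptions, not the real isCallDependent implementation.

enum class DepKind { Data, Anti, Output, Order };

// Placeholder register numbering; the real pass queries HexagonRegisterInfo.
constexpr unsigned LinkReg = 31;                                    // r31, return address
constexpr bool isPredReg(unsigned R) { return R >= 96 && R <= 99; } // p0..p3 stand-ins

// Keep a call/return out of the packet only for register dependences on the
// link register or a predicate register; order dependences and all other
// register dependences are ignored for packetization purposes.
bool callDependenceMatters(DepKind Kind, unsigned DepReg) {
  if (Kind == DepKind::Order)
    return false;
  return DepReg == LinkReg || isPredReg(DepReg);
}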
+ unsigned DepReg = 0; + const TargetRegisterClass *RC = nullptr; + if (DepType == SDep::Data) { + DepReg = SUJ->Succs[i].getReg(); + RC = HRI->getMinimalPhysRegClass(DepReg); + } - // For instructions that can be promoted to dot-new, try to promote. - else if ((DepType == SDep::Data) && - CanPromoteToDotNew(I, SUJ, DepReg, MIToSUnit, II, RC) && - PromoteToDotNew(I, DepType, II, RC)) { - PromotedToDotNew = true; - /* do nothing */ - } + if (I->isCall() || I->isReturn()) { + if (!isRegDependence(DepType)) + continue; + if (!isCallDependent(I, DepType, SUJ->Succs[i].getReg())) + continue; + } - else if ((DepType == SDep::Data) && - (QII->isNewValueJump(I))) { - /* do nothing */ - } + if (DepType == SDep::Data) { + if (canPromoteToDotCur(J, SUJ, DepReg, II, RC)) + if (promoteToDotCur(J, DepType, II, RC)) + continue; + } - // For predicated instructions, if the predicates are complements - // then there can be no dependence. - else if (QII->isPredicated(I) && - QII->isPredicated(J) && - ArePredicatesComplements(I, J, MIToSUnit)) { - /* do nothing */ + // Data dpendence ok if we have load.cur. + if (DepType == SDep::Data && HII->isDotCurInst(J)) { + if (HII->isV60VectorInstruction(I)) + continue; + } + // For instructions that can be promoted to dot-new, try to promote. + if (DepType == SDep::Data) { + if (canPromoteToDotNew(I, SUJ, DepReg, II, RC)) { + if (promoteToDotNew(I, DepType, II, RC)) { + PromotedToDotNew = true; + continue; + } } - else if (IsDirectJump(I) && - !MCIDJ.isBranch() && - !MCIDJ.isCall() && - (DepType == SDep::Order)) { - // Ignore Order dependences between unconditional direct branches - // and non-control-flow instructions - /* do nothing */ - } - else if (MCIDI.isConditionalBranch() && (DepType != SDep::Data) && - (DepType != SDep::Output)) { - // Ignore all dependences for jumps except for true and output - // dependences - /* do nothing */ - } - - // Ignore output dependences due to superregs. We can - // write to two different subregisters of R1:0 for instance - // in the same cycle - // + if (HII->isNewValueJump(I)) + continue; + } + // For predicated instructions, if the predicates are complements then + // there can be no dependence. + if (HII->isPredicated(I) && HII->isPredicated(J) && + arePredicatesComplements(I, J)) { + // Not always safe to do this translation. + // DAG Builder attempts to reduce dependence edges using transitive + // nature of dependencies. Here is an example: // - // Let the - // If neither I nor J defines DepReg, then this is a - // superfluous output dependence. The dependence must be of the - // form: - // R0 = ... - // R1 = ... - // and there is an output dependence between the two instructions - // with - // DepReg = D0 - // We want to ignore these dependences. - // Ideally, the dependence constructor should annotate such - // dependences. We can then avoid this relatively expensive check. + // r0 = tfr_pt ... (1) + // r0 = tfr_pf ... (2) + // r0 = tfr_pt ... (3) // - else if (DepType == SDep::Output) { - // DepReg is the register that's responsible for the dependence. - unsigned DepReg = SUJ->Succs[i].getReg(); + // There will be an output dependence between (1)->(2) and (2)->(3). + // However, there is no dependence edge between (1)->(3). This results + // in all 3 instructions going in the same packet. We ignore dependce + // only once to avoid this situation. 
+ auto Itr = std::find(IgnoreDepMIs.begin(), IgnoreDepMIs.end(), J); + if (Itr != IgnoreDepMIs.end()) { + Dependence = true; + return false; + } + IgnoreDepMIs.push_back(I); + continue; + } + + // Ignore Order dependences between unconditional direct branches + // and non-control-flow instructions. + if (isDirectJump(I) && !J->isBranch() && !J->isCall() && + DepType == SDep::Order) + continue; + + // Ignore all dependences for jumps except for true and output + // dependences. + if (I->isConditionalBranch() && DepType != SDep::Data && + DepType != SDep::Output) + continue; + + // Ignore output dependences due to superregs. We can write to two + // different subregisters of R1:0 for instance in the same cycle. + + // If neither I nor J defines DepReg, then this is a superfluous output + // dependence. The dependence must be of the form: + // R0 = ... + // R1 = ... + // and there is an output dependence between the two instructions with + // DepReg = D0. + // We want to ignore these dependences. Ideally, the dependence + // constructor should annotate such dependences. We can then avoid this + // relatively expensive check. + // + if (DepType == SDep::Output) { + // DepReg is the register that's responsible for the dependence. + unsigned DepReg = SUJ->Succs[i].getReg(); + + // Check if I and J really defines DepReg. + if (!I->definesRegister(DepReg) && !J->definesRegister(DepReg)) + continue; + FoundSequentialDependence = true; + break; + } - // Check if I and J really defines DepReg. - if (I->definesRegister(DepReg) || - J->definesRegister(DepReg)) { + // For Order dependences: + // 1. On V4 or later, volatile loads/stores can be packetized together, + // unless other rules prevent is. + // 2. Store followed by a load is not allowed. + // 3. Store followed by a store is only valid on V4 or later. + // 4. Load followed by any memory operation is allowed. + if (DepType == SDep::Order) { + if (!PacketizeVolatiles) { + bool OrdRefs = I->hasOrderedMemoryRef() || J->hasOrderedMemoryRef(); + if (OrdRefs) { FoundSequentialDependence = true; break; } } - - // We ignore Order dependences for - // 1. Two loads unless they are volatile. - // 2. Two stores in V4 unless they are volatile. - else if ((DepType == SDep::Order) && - !I->hasOrderedMemoryRef() && - !J->hasOrderedMemoryRef()) { - if (MCIDI.mayStore() && MCIDJ.mayStore()) { - /* do nothing */ - } - // store followed by store-- not OK on V2 - // store followed by load -- not OK on all (OK if addresses - // are not aliased) - // load followed by store -- OK on all - // load followed by load -- OK on all - else if ( !MCIDJ.mayStore()) { - /* do nothing */ - } - else { + // J is first, I is second. + bool LoadJ = J->mayLoad(), StoreJ = J->mayStore(); + bool LoadI = I->mayLoad(), StoreI = I->mayStore(); + if (StoreJ) { + // Two stores are only allowed on V4+. Load following store is never + // allowed. + if (LoadI) { FoundSequentialDependence = true; break; } - } - - // For V4, special case ALLOCFRAME. Even though there is dependency - // between ALLOCFRAME and subsequent store, allow it to be - // packetized in a same packet. This implies that the store is using - // caller's SP. Hence, offset needs to be updated accordingly. 
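[Editorial aside] The allocframe special case mentioned here (and implemented further down) is just a rebase of the store's immediate against the caller's stack pointer, plus the inverse adjustment if the packet is later pruned. A small sketch of that arithmetic, taking HEXAGON_LRFP_SIZE to be the 8-byte saved LR/FP pair as an assumption:

#include <cstdint>

constexpr int64_t LRFPSize = 8; // assumption: saved LR+FP pair, as in HEXAGON_LRFP_SIZE

// When a store is glued into the same packet as allocframe it still sees the
// caller's SP, so its offset is rebased downward by the new frame...
int64_t rebaseForAllocframe(int64_t Off, uint64_t FrameSize) {
  return Off - static_cast<int64_t>(FrameSize + LRFPSize);
}

// ...and restored to the original value if the gluing is undone (compare the
// GlueAllocframeStore handling in isLegalToPruneDependencies below).
int64_t undoAllocframeRebase(int64_t Off, uint64_t FrameSize) {
  return Off + static_cast<int64_t>(FrameSize + LRFPSize);
}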
- else if (DepType == SDep::Data - && J->getOpcode() == Hexagon::S2_allocframe - && (I->getOpcode() == Hexagon::S2_storerd_io - || I->getOpcode() == Hexagon::S2_storeri_io - || I->getOpcode() == Hexagon::S2_storerb_io) - && I->getOperand(0).getReg() == QRI->getStackRegister() - && QII->isValidOffset(I->getOpcode(), - I->getOperand(1).getImm() - - (FrameSize + HEXAGON_LRFP_SIZE))) - { - GlueAllocframeStore = true; - // Since this store is to be glued with allocframe in the same - // packet, it will use SP of the previous stack frame, i.e - // caller's SP. Therefore, we need to recalculate offset according - // to this change. - I->getOperand(1).setImm(I->getOperand(1).getImm() - - (FrameSize + HEXAGON_LRFP_SIZE)); - } - - // - // Skip over anti-dependences. Two instructions that are - // anti-dependent can share a packet - // - else if (DepType != SDep::Anti) { + } else if (!LoadJ || (!LoadI && !StoreI)) { + // If J is neither load nor store, assume a dependency. + // If J is a load, but I is neither, also assume a dependency. FoundSequentialDependence = true; break; } + // Store followed by store: not OK on V2. + // Store followed by load: not OK on all. + // Load followed by store: OK on all. + // Load followed by load: OK on all. + continue; } - if (FoundSequentialDependence) { - Dependence = true; - return false; + // For V4, special case ALLOCFRAME. Even though there is dependency + // between ALLOCFRAME and subsequent store, allow it to be packetized + // in a same packet. This implies that the store is using the caller's + // SP. Hence, offset needs to be updated accordingly. + if (DepType == SDep::Data && J->getOpcode() == Hexagon::S2_allocframe) { + unsigned Opc = I->getOpcode(); + switch (Opc) { + case Hexagon::S2_storerd_io: + case Hexagon::S2_storeri_io: + case Hexagon::S2_storerh_io: + case Hexagon::S2_storerb_io: + if (I->getOperand(0).getReg() == HRI->getStackRegister()) { + int64_t Imm = I->getOperand(1).getImm(); + int64_t NewOff = Imm - (FrameSize + HEXAGON_LRFP_SIZE); + if (HII->isValidOffset(Opc, NewOff)) { + GlueAllocframeStore = true; + // Since this store is to be glued with allocframe in the same + // packet, it will use SP of the previous stack frame, i.e. + // caller's SP. Therefore, we need to recalculate offset + // according to this change. + I->getOperand(1).setImm(NewOff); + continue; + } + } + default: + break; + } + } + + // Skip over anti-dependences. Two instructions that are anti-dependent + // can share a packet. + if (DepType != SDep::Anti) { + FoundSequentialDependence = true; + break; } } + if (FoundSequentialDependence) { + Dependence = true; + return false; + } + return true; } -// isLegalToPruneDependencies bool HexagonPacketizerList::isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) { MachineInstr *I = SUI->getInstr(); - assert(I && SUJ->getInstr() && "Unable to packetize null instruction!"); - - const unsigned FrameSize = MF.getFrameInfo()->getStackSize(); - - if (Dependence) { + MachineInstr *J = SUJ->getInstr(); + assert(I && J && "Unable to packetize null instruction!"); - // Check if the instruction was promoted to a dot-new. If so, demote it - // back into a dot-old. - if (PromotedToDotNew) { - DemoteToDotOld(I); - } + if (cannotCoexist(I, J)) + return false; - // Check if the instruction (must be a store) was glued with an Allocframe - // instruction. If so, restore its offset to its original value, i.e. use - // curent SP instead of caller's SP. 
- if (GlueAllocframeStore) { - I->getOperand(1).setImm(I->getOperand(1).getImm() + - FrameSize + HEXAGON_LRFP_SIZE); - } + if (!Dependence) + return true; - return false; + // Check if the instruction was promoted to a dot-new. If so, demote it + // back into a dot-old. + if (PromotedToDotNew) + demoteToDotOld(I); + + cleanUpDotCur(); + // Check if the instruction (must be a store) was glued with an allocframe + // instruction. If so, restore its offset to its original value, i.e. use + // current SP instead of caller's SP. + if (GlueAllocframeStore) { + unsigned FrameSize = MF.getFrameInfo()->getStackSize(); + MachineOperand &MOff = I->getOperand(1); + MOff.setImm(MOff.getImm() + FrameSize + HEXAGON_LRFP_SIZE); } - return true; + return false; } + MachineBasicBlock::iterator HexagonPacketizerList::addToPacket(MachineInstr *MI) { + MachineBasicBlock::iterator MII = MI; + MachineBasicBlock *MBB = MI->getParent(); + if (MI->isImplicitDef()) { + unsigned R = MI->getOperand(0).getReg(); + if (Hexagon::IntRegsRegClass.contains(R)) { + MCSuperRegIterator S(R, HRI, false); + MI->addOperand(MachineOperand::CreateReg(*S, true, true)); + } + return MII; + } + assert(ResourceTracker->canReserveResources(MI)); + + bool ExtMI = HII->isExtended(MI) || HII->isConstExtended(MI); + bool Good = true; + + if (GlueToNewValueJump) { + MachineInstr *NvjMI = ++MII; + // We need to put both instructions in the same packet: MI and NvjMI. + // Either of them can require a constant extender. Try to add both to + // the current packet, and if that fails, end the packet and start a + // new one. + ResourceTracker->reserveResources(MI); + if (ExtMI) + Good = tryAllocateResourcesForConstExt(true); + + bool ExtNvjMI = HII->isExtended(NvjMI) || HII->isConstExtended(NvjMI); + if (Good) { + if (ResourceTracker->canReserveResources(NvjMI)) + ResourceTracker->reserveResources(NvjMI); + else + Good = false; + } + if (Good && ExtNvjMI) + Good = tryAllocateResourcesForConstExt(true); - MachineBasicBlock::iterator MII = MI; - MachineBasicBlock *MBB = MI->getParent(); - - const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII; - - if (GlueToNewValueJump) { - - ++MII; - MachineInstr *nvjMI = MII; + if (!Good) { + endPacket(MBB, MI); assert(ResourceTracker->canReserveResources(MI)); ResourceTracker->reserveResources(MI); - if ((QII->isExtended(MI) || QII->isConstExtended(MI)) && - !tryAllocateResourcesForConstExt(MI)) { - endPacket(MBB, MI); - ResourceTracker->reserveResources(MI); - assert(canReserveResourcesForConstExt(MI) && - "Ensure that there is a slot"); - reserveResourcesForConstExt(MI); - // Reserve resources for new value jump constant extender. - assert(canReserveResourcesForConstExt(MI) && - "Ensure that there is a slot"); - reserveResourcesForConstExt(nvjMI); - assert(ResourceTracker->canReserveResources(nvjMI) && - "Ensure that there is a slot"); - - } else if ( // Extended instruction takes two slots in the packet. - // Try reserve and allocate 4-byte in the current packet first. - (QII->isExtended(nvjMI) - && (!tryAllocateResourcesForConstExt(nvjMI) - || !ResourceTracker->canReserveResources(nvjMI))) - || // For non-extended instruction, no need to allocate extra 4 bytes. - (!QII->isExtended(nvjMI) && - !ResourceTracker->canReserveResources(nvjMI))) - { - endPacket(MBB, MI); - // A new and empty packet starts. - // We are sure that the resources requirements can be satisfied. - // Therefore, do not need to call "canReserveResources" anymore. 
- ResourceTracker->reserveResources(MI); - if (QII->isExtended(nvjMI)) - reserveResourcesForConstExt(nvjMI); + if (ExtMI) { + assert(canReserveResourcesForConstExt()); + tryAllocateResourcesForConstExt(true); } - // Here, we are sure that "reserveResources" would succeed. - ResourceTracker->reserveResources(nvjMI); - CurrentPacketMIs.push_back(MI); - CurrentPacketMIs.push_back(nvjMI); - } else { - if ( (QII->isExtended(MI) || QII->isConstExtended(MI)) - && ( !tryAllocateResourcesForConstExt(MI) - || !ResourceTracker->canReserveResources(MI))) - { - endPacket(MBB, MI); - // Check if the instruction was promoted to a dot-new. If so, demote it - // back into a dot-old - if (PromotedToDotNew) { - DemoteToDotOld(MI); - } - reserveResourcesForConstExt(MI); + assert(ResourceTracker->canReserveResources(NvjMI)); + ResourceTracker->reserveResources(NvjMI); + if (ExtNvjMI) { + assert(canReserveResourcesForConstExt()); + reserveResourcesForConstExt(); } - // In case that "MI" is not an extended insn, - // the resource availability has already been checked. - ResourceTracker->reserveResources(MI); - CurrentPacketMIs.push_back(MI); } + CurrentPacketMIs.push_back(MI); + CurrentPacketMIs.push_back(NvjMI); return MII; + } + + ResourceTracker->reserveResources(MI); + if (ExtMI && !tryAllocateResourcesForConstExt(true)) { + endPacket(MBB, MI); + if (PromotedToDotNew) + demoteToDotOld(MI); + ResourceTracker->reserveResources(MI); + reserveResourcesForConstExt(); + } + + CurrentPacketMIs.push_back(MI); + return MII; +} + +void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB, + MachineInstr *MI) { + OldPacketMIs = CurrentPacketMIs; + VLIWPacketizerList::endPacket(MBB, MI); +} + +bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr *MI) { + return !producesStall(MI); +} + + +// Return true when ConsMI uses a register defined by ProdMI. +static bool isDependent(const MachineInstr *ProdMI, + const MachineInstr *ConsMI) { + if (!ProdMI->getOperand(0).isReg()) + return false; + unsigned DstReg = ProdMI->getOperand(0).getReg(); + + for (auto &Op : ConsMI->operands()) + if (Op.isReg() && Op.isUse() && Op.getReg() == DstReg) + // The MIs depend on each other. + return true; + + return false; } +// V60 forward scheduling. +bool HexagonPacketizerList::producesStall(const MachineInstr *I) { + // Check whether the previous packet is in a different loop. If this is the + // case, there is little point in trying to avoid a stall because that would + // favor the rare case (loop entry) over the common case (loop iteration). + // + // TODO: We should really be able to check all the incoming edges if this is + // the first packet in a basic block, so we can avoid stalls from the loop + // backedge. + if (!OldPacketMIs.empty()) { + auto *OldBB = OldPacketMIs.front()->getParent(); + auto *ThisBB = I->getParent(); + if (MLI->getLoopFor(OldBB) != MLI->getLoopFor(ThisBB)) + return false; + } + + // Check for stall between two vector instructions. + if (HII->isV60VectorInstruction(I)) { + for (auto J : OldPacketMIs) { + if (!HII->isV60VectorInstruction(J)) + continue; + if (isDependent(J, I) && !HII->isVecUsableNextPacket(J, I)) + return true; + } + return false; + } + + // Check for stall between two scalar instructions. First, check that + // there is no definition of a use in the current packet, because it + // may be a candidate for .new. 
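[Editorial aside] The stall heuristic shown just above boils down to: look at the previous packet, and if something there feeds the candidate instruction through a register without enough latency slack, predict a stall. Below is a stripped-down model of the vector half of that check; the instruction flags stand in for the HexagonInstrInfo hooks (isV60VectorInstruction, isVecUsableNextPacket) and collapse a per-pair latency query into a single producer flag, so this is an illustration of the shape of the rule, not the real implementation. The scalar branch that follows in the pass is analogous, with the BSB-scheduling and late/early feeding hooks in place of the vector ones.

#include <vector>

struct SimpleInsn {
  bool IsVector;                 // stands in for isV60VectorInstruction
  unsigned DefReg;               // register written (0 = none)
  std::vector<unsigned> UseRegs; // registers read
  bool UsableNextPacket;         // simplification of isVecUsableNextPacket
};

static bool feeds(const SimpleInsn &Prod, const SimpleInsn &Cons) {
  if (Prod.DefReg == 0)
    return false;
  for (unsigned R : Cons.UseRegs)
    if (R == Prod.DefReg)
      return true;
  return false;
}

// Predict a stall for a vector instruction I against the previous packet.
bool vectorStall(const SimpleInsn &I, const std::vector<SimpleInsn> &OldPacket) {
  if (!I.IsVector)
    return false;
  for (const SimpleInsn &J : OldPacket)
    if (J.IsVector && feeds(J, I) && !J.UsableNextPacket)
      return true;
  return false;
}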
+ for (auto J : CurrentPacketMIs) + if (!HII->isV60VectorInstruction(J) && isDependent(J, I)) + return false; + + // Check for stall between I and instructions in the previous packet. + if (MF.getSubtarget<HexagonSubtarget>().useBSBScheduling()) { + for (auto J : OldPacketMIs) { + if (HII->isV60VectorInstruction(J)) + continue; + if (!HII->isLateInstrFeedsEarlyInstr(J, I)) + continue; + if (isDependent(J, I) && !HII->canExecuteInBundle(J, I)) + return true; + } + } + + return false; +} + + //===----------------------------------------------------------------------===// // Public Constructor Functions //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h new file mode 100644 index 0000000..960cf6c --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h @@ -0,0 +1,114 @@ +#ifndef HEXAGONVLIWPACKETIZER_H +#define HEXAGONVLIWPACKETIZER_H + +#include "llvm/CodeGen/DFAPacketizer.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" + +namespace llvm { +class HexagonPacketizerList : public VLIWPacketizerList { + // Vector of instructions assigned to the packet that has just been created. + std::vector<MachineInstr*> OldPacketMIs; + + // Has the instruction been promoted to a dot-new instruction. + bool PromotedToDotNew; + + // Has the instruction been glued to allocframe. + bool GlueAllocframeStore; + + // Has the feeder instruction been glued to new value jump. + bool GlueToNewValueJump; + + // Check if there is a dependence between some instruction already in this + // packet and this instruction. + bool Dependence; + + // Only check for dependence if there are resources available to + // schedule this instruction. + bool FoundSequentialDependence; + + // Track MIs with ignored dependence. + std::vector<MachineInstr*> IgnoreDepMIs; + +protected: + /// \brief A handle to the branch probability pass. + const MachineBranchProbabilityInfo *MBPI; + const MachineLoopInfo *MLI; + +private: + const HexagonInstrInfo *HII; + const HexagonRegisterInfo *HRI; + +public: + // Ctor. + HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, + AliasAnalysis *AA, + const MachineBranchProbabilityInfo *MBPI); + + // initPacketizerState - initialize some internal flags. + void initPacketizerState() override; + + // ignorePseudoInstruction - Ignore bundling of pseudo instructions. + bool ignorePseudoInstruction(const MachineInstr *MI, + const MachineBasicBlock *MBB) override; + + // isSoloInstruction - return true if instruction MI can not be packetized + // with any other instruction, which means that MI itself is a packet. + bool isSoloInstruction(const MachineInstr *MI) override; + + // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ + // together. + bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override; + + // isLegalToPruneDependencies - Is it legal to prune dependece between SUI + // and SUJ. 
+ bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override; + + MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override; + void endPacket(MachineBasicBlock *MBB, MachineInstr *MI) override; + bool shouldAddToPacket(const MachineInstr *MI) override; + + void unpacketizeSoloInstrs(MachineFunction &MF); + +protected: + bool isCallDependent(const MachineInstr* MI, SDep::Kind DepType, + unsigned DepReg); + bool promoteToDotCur(MachineInstr* MI, SDep::Kind DepType, + MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC); + bool canPromoteToDotCur(const MachineInstr* MI, const SUnit* PacketSU, + unsigned DepReg, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC); + void cleanUpDotCur(); + + bool promoteToDotNew(MachineInstr* MI, SDep::Kind DepType, + MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC); + bool canPromoteToDotNew(const MachineInstr* MI, const SUnit* PacketSU, + unsigned DepReg, MachineBasicBlock::iterator &MII, + const TargetRegisterClass* RC); + bool canPromoteToNewValue(const MachineInstr* MI, const SUnit* PacketSU, + unsigned DepReg, MachineBasicBlock::iterator &MII); + bool canPromoteToNewValueStore(const MachineInstr* MI, + const MachineInstr* PacketMI, unsigned DepReg); + bool demoteToDotOld(MachineInstr* MI); + bool arePredicatesComplements(MachineInstr* MI1, MachineInstr* MI2); + bool restrictingDepExistInPacket(MachineInstr*, unsigned); + bool isNewifiable(const MachineInstr *MI); + bool isCurifiable(MachineInstr* MI); + bool cannotCoexist(const MachineInstr *MI, const MachineInstr *MJ); + inline bool isPromotedToDotNew() const { + return PromotedToDotNew; + } + bool tryAllocateResourcesForConstExt(bool Reserve); + bool canReserveResourcesForConstExt(); + void reserveResourcesForConstExt(); + bool hasDeadDependence(const MachineInstr *I, const MachineInstr *J); + bool hasControlDependence(const MachineInstr *I, const MachineInstr *J); + bool hasV4SpecificDependence(const MachineInstr *I, const MachineInstr *J); + bool producesStall(const MachineInstr *MI); +}; +} // namespace llvm +#endif // HEXAGONVLIWPACKETIZER_H + diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 99ea2fa..b73af82 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -13,7 +13,9 @@ #include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmLayout.h" #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCInstrInfo.h" @@ -33,14 +35,28 @@ class HexagonAsmBackend : public MCAsmBackend { mutable uint64_t relaxedCnt; std::unique_ptr <MCInstrInfo> MCII; std::unique_ptr <MCInst *> RelaxTarget; + MCInst * Extender; public: HexagonAsmBackend(Target const &T, uint8_t OSABI, StringRef CPU) : - OSABI(OSABI), MCII (T.createMCInstrInfo()), RelaxTarget(new MCInst *){} + OSABI(OSABI), MCII (T.createMCInstrInfo()), RelaxTarget(new MCInst *), + Extender(nullptr) {} MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createHexagonELFObjectWriter(OS, OSABI, CPU); } + void setExtender(MCContext &Context) const { + if (Extender == nullptr) + const_cast<HexagonAsmBackend *>(this)->Extender = new (Context) MCInst; + } + + MCInst *takeExtender() 
const { + assert(Extender != nullptr); + MCInst * Result = Extender; + const_cast<HexagonAsmBackend *>(this)->Extender = nullptr; + return Result; + } + unsigned getNumFixupKinds() const override { return Hexagon::NumTargetFixupKinds; } @@ -222,6 +238,7 @@ public: if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) { ++relaxedCnt; *RelaxTarget = &MCI; + setExtender(Layout.getAssembler().getContext()); return true; } else { return false; @@ -262,6 +279,7 @@ public: if (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_SIZE) { ++relaxedCnt; *RelaxTarget = &MCI; + setExtender(Layout.getAssembler().getContext()); return true; } } @@ -276,9 +294,35 @@ public: llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced"); } - void relaxInstruction(MCInst const & /*Inst*/, - MCInst & /*Res*/) const override { - llvm_unreachable("relaxInstruction() unimplemented"); + void relaxInstruction(MCInst const & Inst, + MCInst & Res) const override { + assert(HexagonMCInstrInfo::isBundle(Inst) && + "Hexagon relaxInstruction only works on bundles"); + + Res = HexagonMCInstrInfo::createBundle(); + // Copy the results into the bundle. + bool Update = false; + for (auto &I : HexagonMCInstrInfo::bundleInstructions(Inst)) { + MCInst &CrntHMI = const_cast<MCInst &>(*I.getInst()); + + // if immediate extender needed, add it in + if (*RelaxTarget == &CrntHMI) { + Update = true; + assert((HexagonMCInstrInfo::bundleSize(Res) < HEXAGON_PACKET_SIZE) && + "No room to insert extender for relaxation"); + + MCInst *HMIx = takeExtender(); + *HMIx = HexagonMCInstrInfo::deriveExtender( + *MCII, CrntHMI, + HexagonMCInstrInfo::getExtendableOperand(*MCII, CrntHMI)); + Res.addOperand(MCOperand::createInst(HMIx)); + *RelaxTarget = nullptr; + } + // now copy over the original instruction(the one we may have extended) + Res.addOperand(MCOperand::createInst(I.getInst())); + } + (void)Update; + assert(Update && "Didn't find relaxation target"); } bool writeNopData(uint64_t Count, diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index f4d162c..47a6f86 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -44,6 +44,25 @@ namespace HexagonII { TypeMEMOP = 9, TypeNV = 10, TypeDUPLEX = 11, + TypeCOMPOUND = 12, + TypeCVI_FIRST = 13, + TypeCVI_VA = TypeCVI_FIRST, + TypeCVI_VA_DV = 14, + TypeCVI_VX = 15, + TypeCVI_VX_DV = 16, + TypeCVI_VP = 17, + TypeCVI_VP_VS = 18, + TypeCVI_VS = 19, + TypeCVI_VINLANESAT= 20, + TypeCVI_VM_LD = 21, + TypeCVI_VM_TMP_LD = 22, + TypeCVI_VM_CUR_LD = 23, + TypeCVI_VM_VP_LDU = 24, + TypeCVI_VM_ST = 25, + TypeCVI_VM_NEW_ST = 26, + TypeCVI_VM_STU = 27, + TypeCVI_HIST = 28, + TypeCVI_LAST = TypeCVI_HIST, TypePREFIX = 30, // Such as extenders. TypeENDLOOP = 31 // Such as end of a HW loop. }; @@ -71,12 +90,16 @@ namespace HexagonII { PostInc = 6 // Post increment addressing mode }; + // MemAccessSize is represented as 1+log2(N) where N is size in bits. enum class MemAccessSize { NoMemAccess = 0, // Not a memory acces instruction. ByteAccess = 1, // Byte access instruction (memb). HalfWordAccess = 2, // Half word access instruction (memh). WordAccess = 3, // Word access instruction (memw). - DoubleWordAccess = 4 // Double word access instruction (memd) + DoubleWordAccess = 4, // Double word access instruction (memd) + // 5, // We do not have a 16 byte vector access. 
+ Vector64Access = 7, // 64 Byte vector access instruction (vmem). + Vector128Access = 8 // 128 Byte vector access instruction (vmem). }; // MCInstrDesc TSFlags @@ -156,7 +179,7 @@ namespace HexagonII { AddrModeMask = 0x7, // Access size for load/store instructions. MemAccessSizePos = 43, - MemAccesSizeMask = 0x7, + MemAccesSizeMask = 0xf, // Branch predicted taken. TakenPos = 47, @@ -164,7 +187,23 @@ namespace HexagonII { // Floating-point instructions. FPPos = 48, - FPMask = 0x1 + FPMask = 0x1, + + // New-Value producer-2 instructions. + hasNewValuePos2 = 50, + hasNewValueMask2 = 0x1, + + // Which operand consumes or produces a new value. + NewValueOpPos2 = 51, + NewValueOpMask2 = 0x7, + + // Accumulator instructions. + AccumulatorPos = 54, + AccumulatorMask = 0x1, + + // Complex XU, prevent xu competition by prefering slot3 + PrefersSlot3Pos = 55, + PrefersSlot3Mask = 0x1, }; // *** The code above must match HexagonInstrFormat*.td *** // @@ -219,6 +258,26 @@ namespace HexagonII { INST_PARSE_EXTENDER = 0x00000000 }; + enum InstIClassBits : unsigned { + INST_ICLASS_MASK = 0xf0000000, + INST_ICLASS_EXTENDER = 0x00000000, + INST_ICLASS_J_1 = 0x10000000, + INST_ICLASS_J_2 = 0x20000000, + INST_ICLASS_LD_ST_1 = 0x30000000, + INST_ICLASS_LD_ST_2 = 0x40000000, + INST_ICLASS_J_3 = 0x50000000, + INST_ICLASS_CR = 0x60000000, + INST_ICLASS_ALU32_1 = 0x70000000, + INST_ICLASS_XTYPE_1 = 0x80000000, + INST_ICLASS_LD = 0x90000000, + INST_ICLASS_ST = 0xa0000000, + INST_ICLASS_ALU32_2 = 0xb0000000, + INST_ICLASS_XTYPE_2 = 0xc0000000, + INST_ICLASS_XTYPE_3 = 0xd0000000, + INST_ICLASS_XTYPE_4 = 0xe0000000, + INST_ICLASS_ALU32_3 = 0xf0000000 + }; + } // End namespace HexagonII. } // End namespace llvm. diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp index 36f8146..06ccec5 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp @@ -12,13 +12,13 @@ //===----------------------------------------------------------------------===// #include "HexagonAsmPrinter.h" -#include "Hexagon.h" #include "HexagonInstPrinter.h" +#include "MCTargetDesc/HexagonBaseInfo.h" #include "MCTargetDesc/HexagonMCInstrInfo.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -28,104 +28,33 @@ using namespace llvm; #define GET_INSTRUCTION_NAME #include "HexagonGenAsmWriter.inc" -HexagonAsmInstPrinter::HexagonAsmInstPrinter(MCInstPrinter *RawPrinter) - : MCInstPrinter(*RawPrinter), RawPrinter(RawPrinter) {} - -void HexagonAsmInstPrinter::printInst(MCInst const *MI, raw_ostream &O, - StringRef Annot, - MCSubtargetInfo const &STI) { - assert(HexagonMCInstrInfo::isBundle(*MI)); - assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE); - std::string Buffer; - { - raw_string_ostream TempStream(Buffer); - RawPrinter->printInst(MI, TempStream, "", STI); - } - StringRef Contents(Buffer); - auto PacketBundle = Contents.rsplit('\n'); - auto HeadTail = PacketBundle.first.split('\n'); - auto Preamble = "\t{\n\t\t"; - auto Separator = ""; - while(!HeadTail.first.empty()) { - O << Separator; - StringRef Inst; - auto Duplex = HeadTail.first.split('\v'); - if(!Duplex.second.empty()){ - O << Duplex.first << "\n"; - Inst = Duplex.second; - } - else - Inst = Duplex.first; - O << 
Preamble; - O << Inst; - HeadTail = HeadTail.second.split('\n'); - Preamble = ""; - Separator = "\n\t\t"; - } - O << "\n\t}" << PacketBundle.second; -} - -void HexagonAsmInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { - RawPrinter->printRegName(O, RegNo); -} - -// Return the minimum value that a constant extendable operand can have -// without being extended. -static int getMinValue(uint64_t TSFlags) { - unsigned isSigned = - (TSFlags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask; - unsigned bits = - (TSFlags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask; - - if (isSigned) - return -1U << (bits - 1); - - return 0; -} - -// Return the maximum value that a constant extendable operand can have -// without being extended. -static int getMaxValue(uint64_t TSFlags) { - unsigned isSigned = - (TSFlags >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask; - unsigned bits = - (TSFlags >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask; - - if (isSigned) - return ~(-1U << (bits - 1)); - - return ~(-1U << bits); -} - -// Return true if the instruction must be extended. -static bool isExtended(uint64_t TSFlags) { - return (TSFlags >> HexagonII::ExtendedPos) & HexagonII::ExtendedMask; -} - -// Currently just used in an assert statement -static bool isExtendable(uint64_t TSFlags) LLVM_ATTRIBUTE_UNUSED; -// Return true if the instruction may be extended based on the operand value. -static bool isExtendable(uint64_t TSFlags) { - return (TSFlags >> HexagonII::ExtendablePos) & HexagonII::ExtendableMask; +HexagonInstPrinter::HexagonInstPrinter(MCAsmInfo const &MAI, + MCInstrInfo const &MII, + MCRegisterInfo const &MRI) + : MCInstPrinter(MAI, MII, MRI), MII(MII), HasExtender(false) { } StringRef HexagonInstPrinter::getOpcodeName(unsigned Opcode) const { return MII.getName(Opcode); } -void HexagonInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - OS << getRegisterName(RegNo); +void HexagonInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { + O << getRegName(RegNo); +} + +StringRef HexagonInstPrinter::getRegName(unsigned RegNo) const { + return getRegisterName(RegNo); } void HexagonInstPrinter::setExtender(MCInst const &MCI) { HasExtender = HexagonMCInstrInfo::isImmext(MCI); } -void HexagonInstPrinter::printInst(MCInst const *MI, raw_ostream &OS, - StringRef Annot, - MCSubtargetInfo const &STI) { +void HexagonInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot, const MCSubtargetInfo &STI) { assert(HexagonMCInstrInfo::isBundle(*MI)); assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE); + assert(HexagonMCInstrInfo::bundleSize(*MI) > 0); HasExtender = false; for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MI)) { MCInst const &MCI = *I.getInst(); @@ -157,145 +86,148 @@ void HexagonInstPrinter::printInst(MCInst const *MI, raw_ostream &OS, } } -void HexagonInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { - const MCOperand& MO = MI->getOperand(OpNo); - + if (HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo && + (HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI))) + O << "#"; + MCOperand const &MO = MI->getOperand(OpNo); if (MO.isReg()) { - printRegName(O, MO.getReg()); - } else if(MO.isExpr()) { - MO.getExpr()->print(O, &MAI); - } else if(MO.isImm()) { - printImmOperand(MI, OpNo, O); - } else { - llvm_unreachable("Unknown operand"); - } -} - -void 
HexagonInstPrinter::printImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { - const MCOperand& MO = MI->getOperand(OpNo); - - if(MO.isExpr()) { - MO.getExpr()->print(O, &MAI); - } else if(MO.isImm()) { - O << MI->getOperand(OpNo).getImm(); + O << getRegisterName(MO.getReg()); + } else if (MO.isExpr()) { + int64_t Value; + if (MO.getExpr()->evaluateAsAbsolute(Value)) + O << formatImm(Value); + else + O << *MO.getExpr(); } else { llvm_unreachable("Unknown operand"); } } -void HexagonInstPrinter::printExtOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printExtOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { - const MCOperand &MO = MI->getOperand(OpNo); - const MCInstrDesc &MII = getMII().get(MI->getOpcode()); - - assert((isExtendable(MII.TSFlags) || isExtended(MII.TSFlags)) && - "Expecting an extendable operand"); - - if (MO.isExpr() || isExtended(MII.TSFlags)) { - O << "#"; - } else if (MO.isImm()) { - int ImmValue = MO.getImm(); - if (ImmValue < getMinValue(MII.TSFlags) || - ImmValue > getMaxValue(MII.TSFlags)) - O << "#"; - } printOperand(MI, OpNo, O); } -void HexagonInstPrinter::printUnsignedImmOperand(const MCInst *MI, - unsigned OpNo, raw_ostream &O) const { +void HexagonInstPrinter::printUnsignedImmOperand(MCInst const *MI, + unsigned OpNo, + raw_ostream &O) const { O << MI->getOperand(OpNo).getImm(); } -void HexagonInstPrinter::printNegImmOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printNegImmOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { O << -MI->getOperand(OpNo).getImm(); } -void HexagonInstPrinter::printNOneImmOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printNOneImmOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { O << -1; } -void HexagonInstPrinter::printMEMriOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { - const MCOperand& MO0 = MI->getOperand(OpNo); - const MCOperand& MO1 = MI->getOperand(OpNo + 1); +void HexagonInstPrinter::prints3_6ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + int64_t Imm; + bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm); + Imm = SignExtend64<9>(Imm); + assert(Success); (void)Success; + assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO."); + O << formatImm(Imm/64); +} - printRegName(O, MO0.getReg()); - O << " + #" << MO1.getImm(); +void HexagonInstPrinter::prints3_7ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + int64_t Imm; + bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm); + Imm = SignExtend64<10>(Imm); + assert(Success); (void)Success; + assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO."); + O << formatImm(Imm/128); } -void HexagonInstPrinter::printFrameIndexOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { - const MCOperand& MO0 = MI->getOperand(OpNo); - const MCOperand& MO1 = MI->getOperand(OpNo + 1); +void HexagonInstPrinter::prints4_6ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + int64_t Imm; + bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm); + Imm = SignExtend64<10>(Imm); + assert(Success); (void)Success; + assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO."); + O << formatImm(Imm/64); +} - printRegName(O, MO0.getReg()); - O << ", #" << MO1.getImm(); +void HexagonInstPrinter::prints4_7ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + int64_t Imm; + bool Success = 
MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm); + Imm = SignExtend64<11>(Imm); + assert(Success); (void)Success; + assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO."); + O << formatImm(Imm/128); } -void HexagonInstPrinter::printGlobalOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printGlobalOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { - assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); - printOperand(MI, OpNo, O); } -void HexagonInstPrinter::printJumpTable(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printJumpTable(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); printOperand(MI, OpNo, O); } -void HexagonInstPrinter::printConstantPool(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printConstantPool(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { assert(MI->getOperand(OpNo).isExpr() && "Expecting expression"); printOperand(MI, OpNo, O); } -void HexagonInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printBranchOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { // Branches can take an immediate operand. This is used by the branch // selection pass to print $+8, an eight byte displacement from the PC. llvm_unreachable("Unknown branch operand."); } -void HexagonInstPrinter::printCallOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { -} +void HexagonInstPrinter::printCallOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const {} -void HexagonInstPrinter::printAbsAddrOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { -} +void HexagonInstPrinter::printAbsAddrOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const {} -void HexagonInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { -} +void HexagonInstPrinter::printPredicateOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const {} -void HexagonInstPrinter::printSymbol(const MCInst *MI, unsigned OpNo, +void HexagonInstPrinter::printSymbol(MCInst const *MI, unsigned OpNo, raw_ostream &O, bool hi) const { - assert(MI->getOperand(OpNo).isImm() && "Unknown symbol operand"); + MCOperand const &MO = MI->getOperand(OpNo); - O << '#' << (hi ? "HI" : "LO") << "(#"; - printOperand(MI, OpNo, O); + O << '#' << (hi ? 
"HI" : "LO") << '('; + if (MO.isImm()) { + O << '#'; + printOperand(MI, OpNo, O); + } else { + printOperand(MI, OpNo, O); + assert("Unknown symbol operand"); + } O << ')'; } -void HexagonInstPrinter::printExtBrtarget(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const { - const MCOperand &MO = MI->getOperand(OpNo); - const MCInstrDesc &MII = getMII().get(MI->getOpcode()); - - assert((isExtendable(MII.TSFlags) || isExtended(MII.TSFlags)) && - "Expecting an extendable operand"); - - if (MO.isExpr() || isExtended(MII.TSFlags)) { - O << "##"; +void HexagonInstPrinter::printBrtarget(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const { + MCOperand const &MO = MI->getOperand(OpNo); + assert (MO.isExpr()); + MCExpr const &Expr = *MO.getExpr(); + int64_t Value; + if (Expr.evaluateAsAbsolute(Value)) + O << format("0x%" PRIx64, Value); + else { + if (HasExtender || HexagonMCInstrInfo::isConstExtended(MII, *MI)) + if (HexagonMCInstrInfo::getExtendableOp(MII, *MI) == OpNo) + O << "##"; + O << Expr; } - printOperand(MI, OpNo, O); } diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h index 534ac23..5f42118 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// // -// This class prints an Hexagon MCInst to a .s file. // //===----------------------------------------------------------------------===// @@ -15,17 +14,8 @@ #define LLVM_LIB_TARGET_HEXAGON_INSTPRINTER_HEXAGONINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCInstrInfo.h" namespace llvm { -class HexagonAsmInstPrinter : public MCInstPrinter { -public: - HexagonAsmInstPrinter(MCInstPrinter *RawPrinter); - void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot, - MCSubtargetInfo const &STI) override; - void printRegName(raw_ostream &O, unsigned RegNo) const override; - std::unique_ptr<MCInstPrinter> RawPrinter; -}; /// Prints bundles as a newline separated list of individual instructions /// Duplexes are separated by a vertical tab \v character /// A trailing line includes bundle properties such as endloop0/1 @@ -33,68 +23,69 @@ public: /// r0 = add(r1, r2) /// r0 = #0 \v jump 0x0 /// :endloop0 :endloop1 - class HexagonInstPrinter : public MCInstPrinter { - public: - explicit HexagonInstPrinter(MCAsmInfo const &MAI, - MCInstrInfo const &MII, - MCRegisterInfo const &MRI) - : MCInstPrinter(MAI, MII, MRI), MII(MII) {} - - void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot, - const MCSubtargetInfo &STI) override; - virtual StringRef getOpcodeName(unsigned Opcode) const; - void printInstruction(const MCInst *MI, raw_ostream &O); - void printRegName(raw_ostream &OS, unsigned RegNo) const override; - static const char *getRegisterName(unsigned RegNo); +class HexagonInstPrinter : public MCInstPrinter { +public: + explicit HexagonInstPrinter(MCAsmInfo const &MAI, MCInstrInfo const &MII, + MCRegisterInfo const &MRI); + void printInst(MCInst const *MI, raw_ostream &O, StringRef Annot, + const MCSubtargetInfo &STI) override; + virtual StringRef getOpcodeName(unsigned Opcode) const; + void printInstruction(MCInst const *MI, raw_ostream &O); - void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printExtOperand(const 
MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printUnsignedImmOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const; - void printNegImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printNOneImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printMEMriOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printFrameIndexOperand(const MCInst *MI, unsigned OpNo, - raw_ostream &O) const; - void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printCallOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printAbsAddrOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printPredicateOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printGlobalOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) - const; - void printJumpTable(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; - void printExtBrtarget(const MCInst *MI, unsigned OpNo, raw_ostream &O) const; + StringRef getRegName(unsigned RegNo) const; + static char const *getRegisterName(unsigned RegNo); + void printRegName(raw_ostream &O, unsigned RegNo) const override; - void printConstantPool(const MCInst *MI, unsigned OpNo, + void printOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void printExtOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void printUnsignedImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printNegImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printNOneImmOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void prints3_6ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void prints3_7ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void prints4_6ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void prints4_7ImmOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printBranchOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printCallOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void printAbsAddrOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printPredicateOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printGlobalOperand(MCInst const *MI, unsigned OpNo, + raw_ostream &O) const; + void printJumpTable(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + void printBrtarget(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; + + void printConstantPool(MCInst const *MI, unsigned OpNo, raw_ostream &O) const; - void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O) const - { printSymbol(MI, OpNo, O, true); } - void printSymbolLo(const MCInst *MI, unsigned OpNo, raw_ostream &O) const - { printSymbol(MI, OpNo, O, false); } + void printSymbolHi(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { + printSymbol(MI, OpNo, O, true); + } + void printSymbolLo(MCInst const *MI, unsigned OpNo, raw_ostream &O) const { + printSymbol(MI, OpNo, O, false); + } - const MCInstrInfo &getMII() const { - return MII; - } + MCAsmInfo const &getMAI() const { return MAI; } + MCInstrInfo const &getMII() const { return MII; } - protected: - void printSymbol(const MCInst *MI, unsigned OpNo, raw_ostream &O, bool hi) - const; +protected: + void printSymbol(MCInst const *MI, unsigned OpNo, raw_ostream &O, + bool hi) const; - private: - const MCInstrInfo &MII; 
+private: + MCInstrInfo const &MII; - bool HasExtender; - void setExtender(MCInst const &MCI); - }; + bool HasExtender; + void setExtender(MCInst const &MCI); +}; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h index dc07069..a8456b4 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h @@ -18,13 +18,14 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; +class Triple; - class HexagonMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit HexagonMCAsmInfo(const Triple &TT); - }; +class HexagonMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit HexagonMCAsmInfo(const Triple &TT); +}; } // namespace llvm diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp new file mode 100644 index 0000000..46b7b41 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp @@ -0,0 +1,581 @@ +//===----- HexagonMCChecker.cpp - Instruction bundle checking -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This implements the checking of insns inside a bundle according to the +// packet constraint rules of the Hexagon ISA. +// +//===----------------------------------------------------------------------===// + +#include "HexagonMCChecker.h" + +#include "HexagonBaseInfo.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +static cl::opt<bool> RelaxNVChecks("relax-nv-checks", cl::init(false), + cl::ZeroOrMore, cl::Hidden, cl::desc("Relax checks of new-value validity")); + +const HexagonMCChecker::PredSense + HexagonMCChecker::Unconditional(Hexagon::NoRegister, false); + +void HexagonMCChecker::init() { + // Initialize read-only registers set. + ReadOnly.insert(Hexagon::PC); + + // Figure out the loop-registers definitions. + if (HexagonMCInstrInfo::isInnerLoop(MCB)) { + Defs[Hexagon::SA0].insert(Unconditional); // FIXME: define or change SA0? + Defs[Hexagon::LC0].insert(Unconditional); + } + if (HexagonMCInstrInfo::isOuterLoop(MCB)) { + Defs[Hexagon::SA1].insert(Unconditional); // FIXME: define or change SA0? + Defs[Hexagon::LC1].insert(Unconditional); + } + + if (HexagonMCInstrInfo::isBundle(MCB)) + // Unfurl a bundle. + for (auto const&I : HexagonMCInstrInfo::bundleInstructions(MCB)) { + init(*I.getInst()); + } + else + init(MCB); +} + +void HexagonMCChecker::init(MCInst const& MCI) { + const MCInstrDesc& MCID = HexagonMCInstrInfo::getDesc(MCII, MCI); + unsigned PredReg = Hexagon::NoRegister; + bool isTrue = false; + + // Get used registers. + for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i) + if (MCI.getOperand(i).isReg()) { + unsigned R = MCI.getOperand(i).getReg(); + + if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) { + // Note an used predicate register. 
+ PredReg = R; + isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI); + + // Note use of new predicate register. + if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI)) + NewPreds.insert(PredReg); + } + else + // Note register use. Super-registers are not tracked directly, + // but their components. + for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid()); + SRI.isValid(); + ++SRI) + if (!MCSubRegIterator(*SRI, &RI).isValid()) + // Skip super-registers used indirectly. + Uses.insert(*SRI); + } + + // Get implicit register definitions. + if (const MCPhysReg *ImpDef = MCID.getImplicitDefs()) + for (; *ImpDef; ++ImpDef) { + unsigned R = *ImpDef; + + if (Hexagon::R31 != R && MCID.isCall()) + // Any register other than the LR and the PC are actually volatile ones + // as defined by the ABI, not modified implicitly by the call insn. + continue; + if (Hexagon::PC == R) + // Branches are the only insns that can change the PC, + // otherwise a read-only register. + continue; + + if (Hexagon::USR_OVF == R) + // Many insns change the USR implicitly, but only one or another flag. + // The instruction table models the USR.OVF flag, which can be implicitly + // modified more than once, but cannot be modified in the same packet + // with an instruction that modifies is explicitly. Deal with such situ- + // ations individually. + SoftDefs.insert(R); + else if (isPredicateRegister(R) && + HexagonMCInstrInfo::isPredicateLate(MCII, MCI)) + // Include implicit late predicates. + LatePreds.insert(R); + else + Defs[R].insert(PredSense(PredReg, isTrue)); + } + + // Figure out explicit register definitions. + for (unsigned i = 0; i < MCID.getNumDefs(); ++i) { + unsigned R = MCI.getOperand(i).getReg(), + S = Hexagon::NoRegister; + + // Note register definitions, direct ones as well as indirect side-effects. + // Super-registers are not tracked directly, but their components. + for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid()); + SRI.isValid(); + ++SRI) { + if (MCSubRegIterator(*SRI, &RI).isValid()) + // Skip super-registers defined indirectly. + continue; + + if (R == *SRI) { + if (S == R) + // Avoid scoring the defined register multiple times. + continue; + else + // Note that the defined register has already been scored. + S = R; + } + + if (Hexagon::P3_0 != R && Hexagon::P3_0 == *SRI) + // P3:0 is a special case, since multiple predicate register definitions + // in a packet is allowed as the equivalent of their logical "and". + // Only an explicit definition of P3:0 is noted as such; if a + // side-effect, then note as a soft definition. + SoftDefs.insert(*SRI); + else if (HexagonMCInstrInfo::isPredicateLate(MCII, MCI) && isPredicateRegister(*SRI)) + // Some insns produce predicates too late to be used in the same packet. + LatePreds.insert(*SRI); + else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCVI_VM_CUR_LD) + // Current loads should be used in the same packet. + // TODO: relies on the impossibility of a current and a temporary loads + // in the same packet. + CurDefs.insert(*SRI), Defs[*SRI].insert(PredSense(PredReg, isTrue)); + else if (i == 0 && llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCVI_VM_TMP_LD) + // Temporary loads should be used in the same packet, but don't commit + // results, so it should be disregarded if another insn changes the same + // register. + // TODO: relies on the impossibility of a current and a temporary loads + // in the same packet. 
+ TmpDefs.insert(*SRI); + else if (i <= 1 && llvm::HexagonMCInstrInfo::hasNewValue2(MCII, MCI) ) + // vshuff(Vx, Vy, Rx) <- Vx(0) and Vy(1) are both source and + // destination registers with this instruction. same for vdeal(Vx,Vy,Rx) + Uses.insert(*SRI); + else + Defs[*SRI].insert(PredSense(PredReg, isTrue)); + } + } + + // Figure out register definitions that produce new values. + if (HexagonMCInstrInfo::hasNewValue(MCII, MCI)) { + unsigned R = HexagonMCInstrInfo::getNewValueOperand(MCII, MCI).getReg(); + + if (HexagonMCInstrInfo::isCompound(MCII, MCI)) + compoundRegisterMap(R); // Compound insns have a limited register range. + + for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid()); + SRI.isValid(); + ++SRI) + if (!MCSubRegIterator(*SRI, &RI).isValid()) + // No super-registers defined indirectly. + NewDefs[*SRI].push_back(NewSense::Def(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI), + HexagonMCInstrInfo::isFloat(MCII, MCI))); + + // For fairly unique 2-dot-new producers, example: + // vdeal(V1, V9, R0) V1.new and V9.new can be used by consumers. + if (HexagonMCInstrInfo::hasNewValue2(MCII, MCI)) { + unsigned R2 = HexagonMCInstrInfo::getNewValueOperand2(MCII, MCI).getReg(); + + for(MCRegAliasIterator SRI(R2, &RI, !MCSubRegIterator(R2, &RI).isValid()); + SRI.isValid(); + ++SRI) + if (!MCSubRegIterator(*SRI, &RI).isValid()) + NewDefs[*SRI].push_back(NewSense::Def(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI), + HexagonMCInstrInfo::isFloat(MCII, MCI))); + } + } + + // Figure out definitions of new predicate registers. + if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI)) + for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i) + if (MCI.getOperand(i).isReg()) { + unsigned P = MCI.getOperand(i).getReg(); + + if (isPredicateRegister(P)) + NewPreds.insert(P); + } + + // Figure out uses of new values. + if (HexagonMCInstrInfo::isNewValue(MCII, MCI)) { + unsigned N = HexagonMCInstrInfo::getNewValueOperand(MCII, MCI).getReg(); + + if (!MCSubRegIterator(N, &RI).isValid()) { + // Super-registers cannot use new values. + if (MCID.isBranch()) + NewUses[N] = NewSense::Jmp(llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV); + else + NewUses[N] = NewSense::Use(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI)); + } + } +} + +HexagonMCChecker::HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst &mcb, MCInst &mcbdx, + MCRegisterInfo const &ri) + : MCB(mcb), MCBDX(mcbdx), RI(ri), MCII(MCII), STI(STI), + bLoadErrInfo(false) { + init(); +} + +bool HexagonMCChecker::check() { + bool chkB = checkBranches(); + bool chkP = checkPredicates(); + bool chkNV = checkNewValues(); + bool chkR = checkRegisters(); + bool chkS = checkSolo(); + bool chkSh = checkShuffle(); + bool chkSl = checkSlots(); + bool chk = chkB && chkP && chkNV && chkR && chkS && chkSh && chkSl; + + return chk; +} + +bool HexagonMCChecker::checkSlots() + +{ + unsigned slotsUsed = 0; + for (auto HMI: HexagonMCInstrInfo::bundleInstructions(MCBDX)) { + MCInst const& MCI = *HMI.getInst(); + if (HexagonMCInstrInfo::isImmext(MCI)) + continue; + if (HexagonMCInstrInfo::isDuplex(MCII, MCI)) + slotsUsed += 2; + else + ++slotsUsed; + } + + if (slotsUsed > HEXAGON_PACKET_SIZE) { + HexagonMCErrInfo errInfo; + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NOSLOTS); + addErrInfo(errInfo); + return false; + } + return true; +} + +// Check legal use of branches. 
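The slot accounting in checkSlots() above is simple to restate in isolation: an immediate extender occupies no slot of its own, a duplex occupies two, everything else occupies one, and HEXAGON_PACKET_SIZE (four) is the ceiling. A reduced sketch of that bookkeeping over a plain struct instead of MCInst (names here are illustrative only):

  #include <vector>

  namespace {
  constexpr unsigned HexagonPacketSize = 4; // mirrors HEXAGON_PACKET_SIZE

  struct InsnKind {
    bool IsImmext; // constant-extender word
    bool IsDuplex; // two sub-instructions packed into one word
  };

  // Returns false when the bundle would need more than four slots.
  bool fitsInPacket(const std::vector<InsnKind> &Bundle) {
    unsigned SlotsUsed = 0;
    for (const InsnKind &I : Bundle) {
      if (I.IsImmext)
        continue; // extenders ride along with the instruction they extend
      SlotsUsed += I.IsDuplex ? 2 : 1;
    }
    return SlotsUsed <= HexagonPacketSize;
  }
  } // namespace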
+bool HexagonMCChecker::checkBranches() { + HexagonMCErrInfo errInfo; + if (HexagonMCInstrInfo::isBundle(MCB)) { + bool hasConditional = false; + unsigned Branches = 0, Returns = 0, NewIndirectBranches = 0, + NewValueBranches = 0, Conditional = HEXAGON_PRESHUFFLE_PACKET_SIZE, + Unconditional = HEXAGON_PRESHUFFLE_PACKET_SIZE; + + for (unsigned i = HexagonMCInstrInfo::bundleInstructionsOffset; + i < MCB.size(); ++i) { + MCInst const &MCI = *MCB.begin()[i].getInst(); + + if (HexagonMCInstrInfo::isImmext(MCI)) + continue; + if (HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch() || + HexagonMCInstrInfo::getDesc(MCII, MCI).isCall()) { + ++Branches; + if (HexagonMCInstrInfo::getDesc(MCII, MCI).isIndirectBranch() && + HexagonMCInstrInfo::isPredicatedNew(MCII, MCI)) + ++NewIndirectBranches; + if (HexagonMCInstrInfo::isNewValue(MCII, MCI)) + ++NewValueBranches; + + if (HexagonMCInstrInfo::isPredicated(MCII, MCI) || + HexagonMCInstrInfo::isPredicatedNew(MCII, MCI)) { + hasConditional = true; + Conditional = i; // Record the position of the conditional branch. + } else { + Unconditional = i; // Record the position of the unconditional branch. + } + } + if (HexagonMCInstrInfo::getDesc(MCII, MCI).isReturn() && + HexagonMCInstrInfo::getDesc(MCII, MCI).mayLoad()) + ++Returns; + } + + if (Branches) // FIXME: should "Defs.count(Hexagon::PC)" be here too? + if (HexagonMCInstrInfo::isInnerLoop(MCB) || + HexagonMCInstrInfo::isOuterLoop(MCB)) { + // Error out if there's any branch in a loop-end packet. + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_ENDLOOP, Hexagon::PC); + addErrInfo(errInfo); + return false; + } + if (Branches > 1) + if (!hasConditional || Conditional > Unconditional) { + // Error out if more than one unconditional branch or + // the conditional branch appears after the unconditional one. + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_BRANCHES); + addErrInfo(errInfo); + return false; + } + } + + return true; +} + +// Check legal use of predicate registers. +bool HexagonMCChecker::checkPredicates() { + HexagonMCErrInfo errInfo; + // Check for proper use of new predicate registers. + for (const auto& I : NewPreds) { + unsigned P = I; + + if (!Defs.count(P) || LatePreds.count(P)) { + // Error out if the new predicate register is not defined, + // or defined "late" + // (e.g., "{ if (p3.new)... ; p3 = sp1loop0(#r7:2, Rs) }"). + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NEWP, P); + addErrInfo(errInfo); + return false; + } + } + + // Check for proper use of auto-anded of predicate registers. + for (const auto& I : LatePreds) { + unsigned P = I; + + if (LatePreds.count(P) > 1 || Defs.count(P)) { + // Error out if predicate register defined "late" multiple times or + // defined late and regularly defined + // (e.g., "{ p3 = sp1loop0(...); p3 = cmp.eq(...) }". + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, P); + addErrInfo(errInfo); + return false; + } + } + + return true; +} + +// Check legal use of new values. +bool HexagonMCChecker::checkNewValues() { + HexagonMCErrInfo errInfo; + memset(&errInfo, 0, sizeof(errInfo)); + for (auto& I : NewUses) { + unsigned R = I.first; + NewSense &US = I.second; + + if (!hasValidNewValueDef(US, NewDefs[R])) { + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_NEWV, R); + addErrInfo(errInfo); + return false; + } + } + + return true; +} + +// Check for legal register uses and definitions. +bool HexagonMCChecker::checkRegisters() { + HexagonMCErrInfo errInfo; + // Check for proper register definitions. 
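The ordering rule enforced at the end of checkBranches() above reads more clearly outside the packet-walking loop: a packet inside a hardware loop may contain no branch at all, and a packet with more than one branch is legal only if at least one of them is conditional and the conditional branch is encoded before the unconditional one. A standalone sketch of that decision (illustrative names, not the checker's API):

  #include <cstddef>

  namespace {
  struct BranchSummary {
    unsigned Branches = 0;       // branches and calls seen in the packet
    bool InEndloopPacket = false;
    bool HasConditional = false;
    std::size_t ConditionalPos = ~std::size_t(0);   // position of a conditional branch
    std::size_t UnconditionalPos = ~std::size_t(0); // position of an unconditional branch
  };

  bool branchesAreLegal(const BranchSummary &S) {
    if (S.Branches && S.InEndloopPacket)
      return false; // no branching out of a loop-end packet
    if (S.Branches > 1 &&
        (!S.HasConditional || S.ConditionalPos > S.UnconditionalPos))
      return false; // the conditional branch must precede the unconditional one
    return true;
  }
  } // namespace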
+ for (const auto& I : Defs) { + unsigned R = I.first; + + if (ReadOnly.count(R)) { + // Error out for definitions of read-only registers. + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_READONLY, R); + addErrInfo(errInfo); + return false; + } + if (isLoopRegister(R) && Defs.count(R) > 1 && + (HexagonMCInstrInfo::isInnerLoop(MCB) || + HexagonMCInstrInfo::isOuterLoop(MCB))) { + // Error out for definitions of loop registers at the end of a loop. + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_LOOP, R); + addErrInfo(errInfo); + return false; + } + if (SoftDefs.count(R)) { + // Error out for explicit changes to registers also weakly defined + // (e.g., "{ usr = r0; r0 = sfadd(...) }"). + unsigned UsrR = Hexagon::USR; // Silence warning about mixed types in ?:. + unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR); + addErrInfo(errInfo); + return false; + } + if (!isPredicateRegister(R) && Defs[R].size() > 1) { + // Check for multiple register definitions. + PredSet &PM = Defs[R]; + + // Check for multiple unconditional register definitions. + if (PM.count(Unconditional)) { + // Error out on an unconditional change when there are any other + // changes, conditional or not. + unsigned UsrR = Hexagon::USR; + unsigned BadR = RI.isSubRegister(Hexagon::USR, R) ? UsrR : R; + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, BadR); + addErrInfo(errInfo); + return false; + } + // Check for multiple conditional register definitions. + for (const auto& J : PM) { + PredSense P = J; + + // Check for multiple uses of the same condition. + if (PM.count(P) > 1) { + // Error out on conditional changes based on the same predicate + // (e.g., "{ if (!p0) r0 =...; if (!p0) r0 =... }"). + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R); + addErrInfo(errInfo); + return false; + } + // Check for the use of the complementary condition. + P.second = !P.second; + if (PM.count(P) && PM.size() > 2) { + // Error out on conditional changes based on the same predicate + // multiple times + // (e.g., "{ if (p0) r0 =...; if (!p0) r0 =... }; if (!p0) r0 =... }"). + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_REGISTERS, R); + addErrInfo(errInfo); + return false; + } + } + } + } + + // Check for use of current definitions. + for (const auto& I : CurDefs) { + unsigned R = I; + + if (!Uses.count(R)) { + // Warn on an unused current definition. + errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_CURRENT, R); + addErrInfo(errInfo); + return true; + } + } + + // Check for use of temporary definitions. + for (const auto& I : TmpDefs) { + unsigned R = I; + + if (!Uses.count(R)) { + // special case for vhist + bool vHistFound = false; + for (auto const&HMI : HexagonMCInstrInfo::bundleInstructions(MCB)) { + if(llvm::HexagonMCInstrInfo::getType(MCII, *HMI.getInst()) == HexagonII::TypeCVI_HIST) { + vHistFound = true; // vhist() implicitly uses ALL REGxx.tmp + break; + } + } + // Warn on an unused temporary definition. + if (vHistFound == false) { + errInfo.setWarning(HexagonMCErrInfo::CHECK_WARN_TEMPORARY, R); + addErrInfo(errInfo); + return true; + } + } + } + + return true; +} + +// Check for legal use of solo insns. 
+bool HexagonMCChecker::checkSolo() { + HexagonMCErrInfo errInfo; + if (HexagonMCInstrInfo::isBundle(MCB) && + HexagonMCInstrInfo::bundleSize(MCB) > 1) { + for (auto const&I : HexagonMCInstrInfo::bundleInstructions(MCB)) { + if (llvm::HexagonMCInstrInfo::isSolo(MCII, *I.getInst())) { + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SOLO); + addErrInfo(errInfo); + return false; + } + } + } + + return true; +} + +bool HexagonMCChecker::checkShuffle() { + HexagonMCErrInfo errInfo; + // Branch info is lost when duplexing. The unduplexed insns must be + // checked and only branch errors matter for this case. + HexagonMCShuffler MCS(MCII, STI, MCB); + if (!MCS.check()) { + if (MCS.getError() == HexagonShuffler::SHUFFLE_ERROR_BRANCHES) { + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE); + errInfo.setShuffleError(MCS.getError()); + addErrInfo(errInfo); + return false; + } + } + HexagonMCShuffler MCSDX(MCII, STI, MCBDX); + if (!MCSDX.check()) { + errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE); + errInfo.setShuffleError(MCSDX.getError()); + addErrInfo(errInfo); + return false; + } + return true; +} + +void HexagonMCChecker::compoundRegisterMap(unsigned& Register) { + switch (Register) { + default: + break; + case Hexagon::R15: + Register = Hexagon::R23; + break; + case Hexagon::R14: + Register = Hexagon::R22; + break; + case Hexagon::R13: + Register = Hexagon::R21; + break; + case Hexagon::R12: + Register = Hexagon::R20; + break; + case Hexagon::R11: + Register = Hexagon::R19; + break; + case Hexagon::R10: + Register = Hexagon::R18; + break; + case Hexagon::R9: + Register = Hexagon::R17; + break; + case Hexagon::R8: + Register = Hexagon::R16; + break; + } +} + +bool HexagonMCChecker::hasValidNewValueDef(const NewSense &Use, + const NewSenseList &Defs) const { + bool Strict = !RelaxNVChecks; + + for (unsigned i = 0, n = Defs.size(); i < n; ++i) { + const NewSense &Def = Defs[i]; + // NVJ cannot use a new FP value [7.6.1] + if (Use.IsNVJ && (Def.IsFloat || Def.PredReg != 0)) + continue; + // If the definition was not predicated, then it does not matter if + // the use is. + if (Def.PredReg == 0) + return true; + // With the strict checks, both the definition and the use must be + // predicated on the same register and condition. + if (Strict) { + if (Def.PredReg == Use.PredReg && Def.Cond == Use.Cond) + return true; + } else { + // With the relaxed checks, if the definition was predicated, the only + // detectable violation is if the use is predicated on the opposing + // condition, otherwise, it's ok. + if (Def.PredReg != Use.PredReg || Def.Cond == Use.Cond) + return true; + } + } + return false; +} + diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h new file mode 100644 index 0000000..5fc0bde --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h @@ -0,0 +1,218 @@ +//===----- HexagonMCChecker.h - Instruction bundle checking ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This implements the checking of insns inside a bundle according to the +// packet constraint rules of the Hexagon ISA. 
+// +//===----------------------------------------------------------------------===// + +#ifndef HEXAGONMCCHECKER_H +#define HEXAGONMCCHECKER_H + +#include <map> +#include <set> +#include <queue> +#include "MCTargetDesc/HexagonMCShuffler.h" + +using namespace llvm; + +namespace llvm { +class MCOperandInfo; + +typedef struct { + unsigned Error, Warning, ShuffleError; + unsigned Register; +} ErrInfo_T; + +class HexagonMCErrInfo { +public: + enum { + CHECK_SUCCESS = 0, + // Errors. + CHECK_ERROR_BRANCHES = 0x00001, + CHECK_ERROR_NEWP = 0x00002, + CHECK_ERROR_NEWV = 0x00004, + CHECK_ERROR_REGISTERS = 0x00008, + CHECK_ERROR_READONLY = 0x00010, + CHECK_ERROR_LOOP = 0x00020, + CHECK_ERROR_ENDLOOP = 0x00040, + CHECK_ERROR_SOLO = 0x00080, + CHECK_ERROR_SHUFFLE = 0x00100, + CHECK_ERROR_NOSLOTS = 0x00200, + CHECK_ERROR_UNKNOWN = 0x00400, + // Warnings. + CHECK_WARN_CURRENT = 0x10000, + CHECK_WARN_TEMPORARY = 0x20000 + }; + ErrInfo_T s; + + void reset() { + s.Error = CHECK_SUCCESS; + s.Warning = CHECK_SUCCESS; + s.ShuffleError = HexagonShuffler::SHUFFLE_SUCCESS; + s.Register = Hexagon::NoRegister; + }; + HexagonMCErrInfo() { + reset(); + }; + + void setError(unsigned e, unsigned r = Hexagon::NoRegister) + { s.Error = e; s.Register = r; }; + void setWarning(unsigned w, unsigned r = Hexagon::NoRegister) + { s.Warning = w; s.Register = r; }; + void setShuffleError(unsigned e) { s.ShuffleError = e; }; +}; + +/// Check for a valid bundle. +class HexagonMCChecker { + /// Insn bundle. + MCInst& MCB; + MCInst& MCBDX; + const MCRegisterInfo& RI; + MCInstrInfo const &MCII; + MCSubtargetInfo const &STI; + bool bLoadErrInfo; + + /// Set of definitions: register #, if predicated, if predicated true. + typedef std::pair<unsigned, bool> PredSense; + static const PredSense Unconditional; + typedef std::multiset<PredSense> PredSet; + typedef std::multiset<PredSense>::iterator PredSetIterator; + + typedef llvm::DenseMap<unsigned, PredSet>::iterator DefsIterator; + llvm::DenseMap<unsigned, PredSet> Defs; + + /// Information about how a new-value register is defined or used: + /// PredReg = predicate register, 0 if use/def not predicated, + /// Cond = true/false for if(PredReg)/if(!PredReg) respectively, + /// IsFloat = true if definition produces a floating point value + /// (not valid for uses), + /// IsNVJ = true if the use is a new-value branch (not valid for + /// definitions). + struct NewSense { + unsigned PredReg; + bool IsFloat, IsNVJ, Cond; + // The special-case "constructors": + static NewSense Jmp(bool isNVJ) { + NewSense NS = { /*PredReg=*/ 0, /*IsFloat=*/ false, /*IsNVJ=*/ isNVJ, + /*Cond=*/ false }; + return NS; + } + static NewSense Use(unsigned PR, bool True) { + NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ false, /*IsNVJ=*/ false, + /*Cond=*/ True }; + return NS; + } + static NewSense Def(unsigned PR, bool True, bool Float) { + NewSense NS = { /*PredReg=*/ PR, /*IsFloat=*/ Float, /*IsNVJ=*/ false, + /*Cond=*/ True }; + return NS; + } + }; + /// Set of definitions that produce new register: + typedef llvm::SmallVector<NewSense,2> NewSenseList; + typedef llvm::DenseMap<unsigned, NewSenseList>::iterator NewDefsIterator; + llvm::DenseMap<unsigned, NewSenseList> NewDefs; + + /// Set of weak definitions whose clashes should be enforced selectively. + typedef std::set<unsigned>::iterator SoftDefsIterator; + std::set<unsigned> SoftDefs; + + /// Set of current definitions committed to the register file. 
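The NewSense records declared above are what hasValidNewValueDef() (earlier in this change) matches against: an unpredicated producer satisfies any consumer, the strict mode requires producer and consumer to agree on both predicate register and sense, and the relaxed mode rejects only a consumer predicated on the provably opposite sense; a new-value jump additionally refuses floating-point or predicated producers. A compact restatement of that matching rule (standalone sketch, not the class itself):

  #include <vector>

  namespace {
  struct NewSenseLite {
    unsigned PredReg = 0; // 0 = not predicated
    bool IsFloat = false;
    bool IsNVJ = false;
    bool Cond = false;
  };

  bool defSatisfiesUse(const NewSenseLite &Use, const NewSenseLite &Def,
                       bool Strict) {
    if (Use.IsNVJ && (Def.IsFloat || Def.PredReg != 0))
      return false; // an NVJ cannot consume an FP or predicated new value
    if (Def.PredReg == 0)
      return true;  // unpredicated producer is always visible
    if (Strict)
      return Def.PredReg == Use.PredReg && Def.Cond == Use.Cond;
    // Relaxed: only the opposite sense on the same predicate is rejected.
    return Def.PredReg != Use.PredReg || Def.Cond == Use.Cond;
  }

  bool hasValidDef(const NewSenseLite &Use,
                   const std::vector<NewSenseLite> &Defs, bool Strict) {
    for (const NewSenseLite &Def : Defs)
      if (defSatisfiesUse(Use, Def, Strict))
        return true;
    return false;
  }
  } // namespace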
+ typedef std::set<unsigned>::iterator CurDefsIterator; + std::set<unsigned> CurDefs; + + /// Set of temporary definitions not committed to the register file. + typedef std::set<unsigned>::iterator TmpDefsIterator; + std::set<unsigned> TmpDefs; + + /// Set of new predicates used. + typedef std::set<unsigned>::iterator NewPredsIterator; + std::set<unsigned> NewPreds; + + /// Set of predicates defined late. + typedef std::multiset<unsigned>::iterator LatePredsIterator; + std::multiset<unsigned> LatePreds; + + /// Set of uses. + typedef std::set<unsigned>::iterator UsesIterator; + std::set<unsigned> Uses; + + /// Set of new values used: new register, if new-value jump. + typedef llvm::DenseMap<unsigned, NewSense>::iterator NewUsesIterator; + llvm::DenseMap<unsigned, NewSense> NewUses; + + /// Pre-defined set of read-only registers. + typedef std::set<unsigned>::iterator ReadOnlyIterator; + std::set<unsigned> ReadOnly; + + std::queue<ErrInfo_T> ErrInfoQ; + HexagonMCErrInfo CrntErrInfo; + + void getErrInfo() { + if (bLoadErrInfo == true) { + if (ErrInfoQ.empty()) { + CrntErrInfo.reset(); + } else { + CrntErrInfo.s = ErrInfoQ.front(); + ErrInfoQ.pop(); + } + } + bLoadErrInfo = false; + } + + void init(); + void init(MCInst const&); + + // Checks performed. + bool checkBranches(); + bool checkPredicates(); + bool checkNewValues(); + bool checkRegisters(); + bool checkSolo(); + bool checkShuffle(); + bool checkSlots(); + + static void compoundRegisterMap(unsigned&); + + bool isPredicateRegister(unsigned R) const { + return (Hexagon::P0 == R || Hexagon::P1 == R || + Hexagon::P2 == R || Hexagon::P3 == R); + }; + bool isLoopRegister(unsigned R) const { + return (Hexagon::SA0 == R || Hexagon::LC0 == R || + Hexagon::SA1 == R || Hexagon::LC1 == R); + }; + + bool hasValidNewValueDef(const NewSense &Use, + const NewSenseList &Defs) const; + + public: + explicit HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst& mcb, MCInst &mcbdx, + const MCRegisterInfo& ri); + + bool check(); + + /// add a new error/warning + void addErrInfo(HexagonMCErrInfo &err) { ErrInfoQ.push(err.s); }; + + /// Return the error code for the last operation in the insn bundle. + unsigned getError() { getErrInfo(); return CrntErrInfo.s.Error; }; + unsigned getWarning() { getErrInfo(); return CrntErrInfo.s.Warning; }; + unsigned getShuffleError() { getErrInfo(); return CrntErrInfo.s.ShuffleError; }; + unsigned getErrRegister() { getErrInfo(); return CrntErrInfo.s.Register; }; + bool getNextErrInfo() { + bLoadErrInfo = true; + return (ErrInfoQ.empty()) ? false : (getErrInfo(), true); + } +}; + +} + +#endif // HEXAGONMCCHECKER_H diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index 9fc4e2a..c2c6275 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -96,6 +96,12 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction( assert(!HexagonMCInstrInfo::isBundle(HMB)); uint64_t Binary; + // Compound instructions are limited to using registers 0-7 and 16-23 + // and here we make a map 16-23 to 8-15 so they can be correctly encoded. + static unsigned RegMap[8] = {Hexagon::R8, Hexagon::R9, Hexagon::R10, + Hexagon::R11, Hexagon::R12, Hexagon::R13, + Hexagon::R14, Hexagon::R15}; + // Pseudo instructions don't get encoded and shouldn't be here // in the first place! 
assert(!HexagonMCInstrInfo::getDesc(MCII, HMB).isPseudo() && @@ -104,6 +110,16 @@ void HexagonMCCodeEmitter::EncodeSingleInstruction( " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'" "\n"); + if (llvm::HexagonMCInstrInfo::getType(MCII, HMB) == HexagonII::TypeCOMPOUND) { + for (unsigned i = 0; i < HMB.getNumOperands(); ++i) + if (HMB.getOperand(i).isReg()) { + unsigned Reg = + MCT.getRegisterInfo()->getEncodingValue(HMB.getOperand(i).getReg()); + if ((Reg <= 23) && (Reg >= 16)) + HMB.getOperand(i).setReg(RegMap[Reg - 16]); + } + } + if (HexagonMCInstrInfo::isNewValue(MCII, HMB)) { // Calculate the new value distance to the associated producer MCOperand &MCO = @@ -318,7 +334,7 @@ static Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI, // The only relocs left should be GP relative: default: if (MCID.mayStore() || MCID.mayLoad()) { - for (const uint16_t *ImpUses = MCID.getImplicitUses(); *ImpUses; + for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses; ++ImpUses) { if (*ImpUses == Hexagon::GP) { switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) { @@ -389,10 +405,8 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI, return cast<MCConstantExpr>(ME)->getValue(); } if (MK == MCExpr::Binary) { - unsigned Res; - Res = getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getLHS(), Fixups, STI); - Res += - getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getRHS(), Fixups, STI); + getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getLHS(), Fixups, STI); + getExprOpValue(MI, MO, cast<MCBinaryExpr>(ME)->getRHS(), Fixups, STI); return 0; } diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp index 886f8db..d194bea 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp @@ -115,8 +115,8 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) { SrcReg = MI.getOperand(1).getReg(); if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MI.getOperand(2).isImm() && ((isUInt<5>(MI.getOperand(2).getImm())) || - (MI.getOperand(2).getImm() == -1))) + (HexagonMCInstrInfo::inRange<5>(MI, 2) || + HexagonMCInstrInfo::minConstant(MI, 2) == -1)) return HexagonII::HCG_A; break; case Hexagon::A2_tfr: @@ -134,8 +134,8 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) { return false; // Rd = #u6 DstReg = MI.getOperand(0).getReg(); - if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() <= 63 && - MI.getOperand(1).getImm() >= 0 && + if (HexagonMCInstrInfo::minConstant(MI, 1) <= 63 && + HexagonMCInstrInfo::minConstant(MI, 1) >= 0 && HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) return HexagonII::HCG_A; break; @@ -145,9 +145,8 @@ unsigned getCompoundCandidateGroup(MCInst const &MI, bool IsExtended) { DstReg = MI.getOperand(0).getReg(); Src1Reg = MI.getOperand(1).getReg(); if ((Hexagon::P0 == DstReg || Hexagon::P1 == DstReg) && - MI.getOperand(2).isImm() && HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && - (MI.getOperand(2).getImm() == 0)) + HexagonMCInstrInfo::minConstant(MI, 2) == 0) return HexagonII::HCG_A; break; // The fact that .new form is used pretty much guarantees @@ -206,6 +205,8 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { MCInst *CompoundInsn = 0; unsigned compoundOpcode; MCOperand Rs, Rt; + int64_t Value; + bool Success; switch 
(L.getOpcode()) { default: @@ -277,7 +278,10 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { case Hexagon::C2_cmpeqi: DEBUG(dbgs() << "CX: C2_cmpeqi\n"); - if (L.getOperand(2).getImm() == -1) + Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + (void)Success; + assert(Success); + if (Value == -1) compoundOpcode = cmpeqn1BitOpcode[getCompoundOp(R)]; else compoundOpcode = cmpeqiBitOpcode[getCompoundOp(R)]; @@ -286,14 +290,17 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { CompoundInsn = new (Context) MCInst; CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rs); - if (L.getOperand(2).getImm() != -1) + if (Value != -1) CompoundInsn->addOperand(L.getOperand(2)); CompoundInsn->addOperand(R.getOperand(1)); break; case Hexagon::C2_cmpgti: DEBUG(dbgs() << "CX: C2_cmpgti\n"); - if (L.getOperand(2).getImm() == -1) + Success = L.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + (void)Success; + assert(Success); + if (Value == -1) compoundOpcode = cmpgtn1BitOpcode[getCompoundOp(R)]; else compoundOpcode = cmpgtiBitOpcode[getCompoundOp(R)]; @@ -302,7 +309,7 @@ MCInst *getCompoundInsn(MCContext &Context, MCInst const &L, MCInst const &R) { CompoundInsn = new (Context) MCInst; CompoundInsn->setOpcode(compoundOpcode); CompoundInsn->addOperand(Rs); - if (L.getOperand(2).getImm() != -1) + if (Value != -1) CompoundInsn->addOperand(L.getOperand(2)); CompoundInsn->addOperand(R.getOperand(1)); break; @@ -404,7 +411,7 @@ bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) { /// additional slot. void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI) { - assert(MCI.getOpcode() == Hexagon::BUNDLE && + assert(HexagonMCInstrInfo::isBundle(MCI) && "Non-Bundle where Bundle expected"); // By definition a compound must have 2 insn. diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp index 7e9247c..e6194f6 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp @@ -26,7 +26,7 @@ using namespace Hexagon; #define DEBUG_TYPE "hexagon-mcduplex-info" // pair table of subInstructions with opcodes -static std::pair<unsigned, unsigned> opcodeData[] = { +static const std::pair<unsigned, unsigned> opcodeData[] = { std::make_pair((unsigned)V4_SA1_addi, 0), std::make_pair((unsigned)V4_SA1_addrx, 6144), std::make_pair((unsigned)V4_SA1_addsp, 3072), @@ -81,8 +81,7 @@ static std::pair<unsigned, unsigned> opcodeData[] = { std::make_pair((unsigned)V4_SS2_storewi1, 4352)}; static std::map<unsigned, unsigned> - subinstOpcodeMap(opcodeData, - opcodeData + sizeof(opcodeData) / sizeof(opcodeData[0])); + subinstOpcodeMap(std::begin(opcodeData), std::end(opcodeData)); bool HexagonMCInstrInfo::isDuplexPairMatch(unsigned Ga, unsigned Gb) { switch (Ga) { @@ -195,15 +194,13 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // Special case this one from Group L2. 
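The duplex-table change above replaces the sizeof-based pointer arithmetic with std::begin/std::end over a now-const array, which keeps the map construction correct even if the element type or count changes. The same pattern in miniature (opcode numbers below are made up for illustration):

  #include <iterator>
  #include <map>
  #include <utility>

  namespace {
  const std::pair<unsigned, unsigned> kOpcodeData[] = {
      {101, 0}, {102, 6144}, {103, 3072}, // opcode -> encoding bits
  };

  // Constructed once from the array's full extent, with no manual element count.
  const std::map<unsigned, unsigned>
      kSubinstOpcodeMap(std::begin(kOpcodeData), std::end(kOpcodeData));

  unsigned encodingFor(unsigned Opcode) {
    auto It = kSubinstOpcodeMap.find(Opcode);
    return It == kSubinstOpcodeMap.end() ? 0 : It->second;
  }
  } // namespace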
// Rd = memw(r29+#u5:2) if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) { - if (HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg && - MCI.getOperand(2).isImm() && - isShiftedUInt<5, 2>(MCI.getOperand(2).getImm())) { + if (HexagonMCInstrInfo::isIntReg(SrcReg) && + Hexagon::R29 == SrcReg && inRange<5, 2>(MCI, 2)) { return HexagonII::HSIG_L2; } // Rd = memw(Rs+#u4:2) if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - (MCI.getOperand(2).isImm() && - isShiftedUInt<4, 2>(MCI.getOperand(2).getImm()))) { + inRange<4, 2>(MCI, 2)) { return HexagonII::HSIG_L1; } } @@ -214,7 +211,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && isUInt<4>(MCI.getOperand(2).getImm())) { + inRange<4>(MCI, 2)) { return HexagonII::HSIG_L1; } break; @@ -235,8 +232,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && - isShiftedUInt<3, 1>(MCI.getOperand(2).getImm())) { + inRange<3, 1>(MCI, 2)) { return HexagonII::HSIG_L2; } break; @@ -246,7 +242,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && isUInt<3>(MCI.getOperand(2).getImm())) { + inRange<3>(MCI, 2)) { return HexagonII::HSIG_L2; } break; @@ -256,8 +252,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg && - MCI.getOperand(2).isImm() && - isShiftedUInt<5, 3>(MCI.getOperand(2).getImm())) { + inRange<5, 3>(MCI, 2)) { return HexagonII::HSIG_L2; } break; @@ -326,15 +321,13 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { Src2Reg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isIntReg(Src1Reg) && HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) && - Hexagon::R29 == Src1Reg && MCI.getOperand(1).isImm() && - isShiftedUInt<5, 2>(MCI.getOperand(1).getImm())) { + Hexagon::R29 == Src1Reg && inRange<5, 2>(MCI, 1)) { return HexagonII::HSIG_S2; } // memw(Rs+#u4:2) = Rt if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) && - MCI.getOperand(1).isImm() && - isShiftedUInt<4, 2>(MCI.getOperand(1).getImm())) { + inRange<4, 2>(MCI, 1)) { return HexagonII::HSIG_S1; } break; @@ -344,7 +337,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { Src2Reg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) && - MCI.getOperand(1).isImm() && isUInt<4>(MCI.getOperand(1).getImm())) { + inRange<4>(MCI, 1)) { return HexagonII::HSIG_S1; } break; @@ -363,8 +356,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { Src2Reg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && HexagonMCInstrInfo::isIntRegForSubInst(Src2Reg) && - MCI.getOperand(1).isImm() && - isShiftedUInt<3, 1>(MCI.getOperand(1).getImm())) { + inRange<3, 1>(MCI, 1)) { return HexagonII::HSIG_S2; } break; @@ 
-374,8 +366,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { Src2Reg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(Src2Reg) && HexagonMCInstrInfo::isIntReg(Src1Reg) && Hexagon::R29 == Src1Reg && - MCI.getOperand(1).isImm() && - isShiftedInt<6, 3>(MCI.getOperand(1).getImm())) { + inSRange<6, 3>(MCI, 1)) { return HexagonII::HSIG_S2; } break; @@ -383,9 +374,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // memw(Rs+#u4:2) = #U1 Src1Reg = MCI.getOperand(0).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && - MCI.getOperand(1).isImm() && - isShiftedUInt<4, 2>(MCI.getOperand(1).getImm()) && - MCI.getOperand(2).isImm() && isUInt<1>(MCI.getOperand(2).getImm())) { + inRange<4, 2>(MCI, 1) && inRange<1>(MCI, 2)) { return HexagonII::HSIG_S2; } break; @@ -393,16 +382,13 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // memb(Rs+#u4) = #U1 Src1Reg = MCI.getOperand(0).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(Src1Reg) && - MCI.getOperand(1).isImm() && isUInt<4>(MCI.getOperand(1).getImm()) && - MCI.getOperand(2).isImm() && isUInt<1>(MCI.getOperand(2).getImm())) { + inRange<4>(MCI, 1) && inRange<1>(MCI, 2)) { return HexagonII::HSIG_S2; } break; case Hexagon::S2_allocframe: - if (MCI.getOperand(0).isImm() && - isShiftedUInt<5, 3>(MCI.getOperand(0).getImm())) { + if (inRange<5, 3>(MCI, 0)) return HexagonII::HSIG_S2; - } break; // // Group A: @@ -428,8 +414,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) { // Rd = add(r29,#u6:2) if (HexagonMCInstrInfo::isIntReg(SrcReg) && Hexagon::R29 == SrcReg && - MCI.getOperand(2).isImm() && - isShiftedUInt<6, 2>(MCI.getOperand(2).getImm())) { + inRange<6, 2>(MCI, 2)) { return HexagonII::HSIG_A; } // Rx = add(Rx,#s7) @@ -439,8 +424,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // Rd = add(Rs,#1) // Rd = add(Rs,#-1) if (HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && ((MCI.getOperand(2).getImm() == 1) || - (MCI.getOperand(2).getImm() == -1))) { + (minConstant(MCI, 2) == 1 || minConstant(MCI, 2) == -1)) { return HexagonII::HSIG_A; } } @@ -460,8 +444,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && ((MCI.getOperand(2).getImm() == 1) || - (MCI.getOperand(2).getImm() == 255))) { + (minConstant(MCI, 2) == 1 || minConstant(MCI, 2) == 255)) { return HexagonII::HSIG_A; } break; @@ -491,8 +474,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { DstReg = MCI.getOperand(0).getReg(); // Rd PredReg = MCI.getOperand(1).getReg(); // P0 if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg) && - Hexagon::P0 == PredReg && MCI.getOperand(2).isImm() && - MCI.getOperand(2).getImm() == 0) { + Hexagon::P0 == PredReg && minConstant(MCI, 2) == 0) { return HexagonII::HSIG_A; } break; @@ -502,7 +484,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (Hexagon::P0 == DstReg && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - MCI.getOperand(2).isImm() && isUInt<2>(MCI.getOperand(2).getImm())) { + inRange<2>(MCI, 2)) { return HexagonII::HSIG_A; } break; @@ -511,10 +493,7 @@ unsigned 
HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { // Rdd = combine(#u2,#U2) DstReg = MCI.getOperand(0).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) && - // TODO: Handle Globals/Symbols - (MCI.getOperand(1).isImm() && isUInt<2>(MCI.getOperand(1).getImm())) && - ((MCI.getOperand(2).isImm() && - isUInt<2>(MCI.getOperand(2).getImm())))) { + inRange<2>(MCI, 1) && inRange<2>(MCI, 2)) { return HexagonII::HSIG_A; } break; @@ -524,7 +503,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(1).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - (MCI.getOperand(2).isImm() && MCI.getOperand(2).getImm() == 0)) { + minConstant(MCI, 2) == 0) { return HexagonII::HSIG_A; } break; @@ -534,7 +513,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { SrcReg = MCI.getOperand(2).getReg(); if (HexagonMCInstrInfo::isDblRegForSubInst(DstReg) && HexagonMCInstrInfo::isIntRegForSubInst(SrcReg) && - (MCI.getOperand(1).isImm() && MCI.getOperand(1).getImm() == 0)) { + minConstant(MCI, 1) == 0) { return HexagonII::HSIG_A; } break; @@ -556,19 +535,17 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) { } bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) { - unsigned DstReg, SrcReg; - switch (potentialDuplex.getOpcode()) { case Hexagon::A2_addi: // testing for case of: Rx = add(Rx,#s7) DstReg = potentialDuplex.getOperand(0).getReg(); SrcReg = potentialDuplex.getOperand(1).getReg(); if (DstReg == SrcReg && HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) { - if (potentialDuplex.getOperand(2).isExpr()) + int64_t Value; + if (!potentialDuplex.getOperand(2).getExpr()->evaluateAsAbsolute(Value)) return true; - if (potentialDuplex.getOperand(2).isImm() && - !(isShiftedInt<7, 0>(potentialDuplex.getOperand(2).getImm()))) + if (!isShiftedInt<7, 0>(Value)) return true; } break; @@ -576,15 +553,14 @@ bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) { DstReg = potentialDuplex.getOperand(0).getReg(); if (HexagonMCInstrInfo::isIntRegForSubInst(DstReg)) { - if (potentialDuplex.getOperand(1).isExpr()) + int64_t Value; + if (!potentialDuplex.getOperand(1).getExpr()->evaluateAsAbsolute(Value)) return true; // Check for case of Rd = #-1. - if (potentialDuplex.getOperand(1).isImm() && - (potentialDuplex.getOperand(1).getImm() == -1)) + if (Value == -1) return false; // Check for case of Rd = #u6. 
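Throughout the duplex-candidate checks above, the open-coded isImm()/isUInt/isShiftedUInt tests are folded into inRange, inSRange and minConstant helpers. Only minConstant's definition appears later in this change; the range helpers are used but not defined in these hunks, so the following is an assumed shape rather than the actual implementation: evaluate the operand expression as an absolute constant and test it against a shifted unsigned field.

  #include "llvm/MC/MCInst.h"
  #include "llvm/Support/MathExtras.h"

  namespace {
  // Hypothetical stand-in for HexagonMCInstrInfo::inRange<Bits, Shift>.
  template <unsigned Bits, unsigned Shift = 0>
  bool inRangeSketch(llvm::MCInst const &MCI, unsigned Op) {
    llvm::MCOperand const &MO = MCI.getOperand(Op);
    int64_t Value;
    if (!MO.isExpr() || !MO.getExpr()->evaluateAsAbsolute(Value))
      return false;
    return llvm::isShiftedUInt<Bits, Shift>(Value);
  }
  } // namespace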
- if (potentialDuplex.getOperand(1).isImm() && - !isShiftedUInt<6, 0>(potentialDuplex.getOperand(1).getImm())) + if (!isShiftedUInt<6, 0>(Value)) return true; } break; @@ -712,19 +688,23 @@ inline static void addOps(MCInst &subInstPtr, MCInst const &Inst, MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { MCInst Result; + bool Absolute; + int64_t Value; switch (Inst.getOpcode()) { default: // dbgs() << "opcode: "<< Inst->getOpcode() << "\n"; llvm_unreachable("Unimplemented subinstruction \n"); break; case Hexagon::A2_addi: - if (Inst.getOperand(2).isImm() && Inst.getOperand(2).getImm() == 1) { + Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + assert(Absolute);(void)Absolute; + if (Value == 1) { Result.setOpcode(Hexagon::V4_SA1_inc); addOps(Result, Inst, 0); addOps(Result, Inst, 1); break; } // 1,2 SUBInst $Rd = add($Rs, #1) - else if (Inst.getOperand(2).isImm() && Inst.getOperand(2).getImm() == -1) { + else if (Value == -1) { Result.setOpcode(Hexagon::V4_SA1_dec); addOps(Result, Inst, 0); addOps(Result, Inst, 1); @@ -754,7 +734,7 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { addOps(Result, Inst, 0); break; // 1 SUBInst allocframe(#$u5_3) case Hexagon::A2_andir: - if (Inst.getOperand(2).getImm() == 255) { + if (minConstant(Inst, 2) == 255) { Result.setOpcode(Hexagon::V4_SA1_zxtb); addOps(Result, Inst, 0); addOps(Result, Inst, 1); @@ -772,26 +752,27 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { break; // 2,3 SUBInst p0 = cmp.eq($Rs, #$u2) case Hexagon::A4_combineii: case Hexagon::A2_combineii: - if (Inst.getOperand(1).getImm() == 1) { + Absolute = Inst.getOperand(1).getExpr()->evaluateAsAbsolute(Value); + assert(Absolute);(void)Absolute; + if (Value == 1) { Result.setOpcode(Hexagon::V4_SA1_combine1i); addOps(Result, Inst, 0); addOps(Result, Inst, 2); break; // 1,3 SUBInst $Rdd = combine(#1, #$u2) } - - if (Inst.getOperand(1).getImm() == 3) { + if (Value == 3) { Result.setOpcode(Hexagon::V4_SA1_combine3i); addOps(Result, Inst, 0); addOps(Result, Inst, 2); break; // 1,3 SUBInst $Rdd = combine(#3, #$u2) } - if (Inst.getOperand(1).getImm() == 0) { + if (Value == 0) { Result.setOpcode(Hexagon::V4_SA1_combine0i); addOps(Result, Inst, 0); addOps(Result, Inst, 2); break; // 1,3 SUBInst $Rdd = combine(#0, #$u2) } - if (Inst.getOperand(1).getImm() == 2) { + if (Value == 2) { Result.setOpcode(Hexagon::V4_SA1_combine2i); addOps(Result, Inst, 0); addOps(Result, Inst, 2); @@ -894,12 +875,14 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { break; // 1,2,3 SUBInst $Rd = memw($Rs + #$u4_2) } case Hexagon::S4_storeirb_io: - if (Inst.getOperand(2).getImm() == 0) { + Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + assert(Absolute);(void)Absolute; + if (Value == 0) { Result.setOpcode(Hexagon::V4_SS2_storebi0); addOps(Result, Inst, 0); addOps(Result, Inst, 1); break; // 1,2 SUBInst memb($Rs + #$u4_0)=#0 - } else if (Inst.getOperand(2).getImm() == 1) { + } else if (Value == 1) { Result.setOpcode(Hexagon::V4_SS2_storebi1); addOps(Result, Inst, 0); addOps(Result, Inst, 1); @@ -923,12 +906,14 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { addOps(Result, Inst, 2); break; // 1,2,3 SUBInst memb($Rs + #$u4_0) = $Rt case Hexagon::S4_storeiri_io: - if (Inst.getOperand(2).getImm() == 0) { + Absolute = Inst.getOperand(2).getExpr()->evaluateAsAbsolute(Value); + assert(Absolute);(void)Absolute; + if (Value == 0) { Result.setOpcode(Hexagon::V4_SS2_storewi0); addOps(Result, Inst, 0); addOps(Result, 
Inst, 1); break; // 3 1,2 SUBInst memw($Rs + #$u4_2)=#0 - } else if (Inst.getOperand(2).getImm() == 1) { + } else if (Value == 1) { Result.setOpcode(Hexagon::V4_SS2_storewi1); addOps(Result, Inst, 0); addOps(Result, Inst, 1); @@ -983,7 +968,8 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) { addOps(Result, Inst, 0); break; // 2 SUBInst if (p0) $Rd = #0 case Hexagon::A2_tfrsi: - if (Inst.getOperand(1).isImm() && Inst.getOperand(1).getImm() == -1) { + Absolute = Inst.getOperand(1).getExpr()->evaluateAsAbsolute(Value); + if (Absolute && Value == -1) { Result.setOpcode(Hexagon::V4_SA1_setin1); addOps(Result, Inst, 0); break; // 2 1 SUBInst $Rd = #-1 @@ -1044,6 +1030,8 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII, << "\n"); bisReversable = false; } + if (HexagonMCInstrInfo::isMemReorderDisabled(MCB)) // }:mem_noshuf + bisReversable = false; // Try in order. if (isOrderedDuplexPair( diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index bf51c35..eaa3550 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -37,9 +37,7 @@ static cl::opt<unsigned> void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK, const MCSubtargetInfo &STI) { - MCInst HMI; - HMI.setOpcode(Hexagon::BUNDLE); - HMI.addOperand(MCOperand::createImm(0)); + MCInst HMI = HexagonMCInstrInfo::createBundle(); MCInst *MCB; if (MCK.getOpcode() != Hexagon::BUNDLE) { @@ -50,7 +48,7 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK, // Examines packet and pad the packet, if needed, when an // end-loop is in the bundle. - HexagonMCInstrInfo::padEndloop(*MCB); + HexagonMCInstrInfo::padEndloop(getContext(), *MCB); HexagonMCShuffle(*MCII, STI, *MCB); assert(HexagonMCInstrInfo::bundleSize(*MCB) <= HEXAGON_PACKET_SIZE); @@ -60,9 +58,9 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK, if (Extended) { if (HexagonMCInstrInfo::isDuplex(*MCII, *MCI)) { MCInst *SubInst = const_cast<MCInst *>(MCI->getOperand(1).getInst()); - HexagonMCInstrInfo::clampExtended(*MCII, *SubInst); + HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *SubInst); } else { - HexagonMCInstrInfo::clampExtended(*MCII, *MCI); + HexagonMCInstrInfo::clampExtended(*MCII, getContext(), *MCI); } Extended = false; } else { @@ -114,7 +112,7 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol, MCSection *Section = getAssembler().getContext().getELFSection( SectionName, ELF::SHT_NOBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); SwitchSection(Section); - AssignSection(Symbol, Section); + AssignFragment(Symbol, getCurrentFragment()); MCELFStreamer::EmitCommonSymbol(Symbol, Size, ByteAlignment); SwitchSection(CrntSection); diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp new file mode 100644 index 0000000..fc62626 --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp @@ -0,0 +1,49 @@ +//===-- HexagonMCExpr.cpp - Hexagon specific MC expression classes +//----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "HexagonMCExpr.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCValue.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "hexagon-mcexpr" + +HexagonNoExtendOperand *HexagonNoExtendOperand::Create(MCExpr const *Expr, + MCContext &Ctx) { + return new (Ctx) HexagonNoExtendOperand(Expr); +} + +bool HexagonNoExtendOperand::evaluateAsRelocatableImpl( + MCValue &Res, MCAsmLayout const *Layout, MCFixup const *Fixup) const { + return Expr->evaluateAsRelocatable(Res, Layout, Fixup); +} + +void HexagonNoExtendOperand::visitUsedExpr(MCStreamer &Streamer) const {} + +MCFragment *llvm::HexagonNoExtendOperand::findAssociatedFragment() const { + return Expr->findAssociatedFragment(); +} + +void HexagonNoExtendOperand::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {} + +MCExpr const *HexagonNoExtendOperand::getExpr() const { return Expr; } + +bool HexagonNoExtendOperand::classof(MCExpr const *E) { + return E->getKind() == MCExpr::Target; +} + +HexagonNoExtendOperand::HexagonNoExtendOperand(MCExpr const *Expr) + : Expr(Expr) {} + +void HexagonNoExtendOperand::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { + Expr->print(OS, MAI); +} diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h new file mode 100644 index 0000000..60f180f --- /dev/null +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h @@ -0,0 +1,35 @@ +//==- HexagonMCExpr.h - Hexagon specific MC expression classes --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H +#define LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H + +#include "llvm/MC/MCExpr.h" + +namespace llvm { +class MCInst; +class HexagonNoExtendOperand : public MCTargetExpr { +public: + static HexagonNoExtendOperand *Create(MCExpr const *Expr, MCContext &Ctx); + void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override; + bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout, + const MCFixup *Fixup) const override; + void visitUsedExpr(MCStreamer &Streamer) const override; + MCFragment *findAssociatedFragment() const override; + void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; + static bool classof(MCExpr const *E); + MCExpr const *getExpr() const; + +private: + HexagonNoExtendOperand(MCExpr const *Expr); + MCExpr const *Expr; +}; +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONMCEXPR_H diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp index 48b15f8..e684207 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp @@ -15,17 +15,37 @@ #include "Hexagon.h" #include "HexagonBaseInfo.h" +#include "HexagonMCChecker.h" #include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" namespace llvm { +void HexagonMCInstrInfo::addConstant(MCInst &MI, uint64_t Value, + MCContext &Context) { + MI.addOperand(MCOperand::createExpr(MCConstantExpr::create(Value, Context))); +} + +void HexagonMCInstrInfo::addConstExtender(MCContext &Context, + MCInstrInfo const &MCII, MCInst &MCB, + MCInst const &MCI) { + assert(HexagonMCInstrInfo::isBundle(MCB)); + MCOperand const &exOp = + MCI.getOperand(HexagonMCInstrInfo::getExtendableOp(MCII, MCI)); + + // Create the extender. + MCInst *XMCI = + new (Context) MCInst(HexagonMCInstrInfo::deriveExtender(MCII, MCI, exOp)); + + MCB.addOperand(MCOperand::createInst(XMCI)); +} + iterator_range<MCInst::const_iterator> HexagonMCInstrInfo::bundleInstructions(MCInst const &MCI) { assert(isBundle(MCI)); - return iterator_range<MCInst::const_iterator>( - MCI.begin() + bundleInstructionsOffset, MCI.end()); + return make_range(MCI.begin() + bundleInstructionsOffset, MCI.end()); } size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) { @@ -35,7 +55,40 @@ size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) { return (1); } -void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII, MCInst &MCI) { +bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII, + MCSubtargetInfo const &STI, + MCContext &Context, MCInst &MCB, + HexagonMCChecker *Check) { + // Examine the packet and convert pairs of instructions to compound + // instructions when possible. + if (!HexagonDisableCompound) + HexagonMCInstrInfo::tryCompound(MCII, Context, MCB); + // Check the bundle for errors. + bool CheckOk = Check ? Check->check() : true; + if (!CheckOk) + return false; + HexagonMCShuffle(MCII, STI, MCB); + // Examine the packet and convert pairs of instructions to duplex + // instructions when possible. 
+ MCInst InstBundlePreDuplex = MCInst(MCB); + if (!HexagonDisableDuplex) { + SmallVector<DuplexCandidate, 8> possibleDuplexes; + possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB); + HexagonMCShuffle(MCII, STI, Context, MCB, possibleDuplexes); + } + // Examines packet and pad the packet, if needed, when an + // end-loop is in the bundle. + HexagonMCInstrInfo::padEndloop(Context, MCB); + // If compounding and duplexing didn't reduce the size below + // 4 or less we have a packet that is too big. + if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE) + return false; + HexagonMCShuffle(MCII, STI, MCB); + return true; +} + +void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII, + MCContext &Context, MCInst &MCI) { assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) || HexagonMCInstrInfo::isExtended(MCII, MCI)); MCOperand &exOp = @@ -43,13 +96,20 @@ void HexagonMCInstrInfo::clampExtended(MCInstrInfo const &MCII, MCInst &MCI) { // If the extended value is a constant, then use it for the extended and // for the extender instructions, masking off the lower 6 bits and // including the assumed bits. - if (exOp.isImm()) { + int64_t Value; + if (exOp.getExpr()->evaluateAsAbsolute(Value)) { unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MCI); - int64_t Bits = exOp.getImm(); - exOp.setImm((Bits & 0x3f) << Shift); + exOp.setExpr(MCConstantExpr::create((Value & 0x3f) << Shift, Context)); } } +MCInst HexagonMCInstrInfo::createBundle() { + MCInst Result; + Result.setOpcode(Hexagon::BUNDLE); + Result.addOperand(MCOperand::createImm(0)); + return Result; +} + MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0, MCInst const &inst1) { @@ -64,6 +124,27 @@ MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass, return duplexInst; } +MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII, + MCInst const &Inst, + MCOperand const &MO) { + assert(HexagonMCInstrInfo::isExtendable(MCII, Inst) || + HexagonMCInstrInfo::isExtended(MCII, Inst)); + + MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, Inst); + MCInst XMI; + XMI.setOpcode((Desc.isBranch() || Desc.isCall() || + HexagonMCInstrInfo::getType(MCII, Inst) == HexagonII::TypeCR) + ? Hexagon::A4_ext_b + : Hexagon::A4_ext); + if (MO.isImm()) + XMI.addOperand(MCOperand::createImm(MO.getImm() & (~0x3f))); + else if (MO.isExpr()) + XMI.addOperand(MCOperand::createExpr(MO.getExpr())); + else + llvm_unreachable("invalid extendable operand"); + return XMI; +} + MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB, size_t Index) { assert(Index <= bundleSize(MCB)); @@ -76,6 +157,13 @@ MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB, return nullptr; } +void HexagonMCInstrInfo::extendIfNeeded(MCContext &Context, + MCInstrInfo const &MCII, MCInst &MCB, + MCInst const &MCI, bool MustExtend) { + if (isConstExtended(MCII, MCI) || MustExtend) + addConstExtender(Context, MCII, MCB, MCI); +} + HexagonII::MemAccessSize HexagonMCInstrInfo::getAccessSize(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; @@ -186,6 +274,25 @@ MCOperand const &HexagonMCInstrInfo::getNewValueOperand(MCInstrInfo const &MCII, return (MCO); } +/// Return the new value or the newly produced value. 
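clampExtended() and deriveExtender() above split one absolute immediate between two words: the A4_ext/A4_ext_b extender carries everything above the low six bits, while the extended instruction keeps only those six bits, shifted back to its field alignment. As plain arithmetic (a sketch with illustrative names, encoding details omitted):

  #include <cstdint>

  namespace {
  // Bits the extended instruction retains in its own encoding.
  int64_t extendedFieldBits(int64_t Value, unsigned Shift) {
    return (Value & 0x3f) << Shift;
  }

  // Bits carried by the preceding constant-extender word.
  int64_t extenderBits(int64_t Value) {
    return Value & ~int64_t(0x3f);
  }
  } // namespace

For instance, extending #0x12345 with an alignment shift of 0 leaves 0x05 in the instruction's field and 0x12340 in the extender.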
+unsigned short HexagonMCInstrInfo::getNewValueOp2(MCInstrInfo const &MCII, + MCInst const &MCI) { + const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; + return ((F >> HexagonII::NewValueOpPos2) & HexagonII::NewValueOpMask2); +} + +MCOperand const & +HexagonMCInstrInfo::getNewValueOperand2(MCInstrInfo const &MCII, + MCInst const &MCI) { + unsigned O = HexagonMCInstrInfo::getNewValueOp2(MCII, MCI); + MCOperand const &MCO = MCI.getOperand(O); + + assert((HexagonMCInstrInfo::isNewValue(MCII, MCI) || + HexagonMCInstrInfo::hasNewValue2(MCII, MCI)) && + MCO.isReg()); + return (MCO); +} + int HexagonMCInstrInfo::getSubTarget(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; @@ -242,6 +349,13 @@ bool HexagonMCInstrInfo::hasNewValue(MCInstrInfo const &MCII, return ((F >> HexagonII::hasNewValuePos) & HexagonII::hasNewValueMask); } +/// Return whether the insn produces a second value. +bool HexagonMCInstrInfo::hasNewValue2(MCInstrInfo const &MCII, + MCInst const &MCI) { + const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; + return ((F >> HexagonII::hasNewValuePos2) & HexagonII::hasNewValueMask2); +} + MCInst const &HexagonMCInstrInfo::instruction(MCInst const &MCB, size_t Index) { assert(isBundle(MCB)); assert(Index < HEXAGON_PACKET_SIZE); @@ -261,6 +375,11 @@ bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) { HexagonMCInstrInfo::getType(MCII, MCI) != HexagonII::TypeENDLOOP); } +bool HexagonMCInstrInfo::isCompound(MCInstrInfo const &MCII, + MCInst const &MCI) { + return (getType(MCII, MCI) == HexagonII::TypeCOMPOUND); +} + bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) { return ((Reg >= Hexagon::D0 && Reg <= Hexagon::D3) || (Reg >= Hexagon::D8 && Reg <= Hexagon::D11)); @@ -282,14 +401,21 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI) { if (HexagonMCInstrInfo::isExtended(MCII, MCI)) return true; - - if (!HexagonMCInstrInfo::isExtendable(MCII, MCI)) + // Branch insns are handled as necessary by relaxation. + if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeJ) || + (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCOMPOUND && + HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch()) || + (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV && + HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch())) + return false; + // Otherwise loop instructions and other CR insts are handled by relaxation + else if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCR) && + (MCI.getOpcode() != Hexagon::C4_addipc)) + return false; + else if (!HexagonMCInstrInfo::isExtendable(MCII, MCI)) return false; - short ExtOpNum = HexagonMCInstrInfo::getCExtOpNum(MCII, MCI); - int MinValue = HexagonMCInstrInfo::getMinValue(MCII, MCI); - int MaxValue = HexagonMCInstrInfo::getMaxValue(MCII, MCI); - MCOperand const &MO = MCI.getOperand(ExtOpNum); + MCOperand const &MO = HexagonMCInstrInfo::getExtendableOperand(MCII, MCI); // We could be using an instruction with an extendable immediate and shoehorn // a global address into it. If it is a global address it will be constant @@ -297,15 +423,13 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII, // We currently only handle isGlobal() because it is the only kind of // object we are going to end up with here for now. // In the future we probably should add isSymbol(), etc. 
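The rewritten isConstExtended() (concluded just below) leans on the expression machinery rather than on isImm(): branch-like and CR instructions are left to relaxation, an extendable operand that cannot be folded to an absolute constant (a global address, say) always needs the extender, and a foldable value needs it only when it falls outside the instruction's [MinValue, MaxValue] range. The decision reduced to its inputs (a standalone sketch, not the function itself):

  #include <cstdint>

  namespace {
  struct ExtCandidate {
    bool AlreadyExtended;     // instruction carries the extended flag
    bool HandledByRelaxation; // branch/loop forms fixed up later
    bool Extendable;
    bool FoldsToConstant;     // operand evaluates to an absolute value
    int64_t Value, MinValue, MaxValue;
  };

  bool needsConstExtender(const ExtCandidate &C) {
    if (C.AlreadyExtended)
      return true;
    if (C.HandledByRelaxation || !C.Extendable)
      return false;
    if (!C.FoldsToConstant)
      return true; // e.g. a global address shoehorned into the field
    return C.Value < C.MinValue || C.Value > C.MaxValue;
  }
  } // namespace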
- if (MO.isExpr()) + assert(!MO.isImm()); + int64_t Value; + if (!MO.getExpr()->evaluateAsAbsolute(Value)) return true; - - // If the extendable operand is not 'Immediate' type, the instruction should - // have 'isExtended' flag set. - assert(MO.isImm() && "Extendable operand must be Immediate type"); - - int ImmValue = MO.getImm(); - return (ImmValue < MinValue || ImmValue > MaxValue); + int MinValue = HexagonMCInstrInfo::getMinValue(MCII, MCI); + int MaxValue = HexagonMCInstrInfo::getMaxValue(MCII, MCI); + return (MinValue > Value || Value > MaxValue); } bool HexagonMCInstrInfo::isExtendable(MCInstrInfo const &MCII, @@ -374,6 +498,19 @@ bool HexagonMCInstrInfo::isPredicated(MCInstrInfo const &MCII, return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask); } +bool HexagonMCInstrInfo::isPredicateLate(MCInstrInfo const &MCII, + MCInst const &MCI) { + const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; + return (F >> HexagonII::PredicateLatePos & HexagonII::PredicateLateMask); +} + +/// Return whether the insn is newly predicated. +bool HexagonMCInstrInfo::isPredicatedNew(MCInstrInfo const &MCII, + MCInst const &MCI) { + const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; + return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask); +} + bool HexagonMCInstrInfo::isPredicatedTrue(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; @@ -394,6 +531,18 @@ bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) { return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask); } +bool HexagonMCInstrInfo::isMemReorderDisabled(MCInst const &MCI) { + assert(isBundle(MCI)); + auto Flags = MCI.getOperand(0).getImm(); + return (Flags & memReorderDisabledMask) != 0; +} + +bool HexagonMCInstrInfo::isMemStoreReorderEnabled(MCInst const &MCI) { + assert(isBundle(MCI)); + auto Flags = MCI.getOperand(0).getImm(); + return (Flags & memStoreReorderEnabledMask) != 0; +} + bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) { const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags; return ((F >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask); @@ -405,7 +554,28 @@ bool HexagonMCInstrInfo::isSoloAin1(MCInstrInfo const &MCII, return ((F >> HexagonII::SoloAin1Pos) & HexagonII::SoloAin1Mask); } -void HexagonMCInstrInfo::padEndloop(MCInst &MCB) { +bool HexagonMCInstrInfo::isVector(MCInstrInfo const &MCII, MCInst const &MCI) { + if ((getType(MCII, MCI) <= HexagonII::TypeCVI_LAST) && + (getType(MCII, MCI) >= HexagonII::TypeCVI_FIRST)) + return true; + return false; +} + +int64_t HexagonMCInstrInfo::minConstant(MCInst const &MCI, size_t Index) { + auto Sentinal = static_cast<int64_t>(std::numeric_limits<uint32_t>::max()) + << 8; + if (MCI.size() <= Index) + return Sentinal; + MCOperand const &MCO = MCI.getOperand(Index); + if (!MCO.isExpr()) + return Sentinal; + int64_t Value; + if (!MCO.getExpr()->evaluateAsAbsolute(Value)) + return Sentinal; + return Value; +} + +void HexagonMCInstrInfo::padEndloop(MCContext &Context, MCInst &MCB) { MCInst Nop; Nop.setOpcode(Hexagon::A2_nop); assert(isBundle(MCB)); @@ -413,7 +583,7 @@ void HexagonMCInstrInfo::padEndloop(MCInst &MCB) { (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_INNER_SIZE)) || ((HexagonMCInstrInfo::isOuterLoop(MCB) && (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_OUTER_SIZE)))) - MCB.addOperand(MCOperand::createInst(new MCInst(Nop))); + MCB.addOperand(MCOperand::createInst(new (Context) 
MCInst(Nop))); } bool HexagonMCInstrInfo::prefersSlot3(MCInstrInfo const &MCII, @@ -456,6 +626,20 @@ void HexagonMCInstrInfo::setInnerLoop(MCInst &MCI) { Operand.setImm(Operand.getImm() | innerLoopMask); } +void HexagonMCInstrInfo::setMemReorderDisabled(MCInst &MCI) { + assert(isBundle(MCI)); + MCOperand &Operand = MCI.getOperand(0); + Operand.setImm(Operand.getImm() | memReorderDisabledMask); + assert(isMemReorderDisabled(MCI)); +} + +void HexagonMCInstrInfo::setMemStoreReorderEnabled(MCInst &MCI) { + assert(isBundle(MCI)); + MCOperand &Operand = MCI.getOperand(0); + Operand.setImm(Operand.getImm() | memStoreReorderEnabledMask); + assert(isMemStoreReorderEnabled(MCI)); +} + void HexagonMCInstrInfo::setOuterLoop(MCInst &MCI) { assert(isBundle(MCI)); MCOperand &Operand = MCI.getOperand(0); diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h index 32d61a4..0237b28 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h @@ -14,9 +14,11 @@ #ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H #define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCINSTRINFO_H +#include "HexagonMCExpr.h" #include "llvm/MC/MCInst.h" namespace llvm { +class HexagonMCChecker; class MCContext; class MCInstrDesc; class MCInstrInfo; @@ -39,20 +41,47 @@ int64_t const innerLoopMask = 1 << innerLoopOffset; size_t const outerLoopOffset = 1; int64_t const outerLoopMask = 1 << outerLoopOffset; +// do not reorder memory load/stores by default load/stores are re-ordered +// and by default loads can be re-ordered +size_t const memReorderDisabledOffset = 2; +int64_t const memReorderDisabledMask = 1 << memReorderDisabledOffset; + +// allow re-ordering of memory stores by default stores cannot be re-ordered +size_t const memStoreReorderEnabledOffset = 3; +int64_t const memStoreReorderEnabledMask = 1 << memStoreReorderEnabledOffset; + size_t const bundleInstructionsOffset = 1; +void addConstant(MCInst &MI, uint64_t Value, MCContext &Context); +void addConstExtender(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB, + MCInst const &MCI); + // Returns a iterator range of instructions in this bundle iterator_range<MCInst::const_iterator> bundleInstructions(MCInst const &MCI); // Returns the number of instructions in the bundle size_t bundleSize(MCInst const &MCI); +// Put the packet in to canonical form, compound, duplex, pad, and shuffle +bool canonicalizePacket(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, + MCContext &Context, MCInst &MCB, + HexagonMCChecker *Checker); + // Clamp off upper 26 bits of extendable operand for emission -void clampExtended(MCInstrInfo const &MCII, MCInst &MCI); +void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI); + +MCInst createBundle(); + +// Return the extender for instruction at Index or nullptr if none +MCInst const *extenderForIndex(MCInst const &MCB, size_t Index); +void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB, + MCInst const &MCI, bool MustExtend); // Create a duplex instruction given the two subinsts MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0, MCInst const &inst1); +MCInst deriveExtender(MCInstrInfo const &MCII, MCInst const &Inst, + MCOperand const &MO); // Convert this instruction in to a duplex subinst MCInst deriveSubInst(MCInst const &Inst); @@ -108,6 +137,9 @@ unsigned short 
getNewValueOp(MCInstrInfo const &MCII, MCInst const &MCI); // Return the operand that consumes or produces a new value. MCOperand const &getNewValueOperand(MCInstrInfo const &MCII, MCInst const &MCI); +unsigned short getNewValueOp2(MCInstrInfo const &MCII, MCInst const &MCI); +MCOperand const &getNewValueOperand2(MCInstrInfo const &MCII, + MCInst const &MCI); int getSubTarget(MCInstrInfo const &MCII, MCInst const &MCI); @@ -125,6 +157,7 @@ bool hasImmExt(MCInst const &MCI); // Return whether the instruction is a legal new-value producer. bool hasNewValue(MCInstrInfo const &MCII, MCInst const &MCI); +bool hasNewValue2(MCInstrInfo const &MCII, MCInst const &MCI); // Return the instruction at Index MCInst const &instruction(MCInst const &MCB, size_t Index); @@ -134,10 +167,24 @@ bool isBundle(MCInst const &MCI); // Return whether the insn is an actual insn. bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI); +bool isCompound(MCInstrInfo const &MCII, MCInst const &MCI); // Return the duplex iclass given the two duplex classes unsigned iClassOfDuplexPair(unsigned Ga, unsigned Gb); +int64_t minConstant(MCInst const &MCI, size_t Index); +template <unsigned N, unsigned S> +bool inRange(MCInst const &MCI, size_t Index) { + return isShiftedUInt<N, S>(minConstant(MCI, Index)); +} +template <unsigned N, unsigned S> +bool inSRange(MCInst const &MCI, size_t Index) { + return isShiftedInt<N, S>(minConstant(MCI, Index)); +} +template <unsigned N> bool inRange(MCInst const &MCI, size_t Index) { + return isUInt<N>(minConstant(MCI, Index)); +} + // Return whether the instruction needs to be constant extended. bool isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI); @@ -173,6 +220,8 @@ bool isIntReg(unsigned Reg); // Is this register suitable for use in a duplex subinst bool isIntRegForSubInst(unsigned Reg); +bool isMemReorderDisabled(MCInst const &MCI); +bool isMemStoreReorderEnabled(MCInst const &MCI); // Return whether the insn is a new-value consumer. bool isNewValue(MCInstrInfo const &MCII, MCInst const &MCI); @@ -191,6 +240,8 @@ bool isOuterLoop(MCInst const &MCI); // Return whether this instruction is predicated bool isPredicated(MCInstrInfo const &MCII, MCInst const &MCI); +bool isPredicateLate(MCInstrInfo const &MCII, MCInst const &MCI); +bool isPredicatedNew(MCInstrInfo const &MCII, MCInst const &MCI); // Return whether the predicate sense is true bool isPredicatedTrue(MCInstrInfo const &MCII, MCInst const &MCI); @@ -209,9 +260,10 @@ bool isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI); /// Return whether the insn can be packaged only with an A-type insn in slot #1. 
bool isSoloAin1(MCInstrInfo const &MCII, MCInst const &MCI); +bool isVector(MCInstrInfo const &MCII, MCInst const &MCI); // Pad the bundle with nops to satisfy endloop requirements -void padEndloop(MCInst &MCI); +void padEndloop(MCContext &Context, MCInst &MCI); bool prefersSlot3(MCInstrInfo const &MCII, MCInst const &MCI); @@ -220,6 +272,8 @@ void replaceDuplex(MCContext &Context, MCInst &MCB, DuplexCandidate Candidate); // Marks a bundle as endloop0 void setInnerLoop(MCInst &MCI); +void setMemReorderDisabled(MCInst &MCI); +void setMemStoreReorderEnabled(MCInst &MCI); // Marks a bundle as endloop1 void setOuterLoop(MCInst &MCI); diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 53305d8..9a29257 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -40,6 +40,20 @@ using namespace llvm; #define GET_REGINFO_MC_DESC #include "HexagonGenRegisterInfo.inc" +cl::opt<bool> llvm::HexagonDisableCompound + ("mno-compound", + cl::desc("Disable looking for compound instructions for Hexagon")); + +cl::opt<bool> llvm::HexagonDisableDuplex + ("mno-pairing", + cl::desc("Disable looking for duplex instructions for Hexagon")); + +StringRef HEXAGON_MC::selectHexagonCPU(const Triple &TT, StringRef CPU) { + if (CPU.empty()) + CPU = "hexagonv60"; + return CPU; +} + MCInstrInfo *llvm::createHexagonMCInstrInfo() { MCInstrInfo *X = new MCInstrInfo(); InitHexagonMCInstrInfo(X); @@ -54,6 +68,7 @@ static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) { static MCSubtargetInfo * createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { + CPU = HEXAGON_MC::selectHexagonCPU(TT, CPU); return createHexagonMCSubtargetInfoImpl(TT, CPU, FS); } @@ -76,28 +91,23 @@ public: StringRef Contents(Buffer); auto PacketBundle = Contents.rsplit('\n'); auto HeadTail = PacketBundle.first.split('\n'); - auto Preamble = "\t{\n\t\t"; - auto Separator = ""; - while(!HeadTail.first.empty()) { - OS << Separator; - StringRef Inst; + StringRef Separator = "\n"; + StringRef Indent = "\t\t"; + OS << "\t{\n"; + while (!HeadTail.first.empty()) { + StringRef InstTxt; auto Duplex = HeadTail.first.split('\v'); - if(!Duplex.second.empty()){ - OS << Duplex.first << "\n"; - Inst = Duplex.second; - } - else { - if(!HeadTail.first.startswith("immext")) - Inst = Duplex.first; + if (!Duplex.second.empty()) { + OS << Indent << Duplex.first << Separator; + InstTxt = Duplex.second; + } else if (!HeadTail.first.trim().startswith("immext")) { + InstTxt = Duplex.first; } - OS << Preamble; - OS << Inst; + if (!InstTxt.empty()) + OS << Indent << InstTxt << Separator; HeadTail = HeadTail.second.split('\n'); - Preamble = ""; - Separator = "\n\t\t"; } - if(HexagonMCInstrInfo::bundleSize(Inst) != 0) - OS << "\n\t}" << PacketBundle.second; + OS << "\t}" << PacketBundle.second; } }; } @@ -154,9 +164,9 @@ static MCCodeGenInfo *createHexagonMCCodeGenInfo(const Triple &TT, CodeModel::Model CM, CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); - // For the time being, use static relocations, since there's really no - // support for PIC yet. 
- X->initMCCodeGenInfo(Reloc::Static, CM, OL); + if (RM == Reloc::Default) + RM = Reloc::Static; + X->initMCCodeGenInfo(RM, CM, OL); return X; } diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h index cb62650..a005a01 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h @@ -16,6 +16,8 @@ #include <cstdint> +#include "llvm/Support/CommandLine.h" + namespace llvm { struct InstrItinerary; struct InstrStage; @@ -33,22 +35,27 @@ class raw_ostream; class raw_pwrite_stream; extern Target TheHexagonTarget; - +extern cl::opt<bool> HexagonDisableCompound; +extern cl::opt<bool> HexagonDisableDuplex; extern const InstrStage HexagonStages[]; MCInstrInfo *createHexagonMCInstrInfo(); -MCCodeEmitter *createHexagonMCCodeEmitter(MCInstrInfo const &MCII, - MCRegisterInfo const &MRI, +MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, MCContext &MCT); -MCAsmBackend *createHexagonAsmBackend(Target const &T, - MCRegisterInfo const &MRI, +MCAsmBackend *createHexagonAsmBackend(const Target &T, + const MCRegisterInfo &MRI, const Triple &TT, StringRef CPU); MCObjectWriter *createHexagonELFObjectWriter(raw_pwrite_stream &OS, uint8_t OSABI, StringRef CPU); +namespace HEXAGON_MC { + StringRef selectHexagonCPU(const Triple &TT, StringRef CPU); +} + } // End llvm namespace // Define symbolic names for Hexagon registers. This defines a mapping from diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp index 41112ac..6ceb848 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp @@ -27,6 +27,7 @@ using namespace llvm; +namespace { // Insn shuffling priority. 
class HexagonBid { // The priority is directly proportional to how restricted the insn is based @@ -75,6 +76,7 @@ public: return false; }; }; +} // end anonymous namespace unsigned HexagonResource::setWeight(unsigned s) { const unsigned SlotWeight = 8; @@ -93,6 +95,60 @@ unsigned HexagonResource::setWeight(unsigned s) { return (Weight); } +HexagonCVIResource::TypeUnitsAndLanes *HexagonCVIResource::TUL; + +bool HexagonCVIResource::SetUp = HexagonCVIResource::setup(); + +bool HexagonCVIResource::setup() { + assert(!TUL); + TUL = new (TypeUnitsAndLanes); + + (*TUL)[HexagonII::TypeCVI_VA] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VA_DV] = UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2); + (*TUL)[HexagonII::TypeCVI_VX] = UnitsAndLanes(CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VX_DV] = UnitsAndLanes(CVI_MPY0, 2); + (*TUL)[HexagonII::TypeCVI_VP] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_VP_VS] = UnitsAndLanes(CVI_XLANE, 2); + (*TUL)[HexagonII::TypeCVI_VS] = UnitsAndLanes(CVI_SHIFT, 1); + (*TUL)[HexagonII::TypeCVI_VINLANESAT] = UnitsAndLanes(CVI_SHIFT, 1); + (*TUL)[HexagonII::TypeCVI_VM_LD] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_TMP_LD] = UnitsAndLanes(CVI_NONE, 0); + (*TUL)[HexagonII::TypeCVI_VM_CUR_LD] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_VP_LDU] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_VM_ST] = + UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1); + (*TUL)[HexagonII::TypeCVI_VM_NEW_ST] = UnitsAndLanes(CVI_NONE, 0); + (*TUL)[HexagonII::TypeCVI_VM_STU] = UnitsAndLanes(CVI_XLANE, 1); + (*TUL)[HexagonII::TypeCVI_HIST] = UnitsAndLanes(CVI_XLANE, 4); + + return true; +} + +HexagonCVIResource::HexagonCVIResource(MCInstrInfo const &MCII, unsigned s, + MCInst const *id) + : HexagonResource(s) { + unsigned T = HexagonMCInstrInfo::getType(MCII, *id); + + if (TUL->count(T)) { + // For an HVX insn. + Valid = true; + setUnits((*TUL)[T].first); + setLanes((*TUL)[T].second); + setLoad(HexagonMCInstrInfo::getDesc(MCII, *id).mayLoad()); + setStore(HexagonMCInstrInfo::getDesc(MCII, *id).mayStore()); + } else { + // For core insns. + Valid = false; + setUnits(0); + setLanes(0); + setLoad(false); + setStore(false); + } +} + HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI) : MCII(MCII), STI(STI) { @@ -107,7 +163,7 @@ void HexagonShuffler::reset() { void HexagonShuffler::append(MCInst const *ID, MCInst const *Extender, unsigned S, bool X) { - HexagonInstr PI(ID, Extender, S, X); + HexagonInstr PI(MCII, ID, Extender, S, X); Packet.push_back(PI); } @@ -126,6 +182,8 @@ bool HexagonShuffler::check() { // Number of memory operations, loads, solo loads, stores, solo stores, single // stores. unsigned memory = 0, loads = 0, load0 = 0, stores = 0, store0 = 0, store1 = 0; + // Number of HVX loads, HVX stores. + unsigned CVIloads = 0, CVIstores = 0; // Number of duplex insns, solo insns. 
unsigned duplex = 0, solo = 0; // Number of insns restricting other insns in the packet to A and X types, @@ -168,6 +226,12 @@ bool HexagonShuffler::check() { case HexagonII::TypeJ: ++jumps; break; + case HexagonII::TypeCVI_VM_VP_LDU: + ++onlyNo1; + case HexagonII::TypeCVI_VM_LD: + case HexagonII::TypeCVI_VM_TMP_LD: + case HexagonII::TypeCVI_VM_CUR_LD: + ++CVIloads; case HexagonII::TypeLD: ++loads; ++memory; @@ -176,6 +240,11 @@ bool HexagonShuffler::check() { if (HexagonMCInstrInfo::getDesc(MCII, *ID).isReturn()) ++jumps, ++jump1; // DEALLOC_RETURN is of type LD. break; + case HexagonII::TypeCVI_VM_STU: + ++onlyNo1; + case HexagonII::TypeCVI_VM_ST: + case HexagonII::TypeCVI_VM_NEW_ST: + ++CVIstores; case HexagonII::TypeST: ++stores; ++memory; @@ -203,9 +272,9 @@ bool HexagonShuffler::check() { } // Check if the packet is legal. - if ((load0 > 1 || store0 > 1) || (duplex > 1 || (duplex && memory)) || - (solo && size() > 1) || (onlyAX && neitherAnorX > 1) || - (onlyAX && xtypeFloat)) { + if ((load0 > 1 || store0 > 1 || CVIloads > 1 || CVIstores > 1) || + (duplex > 1 || (duplex && memory)) || (solo && size() > 1) || + (onlyAX && neitherAnorX > 1) || (onlyAX && xtypeFloat)) { Error = SHUFFLE_ERROR_INVALID; return false; } @@ -336,6 +405,19 @@ bool HexagonShuffler::check() { return false; } } + // Verify the CVI slot subscriptions. + { + HexagonUnitAuction AuctionCVI; + + std::sort(begin(), end(), HexagonInstr::lessCVI); + + for (iterator I = begin(); I != end(); ++I) + for (unsigned i = 0; i < I->CVI.getLanes(); ++i) // TODO: I->CVI.isValid? + if (!AuctionCVI.bid(I->CVI.getUnits() << i)) { + Error = SHUFFLE_ERROR_SLOTS; + return false; + } + } Error = SHUFFLE_SUCCESS; return true; diff --git a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h index 8b6c72e..174f10f 100644 --- a/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h +++ b/contrib/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h @@ -51,6 +51,44 @@ public: }; }; +// HVX insn resources. +class HexagonCVIResource : public HexagonResource { + typedef std::pair<unsigned, unsigned> UnitsAndLanes; + typedef llvm::DenseMap<unsigned, UnitsAndLanes> TypeUnitsAndLanes; + + // Available HVX slots. + enum { + CVI_NONE = 0, + CVI_XLANE = 1 << 0, + CVI_SHIFT = 1 << 1, + CVI_MPY0 = 1 << 2, + CVI_MPY1 = 1 << 3 + }; + + static bool SetUp; + static bool setup(); + static TypeUnitsAndLanes *TUL; + + // Count of adjacent slots that the insn requires to be executed. + unsigned Lanes; + // Flag whether the insn is a load or a store. + bool Load, Store; + // Flag whether the HVX resources are valid. + bool Valid; + + void setLanes(unsigned l) { Lanes = l; }; + void setLoad(bool f = true) { Load = f; }; + void setStore(bool f = true) { Store = f; }; + +public: + HexagonCVIResource(MCInstrInfo const &MCII, unsigned s, MCInst const *id); + + bool isValid() const { return (Valid); }; + unsigned getLanes() const { return (Lanes); }; + bool mayLoad() const { return (Load); }; + bool mayStore() const { return (Store); }; +}; + // Handle to an insn used by the shuffling algorithm. 
class HexagonInstr { friend class HexagonShuffler; @@ -58,12 +96,14 @@ class HexagonInstr { MCInst const *ID; MCInst const *Extender; HexagonResource Core; + HexagonCVIResource CVI; bool SoloException; public: - HexagonInstr(MCInst const *id, MCInst const *Extender, unsigned s, - bool x = false) - : ID(id), Extender(Extender), Core(s), SoloException(x){}; + HexagonInstr(MCInstrInfo const &MCII, MCInst const *id, + MCInst const *Extender, unsigned s, bool x = false) + : ID(id), Extender(Extender), Core(s), CVI(MCII, s, id), + SoloException(x){}; MCInst const *getDesc() const { return (ID); }; @@ -79,6 +119,10 @@ public: static bool lessCore(const HexagonInstr &A, const HexagonInstr &B) { return (HexagonResource::lessUnits(A.Core, B.Core)); }; + // Check if the handles are in ascending order by HVX slots. + static bool lessCVI(const HexagonInstr &A, const HexagonInstr &B) { + return (HexagonResource::lessUnits(A.CVI, B.CVI)); + }; }; // Bundle shuffler. @@ -108,6 +152,8 @@ public: SHUFFLE_ERROR_BRANCHES, ///< No free slots for branch insns. SHUFFLE_ERROR_NOSLOTS, ///< No free slots for other insns. SHUFFLE_ERROR_SLOTS, ///< Over-subscribed slots. + SHUFFLE_ERROR_ERRATA2, ///< Errata violation (v60). + SHUFFLE_ERROR_STORE_LOAD_CONFLICT, ///< store/load conflict SHUFFLE_ERROR_UNKNOWN ///< Unknown error. }; diff --git a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h index 70141a9..72afec1 100644 --- a/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h +++ b/contrib/llvm/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h @@ -17,8 +17,6 @@ #include "llvm/MC/MCInstPrinter.h" namespace llvm { - class MCOperand; - class MSP430InstPrinter : public MCInstPrinter { public: MSP430InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h index ff5b0b6..183dee3 100644 --- a/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h +++ b/contrib/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h @@ -17,13 +17,14 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; +class Triple; - class MSP430MCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit MSP430MCAsmInfo(const Triple &TT); - }; +class MSP430MCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit MSP430MCAsmInfo(const Triple &TT); +}; } // namespace llvm diff --git a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp index ffcf222..606abc2 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430BranchSelector.cpp @@ -64,7 +64,7 @@ bool MSP430BSel::runOnMachineFunction(MachineFunction &Fn) { unsigned FuncSize = 0; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ++MFI) { - MachineBasicBlock *MBB = MFI; + MachineBasicBlock *MBB = &*MFI; unsigned BlockSize = 0; for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end(); diff --git a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index 29bc8b3..18f38b7 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -69,10 +69,6 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, 
computeRegisterProperties(STI.getRegisterInfo()); // Provide all sorts of operation actions - - // Division is expensive - setIntDivIsCheap(false); - setStackPointerRegisterToSaveRestore(MSP430::SP); setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct? @@ -508,9 +504,10 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain, // Create the SelectionDAG nodes corresponding to a load //from this parameter SDValue FIN = DAG.getFrameIndex(FI, MVT::i16); - InVal = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + InVal = DAG.getLoad( + VA.getLocVT(), dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0); } InVals.push_back(InVal); @@ -1231,8 +1228,7 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI, } const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator I = BB; - ++I; + MachineFunction::iterator I = ++BB->getIterator(); // Create loop block MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB); @@ -1320,8 +1316,7 @@ MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // to set, the condition code register to branch on, the true/false values to // select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator I = BB; - ++I; + MachineFunction::iterator I = ++BB->getIterator(); // thisMBB: // ... diff --git a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp index 72b1780..d4f82bd 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -44,11 +44,10 @@ void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOStore, - MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); if (RC == &MSP430::GR16RegClass) BuildMI(MBB, MI, DL, get(MSP430::MOV16mr)) @@ -72,11 +71,10 @@ void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); if (RC == &MSP430::GR16RegClass) BuildMI(MBB, MI, DL, get(MSP430::MOV16rm)) diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp index 54154a8..47b0e27 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430MCInstLower.cpp @@ -50,9 +50,9 @@ GetExternalSymbolSymbol(const MachineOperand &MO) const { MCSymbol *MSP430MCInstLower:: GetJumpTableSymbol(const MachineOperand &MO) const { - const DataLayout *DL = Printer.TM.getDataLayout(); + const DataLayout 
&DL = Printer.getDataLayout(); SmallString<256> Name; - raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "JTI" + raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "JTI" << Printer.getFunctionNumber() << '_' << MO.getIndex(); @@ -67,9 +67,9 @@ GetJumpTableSymbol(const MachineOperand &MO) const { MCSymbol *MSP430MCInstLower:: GetConstantPoolIndexSymbol(const MachineOperand &MO) const { - const DataLayout *DL = Printer.TM.getDataLayout(); + const DataLayout &DL = Printer.getDataLayout(); SmallString<256> Name; - raw_svector_ostream(Name) << DL->getPrivateGlobalPrefix() << "CPI" + raw_svector_ostream(Name) << DL.getPrivateGlobalPrefix() << "CPI" << Printer.getFunctionNumber() << '_' << MO.getIndex(); diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp index 0f75399..b442fc0 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- MSP430MachineFuctionInfo.cpp - MSP430 machine function info -------===// +//===-- MSP430MachineFunctionInfo.cpp - MSP430 machine function info ------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h index fcc5f5b..2d93731 100644 --- a/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===- MSP430MachineFuctionInfo.h - MSP430 machine function info -*- C++ -*-==// +//=== MSP430MachineFunctionInfo.h - MSP430 machine function info -*- C++ -*-==// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 5107d2a..d4e061f 100644 --- a/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/contrib/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/MipsMCExpr.h" #include "MCTargetDesc/MipsMCTargetDesc.h" #include "MipsRegisterInfo.h" +#include "MipsTargetObjectFile.h" #include "MipsTargetStreamer.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/SmallVector.h" @@ -106,7 +107,6 @@ class MipsAsmParser : public MCTargetAsmParser { return static_cast<MipsTargetStreamer &>(TS); } - MCSubtargetInfo &STI; MipsABIInfo ABI; SmallVector<std::unique_ptr<MipsAssemblerOptions>, 2> AssemblerOptions; MCSymbol *CurrentFn; // Pointer to the function being parsed. It may be a @@ -114,6 +114,12 @@ class MipsAsmParser : public MCTargetAsmParser { // selected. This usually happens after an '.end func' // directive. bool IsLittleEndian; + bool IsPicEnabled; + bool IsCpRestoreSet; + int CpRestoreOffset; + unsigned CpSaveLocation; + /// If true, then CpSaveLocation is a register, otherwise it's an offset. + bool CpSaveLocationIsRegister; // Print a warning along with its fix-it message at the given range. 
void printWarningWithFixIt(const Twine &Msg, const Twine &FixMsg, @@ -141,50 +147,41 @@ class MipsAsmParser : public MCTargetAsmParser { bool ParseDirective(AsmToken DirectiveID) override; - MipsAsmParser::OperandMatchResultTy parseMemOperand(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy + OperandMatchResultTy parseMemOperand(OperandVector &Operands); + OperandMatchResultTy matchAnyRegisterNameWithoutDollar(OperandVector &Operands, StringRef Identifier, SMLoc S); - - MipsAsmParser::OperandMatchResultTy - matchAnyRegisterWithoutDollar(OperandVector &Operands, SMLoc S); - - MipsAsmParser::OperandMatchResultTy parseAnyRegister(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy parseImm(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy parseJumpTarget(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy parseInvNum(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy parseLSAImm(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy - parseRegisterPair (OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy - parseMovePRegPair(OperandVector &Operands); - - MipsAsmParser::OperandMatchResultTy - parseRegisterList (OperandVector &Operands); + OperandMatchResultTy matchAnyRegisterWithoutDollar(OperandVector &Operands, + SMLoc S); + OperandMatchResultTy parseAnyRegister(OperandVector &Operands); + OperandMatchResultTy parseImm(OperandVector &Operands); + OperandMatchResultTy parseJumpTarget(OperandVector &Operands); + OperandMatchResultTy parseInvNum(OperandVector &Operands); + OperandMatchResultTy parseLSAImm(OperandVector &Operands); + OperandMatchResultTy parseRegisterPair(OperandVector &Operands); + OperandMatchResultTy parseMovePRegPair(OperandVector &Operands); + OperandMatchResultTy parseRegisterList(OperandVector &Operands); bool searchSymbolAlias(OperandVector &Operands); bool parseOperand(OperandVector &, StringRef Mnemonic); - bool needsExpansion(MCInst &Inst); + enum MacroExpanderResultTy { + MER_NotAMacro, + MER_Success, + MER_Fail, + }; // Expands assembly pseudo instructions. - // Returns false on success, true otherwise. 
- bool expandInstruction(MCInst &Inst, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions); + MacroExpanderResultTy + tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); bool expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); bool loadImmediate(int64_t ImmValue, unsigned DstReg, unsigned SrcReg, - bool Is32BitImm, SMLoc IDLoc, + bool Is32BitImm, bool IsAddress, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); bool loadAndAddSymbolAddress(const MCExpr *SymExpr, unsigned DstReg, @@ -194,11 +191,10 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); - bool expandLoadAddressImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions); + bool expandLoadAddress(unsigned DstReg, unsigned BaseReg, + const MCOperand &Offset, bool Is32BitAddress, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); - bool expandLoadAddressReg(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions); bool expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); @@ -209,24 +205,43 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); + bool expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool expandBranchImm(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); bool expandCondBranches(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); - bool expandUlhu(MCInst &Inst, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions); + bool expandDiv(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions, const bool IsMips64, + const bool Signed); + + bool expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); bool expandUlw(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); + bool expandRotation(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool expandRotationImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool expandDRotation(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool expandDRotationImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + void createNop(bool hasShortDelaySlot, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); void createAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, SmallVectorImpl<MCInst> &Instructions); + void createCpRestoreMemOp(bool IsLoad, int StackOffset, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions); + bool reportParseError(Twine ErrorMsg); bool reportParseError(SMLoc Loc, Twine ErrorMsg); @@ -239,8 +254,11 @@ class MipsAsmParser : public MCTargetAsmParser { bool parseSetMips0Directive(); bool parseSetArchDirective(); bool parseSetFeature(uint64_t Feature); + bool isPicAndNotNxxAbi(); // Used by .cpload, .cprestore, and .cpsetup. bool parseDirectiveCpLoad(SMLoc Loc); + bool parseDirectiveCpRestore(SMLoc Loc); bool parseDirectiveCPSetup(); + bool parseDirectiveCPReturn(); bool parseDirectiveNaN(); bool parseDirectiveSet(); bool parseDirectiveOption(); @@ -337,6 +355,7 @@ class MipsAsmParser : public MCTargetAsmParser { // FeatureMipsGP64 | FeatureMips1) // Clearing Mips3 is equivalent to clear (FeatureMips3 | FeatureMips4). 
void selectArch(StringRef ArchFeature) { + MCSubtargetInfo &STI = copySTI(); FeatureBitset FeatureBits = STI.getFeatureBits(); FeatureBits &= ~MipsAssemblerOptions::AllArchRelatedMask; STI.setFeatureBits(FeatureBits); @@ -346,7 +365,8 @@ class MipsAsmParser : public MCTargetAsmParser { } void setFeatureBits(uint64_t Feature, StringRef FeatureString) { - if (!(STI.getFeatureBits()[Feature])) { + if (!(getSTI().getFeatureBits()[Feature])) { + MCSubtargetInfo &STI = copySTI(); setAvailableFeatures( ComputeAvailableFeatures(STI.ToggleFeature(FeatureString))); AssemblerOptions.back()->setFeatures(STI.getFeatureBits()); @@ -354,7 +374,8 @@ class MipsAsmParser : public MCTargetAsmParser { } void clearFeatureBits(uint64_t Feature, StringRef FeatureString) { - if (STI.getFeatureBits()[Feature]) { + if (getSTI().getFeatureBits()[Feature]) { + MCSubtargetInfo &STI = copySTI(); setAvailableFeatures( ComputeAvailableFeatures(STI.ToggleFeature(FeatureString))); AssemblerOptions.back()->setFeatures(STI.getFeatureBits()); @@ -363,26 +384,25 @@ class MipsAsmParser : public MCTargetAsmParser { void setModuleFeatureBits(uint64_t Feature, StringRef FeatureString) { setFeatureBits(Feature, FeatureString); - AssemblerOptions.front()->setFeatures(STI.getFeatureBits()); + AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits()); } void clearModuleFeatureBits(uint64_t Feature, StringRef FeatureString) { clearFeatureBits(Feature, FeatureString); - AssemblerOptions.front()->setFeatures(STI.getFeatureBits()); + AssemblerOptions.front()->setFeatures(getSTI().getFeatureBits()); } public: enum MipsMatchResultTy { - Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY + Match_RequiresDifferentSrcAndDst = FIRST_TARGET_MATCH_RESULT_TY, #define GET_OPERAND_DIAGNOSTIC_TYPES #include "MipsGenAsmMatcher.inc" #undef GET_OPERAND_DIAGNOSTIC_TYPES - }; - MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, + MipsAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), + : MCTargetAsmParser(Options, sti), ABI(MipsABIInfo::computeTargetABI(Triple(sti.getTargetTriple()), sti.getCPU(), Options)) { MCAsmParserExtension::Initialize(parser); @@ -390,15 +410,15 @@ public: parser.addAliasForDirective(".asciiz", ".asciz"); // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); - + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); + // Remember the initial assembler options. The user can not modify these. AssemblerOptions.push_back( - llvm::make_unique<MipsAssemblerOptions>(STI.getFeatureBits())); - + llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits())); + // Create an assembler options environment for the user to modify. AssemblerOptions.push_back( - llvm::make_unique<MipsAssemblerOptions>(STI.getFeatureBits())); + llvm::make_unique<MipsAssemblerOptions>(getSTI().getFeatureBits())); getTargetStreamer().updateABIInfo(*this); @@ -407,6 +427,12 @@ public: CurrentFn = nullptr; + IsPicEnabled = + (getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_); + + IsCpRestoreSet = false; + CpRestoreOffset = -1; + Triple TheTriple(sti.getTargetTriple()); if ((TheTriple.getArch() == Triple::mips) || (TheTriple.getArch() == Triple::mips64)) @@ -418,70 +444,103 @@ public: /// True if all of $fcc0 - $fcc7 exist for the current ISA. 
bool hasEightFccRegisters() const { return hasMips4() || hasMips32(); } - bool isGP64bit() const { return STI.getFeatureBits()[Mips::FeatureGP64Bit]; } - bool isFP64bit() const { return STI.getFeatureBits()[Mips::FeatureFP64Bit]; } + bool isGP64bit() const { + return getSTI().getFeatureBits()[Mips::FeatureGP64Bit]; + } + bool isFP64bit() const { + return getSTI().getFeatureBits()[Mips::FeatureFP64Bit]; + } const MipsABIInfo &getABI() const { return ABI; } bool isABI_N32() const { return ABI.IsN32(); } bool isABI_N64() const { return ABI.IsN64(); } bool isABI_O32() const { return ABI.IsO32(); } - bool isABI_FPXX() const { return STI.getFeatureBits()[Mips::FeatureFPXX]; } + bool isABI_FPXX() const { + return getSTI().getFeatureBits()[Mips::FeatureFPXX]; + } bool useOddSPReg() const { - return !(STI.getFeatureBits()[Mips::FeatureNoOddSPReg]); + return !(getSTI().getFeatureBits()[Mips::FeatureNoOddSPReg]); } bool inMicroMipsMode() const { - return STI.getFeatureBits()[Mips::FeatureMicroMips]; + return getSTI().getFeatureBits()[Mips::FeatureMicroMips]; + } + bool hasMips1() const { + return getSTI().getFeatureBits()[Mips::FeatureMips1]; + } + bool hasMips2() const { + return getSTI().getFeatureBits()[Mips::FeatureMips2]; + } + bool hasMips3() const { + return getSTI().getFeatureBits()[Mips::FeatureMips3]; + } + bool hasMips4() const { + return getSTI().getFeatureBits()[Mips::FeatureMips4]; + } + bool hasMips5() const { + return getSTI().getFeatureBits()[Mips::FeatureMips5]; } - bool hasMips1() const { return STI.getFeatureBits()[Mips::FeatureMips1]; } - bool hasMips2() const { return STI.getFeatureBits()[Mips::FeatureMips2]; } - bool hasMips3() const { return STI.getFeatureBits()[Mips::FeatureMips3]; } - bool hasMips4() const { return STI.getFeatureBits()[Mips::FeatureMips4]; } - bool hasMips5() const { return STI.getFeatureBits()[Mips::FeatureMips5]; } bool hasMips32() const { - return STI.getFeatureBits()[Mips::FeatureMips32]; + return getSTI().getFeatureBits()[Mips::FeatureMips32]; } bool hasMips64() const { - return STI.getFeatureBits()[Mips::FeatureMips64]; + return getSTI().getFeatureBits()[Mips::FeatureMips64]; } bool hasMips32r2() const { - return STI.getFeatureBits()[Mips::FeatureMips32r2]; + return getSTI().getFeatureBits()[Mips::FeatureMips32r2]; } bool hasMips64r2() const { - return STI.getFeatureBits()[Mips::FeatureMips64r2]; + return getSTI().getFeatureBits()[Mips::FeatureMips64r2]; } bool hasMips32r3() const { - return (STI.getFeatureBits()[Mips::FeatureMips32r3]); + return (getSTI().getFeatureBits()[Mips::FeatureMips32r3]); } bool hasMips64r3() const { - return (STI.getFeatureBits()[Mips::FeatureMips64r3]); + return (getSTI().getFeatureBits()[Mips::FeatureMips64r3]); } bool hasMips32r5() const { - return (STI.getFeatureBits()[Mips::FeatureMips32r5]); + return (getSTI().getFeatureBits()[Mips::FeatureMips32r5]); } bool hasMips64r5() const { - return (STI.getFeatureBits()[Mips::FeatureMips64r5]); + return (getSTI().getFeatureBits()[Mips::FeatureMips64r5]); } bool hasMips32r6() const { - return STI.getFeatureBits()[Mips::FeatureMips32r6]; + return getSTI().getFeatureBits()[Mips::FeatureMips32r6]; } bool hasMips64r6() const { - return STI.getFeatureBits()[Mips::FeatureMips64r6]; + return getSTI().getFeatureBits()[Mips::FeatureMips64r6]; } - bool hasDSP() const { return STI.getFeatureBits()[Mips::FeatureDSP]; } - bool hasDSPR2() const { return STI.getFeatureBits()[Mips::FeatureDSPR2]; } - bool hasMSA() const { return STI.getFeatureBits()[Mips::FeatureMSA]; } + bool hasDSP() const { + 
return getSTI().getFeatureBits()[Mips::FeatureDSP]; + } + bool hasDSPR2() const { + return getSTI().getFeatureBits()[Mips::FeatureDSPR2]; + } + bool hasDSPR3() const { + return getSTI().getFeatureBits()[Mips::FeatureDSPR3]; + } + bool hasMSA() const { + return getSTI().getFeatureBits()[Mips::FeatureMSA]; + } bool hasCnMips() const { - return (STI.getFeatureBits()[Mips::FeatureCnMips]); + return (getSTI().getFeatureBits()[Mips::FeatureCnMips]); + } + + bool inPicMode() { + return IsPicEnabled; } bool inMips16Mode() const { - return STI.getFeatureBits()[Mips::FeatureMips16]; + return getSTI().getFeatureBits()[Mips::FeatureMips16]; + } + + bool useTraps() const { + return getSTI().getFeatureBits()[Mips::FeatureUseTCCInDIV]; } bool useSoftFloat() const { - return STI.getFeatureBits()[Mips::FeatureSoftFloat]; + return getSTI().getFeatureBits()[Mips::FeatureSoftFloat]; } /// Warn if RegIndex is the same as the current AT. @@ -869,6 +928,16 @@ public: Inst.addOperand(MCOperand::createReg(getHWRegsReg())); } + template <unsigned Bits, int Offset = 0, int AdjustOffset = 0> + void addConstantUImmOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + uint64_t Imm = getConstantImm() - Offset; + Imm &= (1 << Bits) - 1; + Imm += Offset; + Imm += AdjustOffset; + Inst.addOperand(MCOperand::createImm(Imm)); + } + void addImmOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); const MCExpr *Expr = getImm(); @@ -878,7 +947,9 @@ public: void addMemOperands(MCInst &Inst, unsigned N) const { assert(N == 2 && "Invalid number of operands!"); - Inst.addOperand(MCOperand::createReg(getMemBase()->getGPR32Reg())); + Inst.addOperand(MCOperand::createReg(AsmParser.getABI().ArePtrs64bit() + ? getMemBase()->getGPR64Reg() + : getMemBase()->getGPR32Reg())); const MCExpr *Expr = getMemOff(); addExpr(Inst, Expr); @@ -924,10 +995,16 @@ public: bool isRegIdx() const { return Kind == k_RegisterIndex; } bool isImm() const override { return Kind == k_Immediate; } bool isConstantImm() const { - return isImm() && dyn_cast<MCConstantExpr>(getImm()); + return isImm() && isa<MCConstantExpr>(getImm()); + } + bool isConstantImmz() const { + return isConstantImm() && getConstantImm() == 0; } - template <unsigned Bits> bool isUImm() const { - return isImm() && isConstantImm() && isUInt<Bits>(getConstantImm()); + template <unsigned Bits, int Offset = 0> bool isConstantUImm() const { + return isConstantImm() && isUInt<Bits>(getConstantImm() - Offset); + } + template <unsigned Bits> bool isConstantSImm() const { + return isConstantImm() && isInt<Bits>(getConstantImm()); } bool isToken() const override { // Note: It's not possible to pretend that other operand kinds are tokens. 
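A note on the templated immediate helpers added in the hunk above (isConstantUImm<Bits, Offset>, isConstantSImm<Bits> and addConstantUImmOperands<Bits, Offset, AdjustOffset>): the predicate accepts an operand when the offset-adjusted value fits in an unsigned Bits-bit field, and the adder masks the value back into that field before re-applying the offsets. The standalone sketch below only restates that arithmetic so it can be checked in isolation; the names fitsUImm and encodeUImm are invented for this sketch and are not part of the patch.

#include <cassert>
#include <cstdint>
#include <iostream>

template <unsigned Bits, int Offset = 0>
bool fitsUImm(int64_t Imm) {
  // Mirrors isConstantUImm<Bits, Offset>(): accept when (Imm - Offset) fits
  // in an unsigned Bits-bit field.
  return static_cast<uint64_t>(Imm - Offset) < (UINT64_C(1) << Bits);
}

template <unsigned Bits, int Offset = 0, int AdjustOffset = 0>
int64_t encodeUImm(int64_t Imm) {
  // Mirrors addConstantUImmOperands: strip the offset, keep the low Bits
  // bits, then re-apply Offset plus the encoding-time AdjustOffset.
  int64_t Field = (Imm - Offset) & ((INT64_C(1) << Bits) - 1);
  return Field + Offset + AdjustOffset;
}

int main() {
  assert((fitsUImm<5>(31)) && !(fitsUImm<5>(32)));
  assert((fitsUImm<5, 1>(32)));              // 32 - 1 = 31 still fits in 5 bits
  std::cout << encodeUImm<5, 1>(32) << '\n'; // prints 32
  return 0;
}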
@@ -936,10 +1013,15 @@ public: } bool isMem() const override { return Kind == k_Memory; } bool isConstantMemOff() const { - return isMem() && dyn_cast<MCConstantExpr>(getMemOff()); + return isMem() && isa<MCConstantExpr>(getMemOff()); } template <unsigned Bits> bool isMemWithSimmOffset() const { - return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff()); + return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff()) + && getMemBase()->isGPRAsmReg(); + } + template <unsigned Bits> bool isMemWithSimmOffsetGPR() const { + return isMem() && isConstantMemOff() && isInt<Bits>(getConstantMemOff()) && + getMemBase()->isGPRAsmReg(); } bool isMemWithGRPMM16Base() const { return isMem() && getMemBase()->isMM16AsmReg(); @@ -953,13 +1035,23 @@ public: && (getConstantMemOff() % 4 == 0) && getMemBase()->isRegIdx() && (getMemBase()->getGPR32Reg() == Mips::SP); } + template <unsigned Bits, unsigned ShiftLeftAmount> + bool isScaledUImm() const { + return isConstantImm() && + isShiftedUInt<Bits, ShiftLeftAmount>(getConstantImm()); + } bool isRegList16() const { if (!isRegList()) return false; int Size = RegList.List->size(); - if (Size < 2 || Size > 5 || *RegList.List->begin() != Mips::S0 || - RegList.List->back() != Mips::RA) + if (Size < 2 || Size > 5) + return false; + + unsigned R0 = RegList.List->front(); + unsigned R1 = RegList.List->back(); + if (!((R0 == Mips::S0 && R1 == Mips::RA) || + (R0 == Mips::S0_64 && R1 == Mips::RA_64))) return false; int PrevReg = *RegList.List->begin(); @@ -1304,9 +1396,123 @@ static bool hasShortDelaySlot(unsigned Opcode) { } } +static const MCSymbol *getSingleMCSymbol(const MCExpr *Expr) { + if (const MCSymbolRefExpr *SRExpr = dyn_cast<MCSymbolRefExpr>(Expr)) { + return &SRExpr->getSymbol(); + } + + if (const MCBinaryExpr *BExpr = dyn_cast<MCBinaryExpr>(Expr)) { + const MCSymbol *LHSSym = getSingleMCSymbol(BExpr->getLHS()); + const MCSymbol *RHSSym = getSingleMCSymbol(BExpr->getRHS()); + + if (LHSSym) + return LHSSym; + + if (RHSSym) + return RHSSym; + + return nullptr; + } + + if (const MCUnaryExpr *UExpr = dyn_cast<MCUnaryExpr>(Expr)) + return getSingleMCSymbol(UExpr->getSubExpr()); + + return nullptr; +} + +static unsigned countMCSymbolRefExpr(const MCExpr *Expr) { + if (isa<MCSymbolRefExpr>(Expr)) + return 1; + + if (const MCBinaryExpr *BExpr = dyn_cast<MCBinaryExpr>(Expr)) + return countMCSymbolRefExpr(BExpr->getLHS()) + + countMCSymbolRefExpr(BExpr->getRHS()); + + if (const MCUnaryExpr *UExpr = dyn_cast<MCUnaryExpr>(Expr)) + return countMCSymbolRefExpr(UExpr->getSubExpr()); + + return 0; +} + +namespace { +void emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createReg(Reg0)); + tmpInst.addOperand(Op1); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + emitRX(Opcode, Reg0, MCOperand::createImm(Imm), IDLoc, Instructions); +} + +void emitRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + emitRX(Opcode, Reg0, MCOperand::createReg(Reg1), IDLoc, Instructions); +} + +void emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createImm(Imm1)); + tmpInst.addOperand(MCOperand::createImm(Imm2)); 
+ tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createReg(Reg0)); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1, MCOperand Op2, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + MCInst tmpInst; + tmpInst.setOpcode(Opcode); + tmpInst.addOperand(MCOperand::createReg(Reg0)); + tmpInst.addOperand(MCOperand::createReg(Reg1)); + tmpInst.addOperand(Op2); + tmpInst.setLoc(IDLoc); + Instructions.push_back(tmpInst); +} + +void emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1, unsigned Reg2, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + emitRRX(Opcode, Reg0, Reg1, MCOperand::createReg(Reg2), IDLoc, + Instructions); +} + +void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc, + Instructions); +} + +void emitAppropriateDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount, + SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + if (ShiftAmount >= 32) { + emitRRI(Mips::DSLL32, DstReg, SrcReg, ShiftAmount - 32, IDLoc, + Instructions); + return; + } + + emitRRI(Mips::DSLL, DstReg, SrcReg, ShiftAmount, IDLoc, Instructions); +} +} // end anonymous namespace. + bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode()); + bool ExpandedJalSym = false; Inst.setLoc(IDLoc); @@ -1365,12 +1571,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, return Error(IDLoc, "branch to misaligned address"); break; case Mips::BEQZ16_MM: + case Mips::BEQZC16_MMR6: case Mips::BNEZ16_MM: + case Mips::BNEZC16_MMR6: assert(MCID.getNumOperands() == 2 && "unexpected number of operands"); Offset = Inst.getOperand(1); if (!Offset.isImm()) break; // We'll deal with this situation later on when applying fixups. - if (!isIntN(8, Offset.getImm())) + if (!isInt<8>(Offset.getImm())) return Error(IDLoc, "branch target out of range"); if (OffsetToAlignment(Offset.getImm(), 2LL)) return Error(IDLoc, "branch to misaligned address"); @@ -1415,32 +1623,6 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, } break; - case Mips::CINS: - case Mips::CINS32: - case Mips::EXTS: - case Mips::EXTS32: - assert(MCID.getNumOperands() == 4 && "unexpected number of operands"); - // Check length - Opnd = Inst.getOperand(3); - if (!Opnd.isImm()) - return Error(IDLoc, "expected immediate operand kind"); - Imm = Opnd.getImm(); - if (Imm < 0 || Imm > 31) - return Error(IDLoc, "immediate operand value out of range"); - // Check position - Opnd = Inst.getOperand(2); - if (!Opnd.isImm()) - return Error(IDLoc, "expected immediate operand kind"); - Imm = Opnd.getImm(); - if (Imm < 0 || Imm > (Opcode == Mips::CINS || - Opcode == Mips::EXTS ? 63 : 31)) - return Error(IDLoc, "immediate operand value out of range"); - if (Imm > 31) { - Inst.setOpcode(Opcode == Mips::CINS ? 
Mips::CINS32 : Mips::EXTS32); - Inst.getOperand(2).setImm(Imm - 32); - } - break; - case Mips::SEQi: case Mips::SNEi: assert(MCID.getNumOperands() == 3 && "unexpected number of operands"); @@ -1454,6 +1636,81 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, } } + // This expansion is not in a function called by tryExpandInstruction() + // because the pseudo-instruction doesn't have a distinct opcode. + if ((Inst.getOpcode() == Mips::JAL || Inst.getOpcode() == Mips::JAL_MM) && + inPicMode()) { + warnIfNoMacro(IDLoc); + + const MCExpr *JalExpr = Inst.getOperand(0).getExpr(); + + // We can do this expansion if there's only 1 symbol in the argument + // expression. + if (countMCSymbolRefExpr(JalExpr) > 1) + return Error(IDLoc, "jal doesn't support multiple symbols in PIC mode"); + + // FIXME: This is checking the expression can be handled by the later stages + // of the assembler. We ought to leave it to those later stages but + // we can't do that until we stop evaluateRelocExpr() rewriting the + // expressions into non-equivalent forms. + const MCSymbol *JalSym = getSingleMCSymbol(JalExpr); + + // FIXME: Add support for label+offset operands (currently causes an error). + // FIXME: Add support for forward-declared local symbols. + // FIXME: Add expansion for when the LargeGOT option is enabled. + if (JalSym->isInSection() || JalSym->isTemporary()) { + if (isABI_O32()) { + // If it's a local symbol and the O32 ABI is being used, we expand to: + // lw $25, 0($gp) + // R_(MICRO)MIPS_GOT16 label + // addiu $25, $25, 0 + // R_(MICRO)MIPS_LO16 label + // jalr $25 + const MCExpr *Got16RelocExpr = evaluateRelocExpr(JalExpr, "got"); + const MCExpr *Lo16RelocExpr = evaluateRelocExpr(JalExpr, "lo"); + + emitRRX(Mips::LW, Mips::T9, Mips::GP, + MCOperand::createExpr(Got16RelocExpr), IDLoc, Instructions); + emitRRX(Mips::ADDiu, Mips::T9, Mips::T9, + MCOperand::createExpr(Lo16RelocExpr), IDLoc, Instructions); + } else if (isABI_N32() || isABI_N64()) { + // If it's a local symbol and the N32/N64 ABIs are being used, + // we expand to: + // lw/ld $25, 0($gp) + // R_(MICRO)MIPS_GOT_DISP label + // jalr $25 + const MCExpr *GotDispRelocExpr = evaluateRelocExpr(JalExpr, "got_disp"); + + emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP, + MCOperand::createExpr(GotDispRelocExpr), IDLoc, Instructions); + } + } else { + // If it's an external/weak symbol, we expand to: + // lw/ld $25, 0($gp) + // R_(MICRO)MIPS_CALL16 label + // jalr $25 + const MCExpr *Call16RelocExpr = evaluateRelocExpr(JalExpr, "call16"); + + emitRRX(ABI.ArePtrs64bit() ? Mips::LD : Mips::LW, Mips::T9, Mips::GP, + MCOperand::createExpr(Call16RelocExpr), IDLoc, Instructions); + } + + MCInst JalrInst; + if (IsCpRestoreSet && inMicroMipsMode()) + JalrInst.setOpcode(Mips::JALRS_MM); + else + JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR); + JalrInst.addOperand(MCOperand::createReg(Mips::RA)); + JalrInst.addOperand(MCOperand::createReg(Mips::T9)); + + // FIXME: Add an R_(MICRO)MIPS_JALR relocation after the JALR. + // This relocation is supposed to be an optimization hint for the linker + // and is not necessary for correctness. + + Inst = JalrInst; + ExpandedJalSym = true; + } + if (MCID.mayLoad() || MCID.mayStore()) { // Check the offset of memory operand, if it is a symbol // reference or immediate we may have to expand instructions. 
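The hunk ending here adds the PIC expansion of `jal symbol` directly in processInstruction(): a GOT load into $25 (and, for O32 local symbols, an addiu with the low half), followed by `jalr $25`. The sketch below only summarizes that three-way decision (O32 local, N32/N64 local, external/weak) outside the MC layer; the enum and function names are invented for this sketch, "sym" stands in for the real callee, the relocation spellings follow the relocation names used in the hunk, and the microMIPS JALR variants are ignored here.

#include <iostream>
#include <string>

enum class Abi { O32, N32, N64 };

// Returns the assembly sequence the expansion above is aiming for, with the
// callee spelled "sym" and $25 standing for $t9.
std::string expandPicJal(Abi abi, bool symbolIsLocal) {
  // Only N64 has 64-bit pointers, matching the ArePtrs64bit() check above.
  const std::string load = (abi == Abi::N64) ? "ld" : "lw";
  if (symbolIsLocal && abi == Abi::O32)
    return "lw    $25, %got(sym)($gp)\n"
           "addiu $25, $25, %lo(sym)\n"
           "jalr  $25";
  if (symbolIsLocal) // N32/N64: a single GOT_DISP load is enough.
    return load + "    $25, %got_disp(sym)($gp)\n"
                  "jalr  $25";
  // External or weak symbol: go through the CALL16 GOT entry.
  return load + "    $25, %call16(sym)($gp)\n"
                "jalr  $25";
}

int main() {
  std::cout << expandPicJal(Abi::O32, /*symbolIsLocal=*/true) << "\n\n"
            << expandPicJal(Abi::N64, /*symbolIsLocal=*/false) << '\n';
  return 0;
}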
@@ -1500,17 +1757,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, int MemOffset = Op.getImm(); MCOperand &DstReg = Inst.getOperand(0); MCOperand &BaseReg = Inst.getOperand(1); - if (isIntN(9, MemOffset) && (MemOffset % 4 == 0) && + if (isInt<9>(MemOffset) && (MemOffset % 4 == 0) && getContext().getRegisterInfo()->getRegClass( Mips::GPRMM16RegClassID).contains(DstReg.getReg()) && - BaseReg.getReg() == Mips::GP) { - MCInst TmpInst; - TmpInst.setLoc(IDLoc); - TmpInst.setOpcode(Mips::LWGP_MM); - TmpInst.addOperand(MCOperand::createReg(DstReg.getReg())); - TmpInst.addOperand(MCOperand::createReg(Mips::GP)); - TmpInst.addOperand(MCOperand::createImm(MemOffset)); - Instructions.push_back(TmpInst); + (BaseReg.getReg() == Mips::GP || + BaseReg.getReg() == Mips::GP_64)) { + + emitRRI(Mips::LWGP_MM, DstReg.getReg(), Mips::GP, MemOffset, + IDLoc, Instructions); return false; } } @@ -1597,7 +1851,14 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, if (Imm < -1 || Imm > 14) return Error(IDLoc, "immediate operand value out of range"); break; + case Mips::TEQ_MM: + case Mips::TGE_MM: + case Mips::TGEU_MM: + case Mips::TLT_MM: + case Mips::TLTU_MM: + case Mips::TNE_MM: case Mips::SB16_MM: + case Mips::SB16_MMR6: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); @@ -1607,6 +1868,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; case Mips::LHU16_MM: case Mips::SH16_MM: + case Mips::SH16_MMR6: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); @@ -1616,6 +1878,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, break; case Mips::LW16_MM: case Mips::SW16_MM: + case Mips::SW16_MMR6: Opnd = Inst.getOperand(2); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); @@ -1623,93 +1886,111 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, if (Imm < 0 || Imm > 60 || (Imm % 4 != 0)) return Error(IDLoc, "immediate operand value out of range"); break; - case Mips::CACHE: - case Mips::PREF: - Opnd = Inst.getOperand(2); - if (!Opnd.isImm()) - return Error(IDLoc, "expected immediate operand kind"); - Imm = Opnd.getImm(); - if (!isUInt<5>(Imm)) - return Error(IDLoc, "immediate operand value out of range"); - break; case Mips::ADDIUPC_MM: MCOperand Opnd = Inst.getOperand(1); if (!Opnd.isImm()) return Error(IDLoc, "expected immediate operand kind"); int Imm = Opnd.getImm(); - if ((Imm % 4 != 0) || !isIntN(25, Imm)) + if ((Imm % 4 != 0) || !isInt<25>(Imm)) return Error(IDLoc, "immediate operand value out of range"); break; } } - if (needsExpansion(Inst)) { - if (expandInstruction(Inst, IDLoc, Instructions)) - return true; - } else + MacroExpanderResultTy ExpandResult = + tryExpandInstruction(Inst, IDLoc, Instructions); + switch (ExpandResult) { + case MER_NotAMacro: Instructions.push_back(Inst); + break; + case MER_Success: + break; + case MER_Fail: + return true; + } // If this instruction has a delay slot and .set reorder is active, // emit a NOP after it. if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) createNop(hasShortDelaySlot(Inst.getOpcode()), IDLoc, Instructions); - return false; -} + if ((Inst.getOpcode() == Mips::JalOneReg || + Inst.getOpcode() == Mips::JalTwoReg || ExpandedJalSym) && + isPicAndNotNxxAbi()) { + if (IsCpRestoreSet) { + // We need a NOP between the JALR and the LW: + // If .set reorder has been used, we've already emitted a NOP. 
+ // If .set noreorder has been used, we need to emit a NOP at this point. + if (!AssemblerOptions.back()->isReorder()) + createNop(hasShortDelaySlot(Inst.getOpcode()), IDLoc, Instructions); -bool MipsAsmParser::needsExpansion(MCInst &Inst) { + // Load the $gp from the stack. + SmallVector<MCInst, 3> LoadInsts; + createCpRestoreMemOp(true /*IsLoad*/, CpRestoreOffset /*StackOffset*/, + IDLoc, LoadInsts); - switch (Inst.getOpcode()) { - case Mips::LoadImm32: - case Mips::LoadImm64: - case Mips::LoadAddrImm32: - case Mips::LoadAddrReg32: - case Mips::B_MM_Pseudo: - case Mips::LWM_MM: - case Mips::SWM_MM: - case Mips::JalOneReg: - case Mips::JalTwoReg: - case Mips::BneImm: - case Mips::BeqImm: - case Mips::BLT: - case Mips::BLE: - case Mips::BGE: - case Mips::BGT: - case Mips::BLTU: - case Mips::BLEU: - case Mips::BGEU: - case Mips::BGTU: - case Mips::Ulhu: - case Mips::Ulw: - return true; - default: - return false; + for (const MCInst &Inst : LoadInsts) + Instructions.push_back(Inst); + + } else + Warning(IDLoc, "no .cprestore used in PIC mode"); } + + return false; } -bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { +MipsAsmParser::MacroExpanderResultTy +MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { switch (Inst.getOpcode()) { - default: llvm_unreachable("unimplemented expansion"); + default: + return MER_NotAMacro; case Mips::LoadImm32: - return expandLoadImm(Inst, true, IDLoc, Instructions); + return expandLoadImm(Inst, true, IDLoc, Instructions) ? MER_Fail + : MER_Success; case Mips::LoadImm64: - return expandLoadImm(Inst, false, IDLoc, Instructions); + return expandLoadImm(Inst, false, IDLoc, Instructions) ? MER_Fail + : MER_Success; case Mips::LoadAddrImm32: - return expandLoadAddressImm(Inst, true, IDLoc, Instructions); + case Mips::LoadAddrImm64: + assert(Inst.getOperand(0).isReg() && "expected register operand kind"); + assert((Inst.getOperand(1).isImm() || Inst.getOperand(1).isExpr()) && + "expected immediate operand kind"); + + return expandLoadAddress(Inst.getOperand(0).getReg(), Mips::NoRegister, + Inst.getOperand(1), + Inst.getOpcode() == Mips::LoadAddrImm32, IDLoc, + Instructions) + ? MER_Fail + : MER_Success; case Mips::LoadAddrReg32: - return expandLoadAddressReg(Inst, true, IDLoc, Instructions); + case Mips::LoadAddrReg64: + assert(Inst.getOperand(0).isReg() && "expected register operand kind"); + assert(Inst.getOperand(1).isReg() && "expected register operand kind"); + assert((Inst.getOperand(2).isImm() || Inst.getOperand(2).isExpr()) && + "expected immediate operand kind"); + + return expandLoadAddress(Inst.getOperand(0).getReg(), + Inst.getOperand(1).getReg(), Inst.getOperand(2), + Inst.getOpcode() == Mips::LoadAddrReg32, IDLoc, + Instructions) + ? MER_Fail + : MER_Success; case Mips::B_MM_Pseudo: - return expandUncondBranchMMPseudo(Inst, IDLoc, Instructions); + case Mips::B_MMR6_Pseudo: + return expandUncondBranchMMPseudo(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; case Mips::SWM_MM: case Mips::LWM_MM: - return expandLoadStoreMultiple(Inst, IDLoc, Instructions); + return expandLoadStoreMultiple(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; case Mips::JalOneReg: case Mips::JalTwoReg: - return expandJalWithRegs(Inst, IDLoc, Instructions); + return expandJalWithRegs(Inst, IDLoc, Instructions) ? 
MER_Fail + : MER_Success; case Mips::BneImm: case Mips::BeqImm: - return expandBranchImm(Inst, IDLoc, Instructions); + return expandBranchImm(Inst, IDLoc, Instructions) ? MER_Fail : MER_Success; case Mips::BLT: case Mips::BLE: case Mips::BGE: @@ -1718,78 +1999,97 @@ bool MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc, case Mips::BLEU: case Mips::BGEU: case Mips::BGTU: - return expandCondBranches(Inst, IDLoc, Instructions); + case Mips::BLTL: + case Mips::BLEL: + case Mips::BGEL: + case Mips::BGTL: + case Mips::BLTUL: + case Mips::BLEUL: + case Mips::BGEUL: + case Mips::BGTUL: + case Mips::BLTImmMacro: + case Mips::BLEImmMacro: + case Mips::BGEImmMacro: + case Mips::BGTImmMacro: + case Mips::BLTUImmMacro: + case Mips::BLEUImmMacro: + case Mips::BGEUImmMacro: + case Mips::BGTUImmMacro: + case Mips::BLTLImmMacro: + case Mips::BLELImmMacro: + case Mips::BGELImmMacro: + case Mips::BGTLImmMacro: + case Mips::BLTULImmMacro: + case Mips::BLEULImmMacro: + case Mips::BGEULImmMacro: + case Mips::BGTULImmMacro: + return expandCondBranches(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::SDivMacro: + return expandDiv(Inst, IDLoc, Instructions, false, true) ? MER_Fail + : MER_Success; + case Mips::DSDivMacro: + return expandDiv(Inst, IDLoc, Instructions, true, true) ? MER_Fail + : MER_Success; + case Mips::UDivMacro: + return expandDiv(Inst, IDLoc, Instructions, false, false) ? MER_Fail + : MER_Success; + case Mips::DUDivMacro: + return expandDiv(Inst, IDLoc, Instructions, true, false) ? MER_Fail + : MER_Success; + case Mips::Ulh: + return expandUlh(Inst, true, IDLoc, Instructions) ? MER_Fail : MER_Success; case Mips::Ulhu: - return expandUlhu(Inst, IDLoc, Instructions); + return expandUlh(Inst, false, IDLoc, Instructions) ? MER_Fail : MER_Success; case Mips::Ulw: - return expandUlw(Inst, IDLoc, Instructions); + return expandUlw(Inst, IDLoc, Instructions) ? MER_Fail : MER_Success; + case Mips::NORImm: + return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::ADDi: + case Mips::ADDiu: + case Mips::SLTi: + case Mips::SLTiu: + if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) { + int64_t ImmValue = Inst.getOperand(2).getImm(); + if (isInt<16>(ImmValue)) + return MER_NotAMacro; + return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + } + return MER_NotAMacro; + case Mips::ANDi: + case Mips::ORi: + case Mips::XORi: + if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) { + int64_t ImmValue = Inst.getOperand(2).getImm(); + if (isUInt<16>(ImmValue)) + return MER_NotAMacro; + return expandAliasImmediate(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + } + return MER_NotAMacro; + case Mips::ROL: + case Mips::ROR: + return expandRotation(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::ROLImm: + case Mips::RORImm: + return expandRotationImm(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::DROL: + case Mips::DROR: + return expandDRotation(Inst, IDLoc, Instructions) ? MER_Fail + : MER_Success; + case Mips::DROLImm: + case Mips::DRORImm: + return expandDRotationImm(Inst, IDLoc, Instructions) ? 
MER_Fail + : MER_Success; } } -namespace { -void emitRX(unsigned Opcode, unsigned DstReg, MCOperand Imm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - MCInst tmpInst; - tmpInst.setOpcode(Opcode); - tmpInst.addOperand(MCOperand::createReg(DstReg)); - tmpInst.addOperand(Imm); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); -} - -void emitRI(unsigned Opcode, unsigned DstReg, int16_t Imm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - emitRX(Opcode, DstReg, MCOperand::createImm(Imm), IDLoc, Instructions); -} - - -void emitRRX(unsigned Opcode, unsigned DstReg, unsigned SrcReg, MCOperand Imm, - SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { - MCInst tmpInst; - tmpInst.setOpcode(Opcode); - tmpInst.addOperand(MCOperand::createReg(DstReg)); - tmpInst.addOperand(MCOperand::createReg(SrcReg)); - tmpInst.addOperand(Imm); - tmpInst.setLoc(IDLoc); - Instructions.push_back(tmpInst); -} - -void emitRRR(unsigned Opcode, unsigned DstReg, unsigned SrcReg, - unsigned SrcReg2, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - emitRRX(Opcode, DstReg, SrcReg, MCOperand::createReg(SrcReg2), IDLoc, - Instructions); -} - -void emitRRI(unsigned Opcode, unsigned DstReg, unsigned SrcReg, int16_t Imm, - SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { - emitRRX(Opcode, DstReg, SrcReg, MCOperand::createImm(Imm), IDLoc, - Instructions); -} - -template <int16_t ShiftAmount> -void createLShiftOri(MCOperand Operand, unsigned RegNo, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - if (ShiftAmount >= 32) - emitRRI(Mips::DSLL32, RegNo, RegNo, ShiftAmount - 32, IDLoc, Instructions); - else if (ShiftAmount > 0) - emitRRI(Mips::DSLL, RegNo, RegNo, ShiftAmount, IDLoc, Instructions); - - // There's no need for an ORi if the immediate is 0. - if (Operand.isImm() && Operand.getImm() == 0) - return; - - emitRRX(Mips::ORi, RegNo, RegNo, Operand, IDLoc, Instructions); -} - -template <unsigned ShiftAmount> -void createLShiftOri(int64_t Value, unsigned RegNo, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - createLShiftOri<ShiftAmount>(MCOperand::createImm(Value), RegNo, IDLoc, - Instructions); -} -} - bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { // Create a JALR instruction which is going to replace the pseudo-JAL. @@ -1800,8 +2100,11 @@ bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, if (Opcode == Mips::JalOneReg) { // jal $rs => jalr $rs - if (inMicroMipsMode()) { - JalrInst.setOpcode(Mips::JALR16_MM); + if (IsCpRestoreSet && inMicroMipsMode()) { + JalrInst.setOpcode(Mips::JALRS16_MM); + JalrInst.addOperand(FirstRegOp); + } else if (inMicroMipsMode()) { + JalrInst.setOpcode(hasMips32r6() ? Mips::JALRC16_MMR6 : Mips::JALR16_MM); JalrInst.addOperand(FirstRegOp); } else { JalrInst.setOpcode(Mips::JALR); @@ -1810,30 +2113,47 @@ bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc, } } else if (Opcode == Mips::JalTwoReg) { // jal $rd, $rs => jalr $rd, $rs - JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR); + if (IsCpRestoreSet && inMicroMipsMode()) + JalrInst.setOpcode(Mips::JALRS_MM); + else + JalrInst.setOpcode(inMicroMipsMode() ? Mips::JALR_MM : Mips::JALR); JalrInst.addOperand(FirstRegOp); const MCOperand SecondRegOp = Inst.getOperand(1); JalrInst.addOperand(SecondRegOp); } Instructions.push_back(JalrInst); - // If .set reorder is active, emit a NOP after it. 
- if (AssemblerOptions.back()->isReorder()) { - // This is a 32-bit NOP because these 2 pseudo-instructions - // do not have a short delay slot. - MCInst NopInst; - NopInst.setOpcode(Mips::SLL); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createImm(0)); - Instructions.push_back(NopInst); + // If .set reorder is active and branch instruction has a delay slot, + // emit a NOP after it. + const MCInstrDesc &MCID = getInstDesc(JalrInst.getOpcode()); + if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) { + createNop(hasShortDelaySlot(JalrInst.getOpcode()), IDLoc, Instructions); } return false; } +/// Can the value be represented by a unsigned N-bit value and a shift left? +template <unsigned N> static bool isShiftedUIntAtAnyPosition(uint64_t x) { + unsigned BitNum = findFirstSet(x); + + return (x == x >> BitNum << BitNum) && isUInt<N>(x >> BitNum); +} + +/// Load (or add) an immediate into a register. +/// +/// @param ImmValue The immediate to load. +/// @param DstReg The register that will hold the immediate. +/// @param SrcReg A register to add to the immediate or Mips::NoRegister +/// for a simple initialization. +/// @param Is32BitImm Is ImmValue 32-bit or 64-bit? +/// @param IsAddress True if the immediate represents an address. False if it +/// is an integer. +/// @param IDLoc Location of the immediate in the source file. +/// @param Instructions The instructions emitted by this expansion. bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, - unsigned SrcReg, bool Is32BitImm, SMLoc IDLoc, + unsigned SrcReg, bool Is32BitImm, + bool IsAddress, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { if (!Is32BitImm && !isGP64bit()) { Error(IDLoc, "instruction requires a 64-bit architecture"); @@ -1852,6 +2172,9 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, } } + unsigned ZeroReg = IsAddress ? ABI.GetNullPtr() : ABI.GetZeroReg(); + unsigned AdduOp = !Is32BitImm ? Mips::DADDu : Mips::ADDu; + bool UseSrcReg = false; if (SrcReg != Mips::NoRegister) UseSrcReg = true; @@ -1866,111 +2189,129 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg, TmpReg = ATReg; } - // FIXME: gas has a special case for values that are 000...1111, which - // becomes a li -1 and then a dsrl if (isInt<16>(ImmValue)) { - // li d,j => addiu d,$zero,j if (!UseSrcReg) - SrcReg = Mips::ZERO; + SrcReg = ZeroReg; + + // This doesn't quite follow the usual ABI expectations for N32 but matches + // traditional assembler behaviour. N32 would normally use addiu for both + // integers and addresses. 
+ if (IsAddress && !Is32BitImm) { + emitRRI(Mips::DADDiu, DstReg, SrcReg, ImmValue, IDLoc, Instructions); + return false; + } + emitRRI(Mips::ADDiu, DstReg, SrcReg, ImmValue, IDLoc, Instructions); - } else if (isUInt<16>(ImmValue)) { - // li d,j => ori d,$zero,j + return false; + } + + if (isUInt<16>(ImmValue)) { unsigned TmpReg = DstReg; if (SrcReg == DstReg) { - unsigned ATReg = getATReg(IDLoc); - if (!ATReg) + TmpReg = getATReg(IDLoc); + if (!TmpReg) return true; - TmpReg = ATReg; } - emitRRI(Mips::ORi, TmpReg, Mips::ZERO, ImmValue, IDLoc, Instructions); + emitRRI(Mips::ORi, TmpReg, ZeroReg, ImmValue, IDLoc, Instructions); if (UseSrcReg) - emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, Instructions); - } else if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) { + emitRRR(ABI.GetPtrAdduOp(), DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; + } + + if (isInt<32>(ImmValue) || isUInt<32>(ImmValue)) { warnIfNoMacro(IDLoc); - // For all other values which are representable as a 32-bit integer: - // li d,j => lui d,hi16(j) - // ori d,d,lo16(j) uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff; uint16_t Bits15To0 = ImmValue & 0xffff; if (!Is32BitImm && !isInt<32>(ImmValue)) { - // For DLI, expand to an ORi instead of a LUi to avoid sign-extending the + // Traditional behaviour seems to special case this particular value. It's + // not clear why other masks are handled differently. + if (ImmValue == 0xffffffff) { + emitRI(Mips::LUi, TmpReg, 0xffff, IDLoc, Instructions); + emitRRI(Mips::DSRL32, TmpReg, TmpReg, 0, IDLoc, Instructions); + if (UseSrcReg) + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; + } + + // Expand to an ORi instead of a LUi to avoid sign-extending into the // upper 32 bits. - emitRRI(Mips::ORi, TmpReg, Mips::ZERO, Bits31To16, IDLoc, Instructions); + emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits31To16, IDLoc, Instructions); emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, Instructions); - } else - emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, Instructions); - createLShiftOri<0>(Bits15To0, TmpReg, IDLoc, Instructions); + if (Bits15To0) + emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, Instructions); + if (UseSrcReg) + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; + } + emitRI(Mips::LUi, TmpReg, Bits31To16, IDLoc, Instructions); + if (Bits15To0) + emitRRI(Mips::ORi, TmpReg, TmpReg, Bits15To0, IDLoc, Instructions); if (UseSrcReg) - createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions); - - } else if ((ImmValue & (0xffffLL << 48)) == 0) { - warnIfNoMacro(IDLoc); + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; + } - // <------- lo32 ------> - // <------- hi32 ------> - // <- hi16 -> <- lo16 -> - // _________________________________ - // | | | | - // | 16-bits | 16-bits | 16-bits | - // |__________|__________|__________| - // - // For any 64-bit value that is representable as a 48-bit integer: - // li d,j => lui d,hi16(j) - // ori d,d,hi16(lo32(j)) - // dsll d,d,16 - // ori d,d,lo16(lo32(j)) - uint16_t Bits47To32 = (ImmValue >> 32) & 0xffff; - uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff; - uint16_t Bits15To0 = ImmValue & 0xffff; + if (isShiftedUIntAtAnyPosition<16>(ImmValue)) { + if (Is32BitImm) { + Error(IDLoc, "instruction requires a 32-bit immediate"); + return true; + } - emitRI(Mips::LUi, TmpReg, Bits47To32, IDLoc, Instructions); - createLShiftOri<0>(Bits31To16, TmpReg, IDLoc, Instructions); - createLShiftOri<16>(Bits15To0, TmpReg, IDLoc, Instructions); + // 
Traditionally, these immediates are shifted as little as possible and as + // such we align the most significant bit to bit 15 of our temporary. + unsigned FirstSet = findFirstSet((uint64_t)ImmValue); + unsigned LastSet = findLastSet((uint64_t)ImmValue); + unsigned ShiftAmount = FirstSet - (15 - (LastSet - FirstSet)); + uint16_t Bits = (ImmValue >> ShiftAmount) & 0xffff; + emitRRI(Mips::ORi, TmpReg, ZeroReg, Bits, IDLoc, Instructions); + emitRRI(Mips::DSLL, TmpReg, TmpReg, ShiftAmount, IDLoc, Instructions); if (UseSrcReg) - createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions); + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); - } else { - warnIfNoMacro(IDLoc); + return false; + } - // <------- hi32 ------> <------- lo32 ------> - // <- hi16 -> <- lo16 -> - // ___________________________________________ - // | | | | | - // | 16-bits | 16-bits | 16-bits | 16-bits | - // |__________|__________|__________|__________| - // - // For all other values which are representable as a 64-bit integer: - // li d,j => lui d,hi16(j) - // ori d,d,lo16(hi32(j)) - // dsll d,d,16 - // ori d,d,hi16(lo32(j)) - // dsll d,d,16 - // ori d,d,lo16(lo32(j)) - uint16_t Bits63To48 = (ImmValue >> 48) & 0xffff; - uint16_t Bits47To32 = (ImmValue >> 32) & 0xffff; - uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff; - uint16_t Bits15To0 = ImmValue & 0xffff; + warnIfNoMacro(IDLoc); - emitRI(Mips::LUi, TmpReg, Bits63To48, IDLoc, Instructions); - createLShiftOri<0>(Bits47To32, TmpReg, IDLoc, Instructions); + // The remaining case is packed with a sequence of dsll and ori with zeros + // being omitted and any neighbouring dsll's being coalesced. + // The highest 32-bit's are equivalent to a 32-bit immediate load. - // When Bits31To16 is 0, do a left shift of 32 bits instead of doing - // two left shifts of 16 bits. - if (Bits31To16 == 0) { - createLShiftOri<32>(Bits15To0, TmpReg, IDLoc, Instructions); - } else { - createLShiftOri<16>(Bits31To16, TmpReg, IDLoc, Instructions); - createLShiftOri<16>(Bits15To0, TmpReg, IDLoc, Instructions); + // Load bits 32-63 of ImmValue into bits 0-31 of the temporary register. + if (loadImmediate(ImmValue >> 32, TmpReg, Mips::NoRegister, true, false, + IDLoc, Instructions)) + return false; + + // Shift and accumulate into the register. If a 16-bit chunk is zero, then + // skip it and defer the shift to the next chunk. + unsigned ShiftCarriedForwards = 16; + for (int BitNum = 16; BitNum >= 0; BitNum -= 16) { + uint16_t ImmChunk = (ImmValue >> BitNum) & 0xffff; + + if (ImmChunk != 0) { + emitAppropriateDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, + Instructions); + emitRRI(Mips::ORi, TmpReg, TmpReg, ImmChunk, IDLoc, Instructions); + ShiftCarriedForwards = 0; } - if (UseSrcReg) - createAddu(DstReg, TmpReg, SrcReg, !Is32BitImm, Instructions); + ShiftCarriedForwards += 16; } + ShiftCarriedForwards -= 16; + + // Finish any remaining shifts left by trailing zeros. 
+ if (ShiftCarriedForwards) + emitAppropriateDSLL(TmpReg, TmpReg, ShiftCarriedForwards, IDLoc, + Instructions); + + if (UseSrcReg) + emitRRR(AdduOp, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + return false; } @@ -1982,63 +2323,38 @@ bool MipsAsmParser::expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, assert(DstRegOp.isReg() && "expected register operand kind"); if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), Mips::NoRegister, - Is32BitImm, IDLoc, Instructions)) + Is32BitImm, false, IDLoc, Instructions)) return true; return false; } -bool -MipsAsmParser::expandLoadAddressReg(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - const MCOperand &DstRegOp = Inst.getOperand(0); - assert(DstRegOp.isReg() && "expected register operand kind"); - - const MCOperand &SrcRegOp = Inst.getOperand(1); - assert(SrcRegOp.isReg() && "expected register operand kind"); - - const MCOperand &ImmOp = Inst.getOperand(2); - assert((ImmOp.isImm() || ImmOp.isExpr()) && - "expected immediate operand kind"); - if (!ImmOp.isImm()) { - if (loadAndAddSymbolAddress(ImmOp.getExpr(), DstRegOp.getReg(), - SrcRegOp.getReg(), Is32BitImm, IDLoc, - Instructions)) - return true; - - return false; - } - - if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), SrcRegOp.getReg(), - Is32BitImm, IDLoc, Instructions)) +bool MipsAsmParser::expandLoadAddress(unsigned DstReg, unsigned BaseReg, + const MCOperand &Offset, + bool Is32BitAddress, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + // la can't produce a usable address when addresses are 64-bit. + if (Is32BitAddress && ABI.ArePtrs64bit()) { + // FIXME: Demote this to a warning and continue as if we had 'dla' instead. + // We currently can't do this because we depend on the equality + // operator and N64 can end up with a GPR32/GPR64 mismatch. + Error(IDLoc, "la used to load 64-bit address"); + // Continue as if we had 'dla' instead. + Is32BitAddress = false; + } + + // dla requires 64-bit addresses. 
+ if (!Is32BitAddress && !ABI.ArePtrs64bit()) { + Error(IDLoc, "instruction requires a 64-bit architecture"); return true; - - return false; -} - -bool -MipsAsmParser::expandLoadAddressImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { - const MCOperand &DstRegOp = Inst.getOperand(0); - assert(DstRegOp.isReg() && "expected register operand kind"); - - const MCOperand &ImmOp = Inst.getOperand(1); - assert((ImmOp.isImm() || ImmOp.isExpr()) && - "expected immediate operand kind"); - if (!ImmOp.isImm()) { - if (loadAndAddSymbolAddress(ImmOp.getExpr(), DstRegOp.getReg(), - Mips::NoRegister, Is32BitImm, IDLoc, - Instructions)) - return true; - - return false; } - if (loadImmediate(ImmOp.getImm(), DstRegOp.getReg(), Mips::NoRegister, - Is32BitImm, IDLoc, Instructions)) - return true; + if (!Offset.isImm()) + return loadAndAddSymbolAddress(Offset.getExpr(), DstReg, BaseReg, + Is32BitAddress, IDLoc, Instructions); - return false; + return loadImmediate(Offset.getImm(), DstReg, BaseReg, Is32BitAddress, true, + IDLoc, Instructions); } bool MipsAsmParser::loadAndAddSymbolAddress( @@ -2046,67 +2362,102 @@ bool MipsAsmParser::loadAndAddSymbolAddress( SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { warnIfNoMacro(IDLoc); - if (Is32BitSym && isABI_N64()) - Warning(IDLoc, "instruction loads the 32-bit address of a 64-bit symbol"); - - MCInst tmpInst; - const MCSymbolRefExpr *Symbol = cast<MCSymbolRefExpr>(SymExpr); - const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::create( - &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_ABS_HI, getContext()); - const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::create( - &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_ABS_LO, getContext()); + const MCExpr *Symbol = cast<MCExpr>(SymExpr); + const MipsMCExpr *HiExpr = MipsMCExpr::create( + MCSymbolRefExpr::VK_Mips_ABS_HI, Symbol, getContext()); + const MipsMCExpr *LoExpr = MipsMCExpr::create( + MCSymbolRefExpr::VK_Mips_ABS_LO, Symbol, getContext()); bool UseSrcReg = SrcReg != Mips::NoRegister; + // This is the 64-bit symbol address expansion. + if (ABI.ArePtrs64bit() && isGP64bit()) { + // We always need AT for the 64-bit expansion. + // If it is not available we exit. 
+ unsigned ATReg = getATReg(IDLoc); + if (!ATReg) + return true; + + const MipsMCExpr *HighestExpr = MipsMCExpr::create( + MCSymbolRefExpr::VK_Mips_HIGHEST, Symbol, getContext()); + const MipsMCExpr *HigherExpr = MipsMCExpr::create( + MCSymbolRefExpr::VK_Mips_HIGHER, Symbol, getContext()); + + if (UseSrcReg && (DstReg == SrcReg)) { + // If $rs is the same as $rd: + // (d)la $rd, sym($rd) => lui $at, %highest(sym) + // daddiu $at, $at, %higher(sym) + // dsll $at, $at, 16 + // daddiu $at, $at, %hi(sym) + // dsll $at, $at, 16 + // daddiu $at, $at, %lo(sym) + // daddu $rd, $at, $rd + emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc, + Instructions); + emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HigherExpr), + IDLoc, Instructions); + emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, Instructions); + emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr), IDLoc, + Instructions); + emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, Instructions); + emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc, + Instructions); + emitRRR(Mips::DADDu, DstReg, ATReg, SrcReg, IDLoc, Instructions); + + return false; + } + + // Otherwise, if the $rs is different from $rd or if $rs isn't specified: + // (d)la $rd, sym/sym($rs) => lui $rd, %highest(sym) + // lui $at, %hi(sym) + // daddiu $rd, $rd, %higher(sym) + // daddiu $at, $at, %lo(sym) + // dsll32 $rd, $rd, 0 + // daddu $rd, $rd, $at + // (daddu $rd, $rd, $rs) + emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc, + Instructions); + emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, + Instructions); + emitRRX(Mips::DADDiu, DstReg, DstReg, MCOperand::createExpr(HigherExpr), + IDLoc, Instructions); + emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr), IDLoc, + Instructions); + emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, Instructions); + emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, Instructions); + if (UseSrcReg) + emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, Instructions); + + return false; + } + + // And now, the 32-bit symbol address expansion: + // If $rs is the same as $rd: + // (d)la $rd, sym($rd) => lui $at, %hi(sym) + // ori $at, $at, %lo(sym) + // addu $rd, $at, $rd + // Otherwise, if the $rs is different from $rd or if $rs isn't specified: + // (d)la $rd, sym/sym($rs) => lui $rd, %hi(sym) + // ori $rd, $rd, %lo(sym) + // (addu $rd, $rd, $rs) unsigned TmpReg = DstReg; if (UseSrcReg && (DstReg == SrcReg)) { - // At this point we need AT to perform the expansions and we exit if it is - // not available. + // If $rs is the same as $rd, we need to use AT. + // If it is not available we exit. 
unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; TmpReg = ATReg; } - if (!Is32BitSym) { - // If it's a 64-bit architecture, expand to: - // la d,sym => lui d,highest(sym) - // ori d,d,higher(sym) - // dsll d,d,16 - // ori d,d,hi16(sym) - // dsll d,d,16 - // ori d,d,lo16(sym) - const MCSymbolRefExpr *HighestExpr = MCSymbolRefExpr::create( - &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_HIGHEST, getContext()); - const MCSymbolRefExpr *HigherExpr = MCSymbolRefExpr::create( - &Symbol->getSymbol(), MCSymbolRefExpr::VK_Mips_HIGHER, getContext()); - - tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createExpr(HighestExpr)); - Instructions.push_back(tmpInst); - - createLShiftOri<0>(MCOperand::createExpr(HigherExpr), TmpReg, SMLoc(), - Instructions); - createLShiftOri<16>(MCOperand::createExpr(HiExpr), TmpReg, SMLoc(), - Instructions); - createLShiftOri<16>(MCOperand::createExpr(LoExpr), TmpReg, SMLoc(), - Instructions); - } else { - // Otherwise, expand to: - // la d,sym => lui d,hi16(sym) - // ori d,d,lo16(sym) - tmpInst.setOpcode(Mips::LUi); - tmpInst.addOperand(MCOperand::createReg(TmpReg)); - tmpInst.addOperand(MCOperand::createExpr(HiExpr)); - Instructions.push_back(tmpInst); - - emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), SMLoc(), - Instructions); - } + emitRX(Mips::LUi, TmpReg, MCOperand::createExpr(HiExpr), IDLoc, Instructions); + emitRRX(Mips::ADDiu, TmpReg, TmpReg, MCOperand::createExpr(LoExpr), IDLoc, + Instructions); if (UseSrcReg) - createAddu(DstReg, TmpReg, SrcReg, !Is32BitSym, Instructions); + emitRRR(Mips::ADDu, DstReg, TmpReg, SrcReg, IDLoc, Instructions); + else + assert(DstReg == TmpReg); return false; } @@ -2125,12 +2476,13 @@ bool MipsAsmParser::expandUncondBranchMMPseudo( Inst.addOperand(MCOperand::createExpr(Offset.getExpr())); } else { assert(Offset.isImm() && "expected immediate operand kind"); - if (isIntN(11, Offset.getImm())) { + if (isInt<11>(Offset.getImm())) { // If offset fits into 11 bits then this instruction becomes microMIPS // 16-bit unconditional branch instruction. - Inst.setOpcode(Mips::B16_MM); + if (inMicroMipsMode()) + Inst.setOpcode(hasMips32r6() ? Mips::BC16_MMR6 : Mips::B16_MM); } else { - if (!isIntN(17, Offset.getImm())) + if (!isInt<17>(Offset.getImm())) Error(IDLoc, "branch target out of range"); if (OffsetToAlignment(Offset.getImm(), 1LL << 1)) Error(IDLoc, "branch to misaligned address"); @@ -2143,8 +2495,10 @@ bool MipsAsmParser::expandUncondBranchMMPseudo( } Instructions.push_back(Inst); - // If .set reorder is active, emit a NOP after the branch instruction. - if (AssemblerOptions.back()->isReorder()) + // If .set reorder is active and branch instruction has a delay slot, + // emit a NOP after it. 
+ const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode()); + if (MCID.hasDelaySlot() && AssemblerOptions.back()->isReorder()) createNop(true, IDLoc, Instructions); return false; @@ -2175,30 +2529,21 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, } int64_t ImmValue = ImmOp.getImm(); - if (ImmValue == 0) { - MCInst BranchInst; - BranchInst.setOpcode(OpCode); - BranchInst.addOperand(DstRegOp); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MemOffsetOp); - Instructions.push_back(BranchInst); - } else { + if (ImmValue == 0) + emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc, + Instructions); + else { warnIfNoMacro(IDLoc); unsigned ATReg = getATReg(IDLoc); if (!ATReg) return true; - if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), IDLoc, - Instructions)) + if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, !isGP64bit(), true, + IDLoc, Instructions)) return true; - MCInst BranchInst; - BranchInst.setOpcode(OpCode); - BranchInst.addOperand(DstRegOp); - BranchInst.addOperand(MCOperand::createReg(ATReg)); - BranchInst.addOperand(MemOffsetOp); - Instructions.push_back(BranchInst); + emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, Instructions); } return false; } @@ -2206,7 +2551,6 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions, bool isLoad, bool isImmOpnd) { - MCInst TempInst; unsigned ImmOffset, HiOffset, LoOffset; const MCExpr *ExprOffset; unsigned TmpRegNum; @@ -2227,8 +2571,6 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, HiOffset++; } else ExprOffset = Inst.getOperand(2).getExpr(); - // All instructions will have the same location. - TempInst.setLoc(IDLoc); // These are some of the types of expansions we perform here: // 1) lw $8, sym => lui $8, %hi(sym) // lw $8, %lo(sym)($8) @@ -2267,40 +2609,20 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, return; } - TempInst.setOpcode(Mips::LUi); - TempInst.addOperand(MCOperand::createReg(TmpRegNum)); - if (isImmOpnd) - TempInst.addOperand(MCOperand::createImm(HiOffset)); - else { - const MCExpr *HiExpr = evaluateRelocExpr(ExprOffset, "hi"); - TempInst.addOperand(MCOperand::createExpr(HiExpr)); - } - // Add the instruction to the list. - Instructions.push_back(TempInst); - // Prepare TempInst for next instruction. - TempInst.clear(); + emitRX(Mips::LUi, TmpRegNum, + isImmOpnd ? MCOperand::createImm(HiOffset) + : MCOperand::createExpr(evaluateRelocExpr(ExprOffset, "hi")), + IDLoc, Instructions); // Add temp register to base. - if (BaseRegNum != Mips::ZERO) { - TempInst.setOpcode(Mips::ADDu); - TempInst.addOperand(MCOperand::createReg(TmpRegNum)); - TempInst.addOperand(MCOperand::createReg(TmpRegNum)); - TempInst.addOperand(MCOperand::createReg(BaseRegNum)); - Instructions.push_back(TempInst); - TempInst.clear(); - } + if (BaseRegNum != Mips::ZERO) + emitRRR(Mips::ADDu, TmpRegNum, TmpRegNum, BaseRegNum, IDLoc, Instructions); // And finally, create original instruction with low part // of offset and new base. 
- TempInst.setOpcode(Inst.getOpcode()); - TempInst.addOperand(MCOperand::createReg(RegOpNum)); - TempInst.addOperand(MCOperand::createReg(TmpRegNum)); - if (isImmOpnd) - TempInst.addOperand(MCOperand::createImm(LoOffset)); - else { - const MCExpr *LoExpr = evaluateRelocExpr(ExprOffset, "lo"); - TempInst.addOperand(MCOperand::createExpr(LoExpr)); - } - Instructions.push_back(TempInst); - TempInst.clear(); + emitRRX(Inst.getOpcode(), RegOpNum, TmpRegNum, + isImmOpnd + ? MCOperand::createImm(LoOffset) + : MCOperand::createExpr(evaluateRelocExpr(ExprOffset, "lo")), + IDLoc, Instructions); } bool @@ -2316,10 +2638,16 @@ MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, if (OpNum < 8 && Inst.getOperand(OpNum - 1).getImm() <= 60 && Inst.getOperand(OpNum - 1).getImm() >= 0 && - Inst.getOperand(OpNum - 2).getReg() == Mips::SP && - Inst.getOperand(OpNum - 3).getReg() == Mips::RA) + (Inst.getOperand(OpNum - 2).getReg() == Mips::SP || + Inst.getOperand(OpNum - 2).getReg() == Mips::SP_64) && + (Inst.getOperand(OpNum - 3).getReg() == Mips::RA || + Inst.getOperand(OpNum - 3).getReg() == Mips::RA_64)) { // It can be implemented as SWM16 or LWM16 instruction. - NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MM : Mips::LWM16_MM; + if (inMicroMipsMode() && hasMips32r6()) + NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MMR6 : Mips::LWM16_MMR6; + else + NewOpcode = Opcode == Mips::SWM_MM ? Mips::SWM16_MM : Mips::LWM16_MM; + } Inst.setOpcode(NewOpcode); Instructions.push_back(Inst); @@ -2328,44 +2656,126 @@ MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { + bool EmittedNoMacroWarning = false; unsigned PseudoOpcode = Inst.getOpcode(); unsigned SrcReg = Inst.getOperand(0).getReg(); - unsigned TrgReg = Inst.getOperand(1).getReg(); + const MCOperand &TrgOp = Inst.getOperand(1); const MCExpr *OffsetExpr = Inst.getOperand(2).getExpr(); unsigned ZeroSrcOpcode, ZeroTrgOpcode; - bool ReverseOrderSLT, IsUnsigned, AcceptsEquality; + bool ReverseOrderSLT, IsUnsigned, IsLikely, AcceptsEquality; + + unsigned TrgReg; + if (TrgOp.isReg()) + TrgReg = TrgOp.getReg(); + else if (TrgOp.isImm()) { + warnIfNoMacro(IDLoc); + EmittedNoMacroWarning = true; + + TrgReg = getATReg(IDLoc); + if (!TrgReg) + return true; + + switch(PseudoOpcode) { + default: + llvm_unreachable("unknown opcode for branch pseudo-instruction"); + case Mips::BLTImmMacro: + PseudoOpcode = Mips::BLT; + break; + case Mips::BLEImmMacro: + PseudoOpcode = Mips::BLE; + break; + case Mips::BGEImmMacro: + PseudoOpcode = Mips::BGE; + break; + case Mips::BGTImmMacro: + PseudoOpcode = Mips::BGT; + break; + case Mips::BLTUImmMacro: + PseudoOpcode = Mips::BLTU; + break; + case Mips::BLEUImmMacro: + PseudoOpcode = Mips::BLEU; + break; + case Mips::BGEUImmMacro: + PseudoOpcode = Mips::BGEU; + break; + case Mips::BGTUImmMacro: + PseudoOpcode = Mips::BGTU; + break; + case Mips::BLTLImmMacro: + PseudoOpcode = Mips::BLTL; + break; + case Mips::BLELImmMacro: + PseudoOpcode = Mips::BLEL; + break; + case Mips::BGELImmMacro: + PseudoOpcode = Mips::BGEL; + break; + case Mips::BGTLImmMacro: + PseudoOpcode = Mips::BGTL; + break; + case Mips::BLTULImmMacro: + PseudoOpcode = Mips::BLTUL; + break; + case Mips::BLEULImmMacro: + PseudoOpcode = Mips::BLEUL; + break; + case Mips::BGEULImmMacro: + PseudoOpcode = Mips::BGEUL; + break; + case Mips::BGTULImmMacro: + PseudoOpcode = Mips::BGTUL; + break; + } + + if (loadImmediate(TrgOp.getImm(), TrgReg, 
Mips::NoRegister, !isGP64bit(), + false, IDLoc, Instructions)) + return true; + } switch (PseudoOpcode) { case Mips::BLT: case Mips::BLTU: + case Mips::BLTL: + case Mips::BLTUL: AcceptsEquality = false; ReverseOrderSLT = false; - IsUnsigned = (PseudoOpcode == Mips::BLTU); + IsUnsigned = ((PseudoOpcode == Mips::BLTU) || (PseudoOpcode == Mips::BLTUL)); + IsLikely = ((PseudoOpcode == Mips::BLTL) || (PseudoOpcode == Mips::BLTUL)); ZeroSrcOpcode = Mips::BGTZ; ZeroTrgOpcode = Mips::BLTZ; break; case Mips::BLE: case Mips::BLEU: + case Mips::BLEL: + case Mips::BLEUL: AcceptsEquality = true; ReverseOrderSLT = true; - IsUnsigned = (PseudoOpcode == Mips::BLEU); + IsUnsigned = ((PseudoOpcode == Mips::BLEU) || (PseudoOpcode == Mips::BLEUL)); + IsLikely = ((PseudoOpcode == Mips::BLEL) || (PseudoOpcode == Mips::BLEUL)); ZeroSrcOpcode = Mips::BGEZ; ZeroTrgOpcode = Mips::BLEZ; break; case Mips::BGE: case Mips::BGEU: + case Mips::BGEL: + case Mips::BGEUL: AcceptsEquality = true; ReverseOrderSLT = false; - IsUnsigned = (PseudoOpcode == Mips::BGEU); + IsUnsigned = ((PseudoOpcode == Mips::BGEU) || (PseudoOpcode == Mips::BGEUL)); + IsLikely = ((PseudoOpcode == Mips::BGEL) || (PseudoOpcode == Mips::BGEUL)); ZeroSrcOpcode = Mips::BLEZ; ZeroTrgOpcode = Mips::BGEZ; break; case Mips::BGT: case Mips::BGTU: + case Mips::BGTL: + case Mips::BGTUL: AcceptsEquality = false; ReverseOrderSLT = true; - IsUnsigned = (PseudoOpcode == Mips::BGTU); + IsUnsigned = ((PseudoOpcode == Mips::BGTU) || (PseudoOpcode == Mips::BGTUL)); + IsLikely = ((PseudoOpcode == Mips::BGTL) || (PseudoOpcode == Mips::BGTUL)); ZeroSrcOpcode = Mips::BLTZ; ZeroTrgOpcode = Mips::BGTZ; break; @@ -2373,7 +2783,6 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, llvm_unreachable("unknown opcode for branch pseudo-instruction"); } - MCInst BranchInst; bool IsTrgRegZero = (TrgReg == Mips::ZERO); bool IsSrcRegZero = (SrcReg == Mips::ZERO); if (IsSrcRegZero && IsTrgRegZero) { @@ -2381,51 +2790,37 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, // with GAS' behaviour. However, they may not generate the most efficient // code in some circumstances. 
if (PseudoOpcode == Mips::BLT) { - BranchInst.setOpcode(Mips::BLTZ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(Mips::BLTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); return false; } if (PseudoOpcode == Mips::BLE) { - BranchInst.setOpcode(Mips::BLEZ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(Mips::BLEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); Warning(IDLoc, "branch is always taken"); return false; } if (PseudoOpcode == Mips::BGE) { - BranchInst.setOpcode(Mips::BGEZ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(Mips::BGEZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); Warning(IDLoc, "branch is always taken"); return false; } if (PseudoOpcode == Mips::BGT) { - BranchInst.setOpcode(Mips::BGTZ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(Mips::BGTZ, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); return false; } if (PseudoOpcode == Mips::BGTU) { - BranchInst.setOpcode(Mips::BNE); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRX(Mips::BNE, Mips::ZERO, Mips::ZERO, + MCOperand::createExpr(OffsetExpr), IDLoc, Instructions); return false; } if (AcceptsEquality) { // If both registers are $0 and the pseudo-branch accepts equality, it // will always be taken, so we emit an unconditional branch. - BranchInst.setOpcode(Mips::BEQ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO, + MCOperand::createExpr(OffsetExpr), IDLoc, Instructions); Warning(IDLoc, "branch is always taken"); return false; } @@ -2449,11 +2844,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, // the pseudo-branch will always be taken, so we emit an unconditional // branch. // This only applies to unsigned pseudo-branches. - BranchInst.setOpcode(Mips::BEQ); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRX(Mips::BEQ, Mips::ZERO, Mips::ZERO, + MCOperand::createExpr(OffsetExpr), IDLoc, Instructions); Warning(IDLoc, "branch is always taken"); return false; } @@ -2470,21 +2862,17 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, // // Because only BLEU and BGEU branch on equality, we can use the // AcceptsEquality variable to decide when to emit the BEQZ. - BranchInst.setOpcode(AcceptsEquality ? Mips::BEQ : Mips::BNE); - BranchInst.addOperand( - MCOperand::createReg(IsSrcRegZero ? 
TrgReg : SrcReg)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRX(AcceptsEquality ? Mips::BEQ : Mips::BNE, + IsSrcRegZero ? TrgReg : SrcReg, Mips::ZERO, + MCOperand::createExpr(OffsetExpr), IDLoc, Instructions); return false; } // If we have a signed pseudo-branch and one of the registers is $0, // we can use an appropriate compare-to-zero branch. We select which one // to use in the switch statement above. - BranchInst.setOpcode(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode); - BranchInst.addOperand(MCOperand::createReg(IsSrcRegZero ? TrgReg : SrcReg)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRX(IsSrcRegZero ? ZeroSrcOpcode : ZeroTrgOpcode, + IsSrcRegZero ? TrgReg : SrcReg, MCOperand::createExpr(OffsetExpr), + IDLoc, Instructions); return false; } @@ -2494,7 +2882,8 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, if (!ATRegNum) return true; - warnIfNoMacro(IDLoc); + if (!EmittedNoMacroWarning) + warnIfNoMacro(IDLoc); // SLT fits well with 2 of our 4 pseudo-branches: // BLT, where $rs < $rt, translates into "slt $at, $rs, $rt" and @@ -2511,23 +2900,135 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc, // // The same applies to the unsigned variants, except that SLTu is used // instead of SLT. - MCInst SetInst; - SetInst.setOpcode(IsUnsigned ? Mips::SLTu : Mips::SLT); - SetInst.addOperand(MCOperand::createReg(ATRegNum)); - SetInst.addOperand(MCOperand::createReg(ReverseOrderSLT ? TrgReg : SrcReg)); - SetInst.addOperand(MCOperand::createReg(ReverseOrderSLT ? SrcReg : TrgReg)); - Instructions.push_back(SetInst); - - BranchInst.setOpcode(AcceptsEquality ? Mips::BEQ : Mips::BNE); - BranchInst.addOperand(MCOperand::createReg(ATRegNum)); - BranchInst.addOperand(MCOperand::createReg(Mips::ZERO)); - BranchInst.addOperand(MCOperand::createExpr(OffsetExpr)); - Instructions.push_back(BranchInst); + emitRRR(IsUnsigned ? Mips::SLTu : Mips::SLT, ATRegNum, + ReverseOrderSLT ? TrgReg : SrcReg, ReverseOrderSLT ? SrcReg : TrgReg, + IDLoc, Instructions); + + emitRRX(IsLikely ? (AcceptsEquality ? Mips::BEQL : Mips::BNEL) + : (AcceptsEquality ? Mips::BEQ : Mips::BNE), + ATRegNum, Mips::ZERO, MCOperand::createExpr(OffsetExpr), IDLoc, + Instructions); return false; } -bool MipsAsmParser::expandUlhu(MCInst &Inst, SMLoc IDLoc, - SmallVectorImpl<MCInst> &Instructions) { +bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions, + const bool IsMips64, const bool Signed) { + if (hasMips32r6()) { + Error(IDLoc, "instruction not supported on mips32r6 or mips64r6"); + return false; + } + + warnIfNoMacro(IDLoc); + + const MCOperand &RsRegOp = Inst.getOperand(0); + assert(RsRegOp.isReg() && "expected register operand kind"); + unsigned RsReg = RsRegOp.getReg(); + + const MCOperand &RtRegOp = Inst.getOperand(1); + assert(RtRegOp.isReg() && "expected register operand kind"); + unsigned RtReg = RtRegOp.getReg(); + unsigned DivOp; + unsigned ZeroReg; + + if (IsMips64) { + DivOp = Signed ? Mips::DSDIV : Mips::DUDIV; + ZeroReg = Mips::ZERO_64; + } else { + DivOp = Signed ? 
Mips::SDIV : Mips::UDIV; + ZeroReg = Mips::ZERO; + } + + bool UseTraps = useTraps(); + + if (RsReg == Mips::ZERO || RsReg == Mips::ZERO_64) { + if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) + Warning(IDLoc, "dividing zero by zero"); + if (IsMips64) { + if (Signed && (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)) { + if (UseTraps) { + emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions); + return false; + } + + emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions); + return false; + } + } else { + emitRR(DivOp, RsReg, RtReg, IDLoc, Instructions); + return false; + } + } + + if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) { + Warning(IDLoc, "division by zero"); + if (Signed) { + if (UseTraps) { + emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions); + return false; + } + + emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions); + return false; + } + } + + // FIXME: The values for these two BranchTarget variables may be different in + // micromips. These magic numbers need to be removed. + unsigned BranchTargetNoTraps; + unsigned BranchTarget; + + if (UseTraps) { + BranchTarget = IsMips64 ? 12 : 8; + emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, Instructions); + } else { + BranchTarget = IsMips64 ? 20 : 16; + BranchTargetNoTraps = 8; + // Branch to the li instruction. + emitRRI(Mips::BNE, RtReg, ZeroReg, BranchTargetNoTraps, IDLoc, + Instructions); + } + + emitRR(DivOp, RsReg, RtReg, IDLoc, Instructions); + + if (!UseTraps) + emitII(Mips::BREAK, 0x7, 0, IDLoc, Instructions); + + if (!Signed) { + emitR(Mips::MFLO, RsReg, IDLoc, Instructions); + return false; + } + + unsigned ATReg = getATReg(IDLoc); + if (!ATReg) + return true; + + emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, Instructions); + if (IsMips64) { + // Branch to the mflo instruction. + emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, Instructions); + emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, Instructions); + emitRRI(Mips::DSLL32, ATReg, ATReg, 0x1f, IDLoc, Instructions); + } else { + // Branch to the mflo instruction. + emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, Instructions); + emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, Instructions); + } + + if (UseTraps) + emitRRI(Mips::TEQ, RsReg, ATReg, 0x6, IDLoc, Instructions); + else { + // Branch to the mflo instruction. + emitRRI(Mips::BNE, RsReg, ATReg, BranchTargetNoTraps, IDLoc, Instructions); + emitRRI(Mips::SLL, ZeroReg, ZeroReg, 0, IDLoc, Instructions); + emitII(Mips::BREAK, 0x6, 0, IDLoc, Instructions); + } + emitR(Mips::MFLO, RsReg, IDLoc, Instructions); + return false; +} + +bool MipsAsmParser::expandUlh(MCInst &Inst, bool Signed, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { if (hasMips32r6() || hasMips64r6()) { Error(IDLoc, "instruction not supported on mips32r6 or mips64r6"); return false; @@ -2562,7 +3063,7 @@ bool MipsAsmParser::expandUlhu(MCInst &Inst, SMLoc IDLoc, LoadedOffsetInAT = true; if (loadImmediate(OffsetValue, ATReg, Mips::NoRegister, !ABI.ArePtrs64bit(), - IDLoc, Instructions)) + true, IDLoc, Instructions)) return true; // NOTE: We do this (D)ADDu here instead of doing it in loadImmediate() @@ -2590,33 +3091,15 @@ bool MipsAsmParser::expandUlhu(MCInst &Inst, SMLoc IDLoc, unsigned SllReg = LoadedOffsetInAT ? 
DstReg : ATReg; - MCInst TmpInst; - TmpInst.setOpcode(Mips::LBu); - TmpInst.addOperand(MCOperand::createReg(FirstLbuDstReg)); - TmpInst.addOperand(MCOperand::createReg(LbuSrcReg)); - TmpInst.addOperand(MCOperand::createImm(FirstLbuOffset)); - Instructions.push_back(TmpInst); - - TmpInst.clear(); - TmpInst.setOpcode(Mips::LBu); - TmpInst.addOperand(MCOperand::createReg(SecondLbuDstReg)); - TmpInst.addOperand(MCOperand::createReg(LbuSrcReg)); - TmpInst.addOperand(MCOperand::createImm(SecondLbuOffset)); - Instructions.push_back(TmpInst); - - TmpInst.clear(); - TmpInst.setOpcode(Mips::SLL); - TmpInst.addOperand(MCOperand::createReg(SllReg)); - TmpInst.addOperand(MCOperand::createReg(SllReg)); - TmpInst.addOperand(MCOperand::createImm(8)); - Instructions.push_back(TmpInst); - - TmpInst.clear(); - TmpInst.setOpcode(Mips::OR); - TmpInst.addOperand(MCOperand::createReg(DstReg)); - TmpInst.addOperand(MCOperand::createReg(DstReg)); - TmpInst.addOperand(MCOperand::createReg(ATReg)); - Instructions.push_back(TmpInst); + emitRRI(Signed ? Mips::LB : Mips::LBu, FirstLbuDstReg, LbuSrcReg, + FirstLbuOffset, IDLoc, Instructions); + + emitRRI(Mips::LBu, SecondLbuDstReg, LbuSrcReg, SecondLbuOffset, IDLoc, + Instructions); + + emitRRI(Mips::SLL, SllReg, SllReg, 8, IDLoc, Instructions); + + emitRRR(Mips::OR, DstReg, DstReg, ATReg, IDLoc, Instructions); return false; } @@ -2654,7 +3137,7 @@ bool MipsAsmParser::expandUlw(MCInst &Inst, SMLoc IDLoc, warnIfNoMacro(IDLoc); if (loadImmediate(OffsetValue, ATReg, Mips::NoRegister, !ABI.ArePtrs64bit(), - IDLoc, Instructions)) + true, IDLoc, Instructions)) return true; // NOTE: We do this (D)ADDu here instead of doing it in loadImmediate() @@ -2677,37 +3160,373 @@ bool MipsAsmParser::expandUlw(MCInst &Inst, SMLoc IDLoc, RightLoadOffset = LoadedOffsetInAT ? 
3 : (OffsetValue + 3); } - MCInst LeftLoadInst; - LeftLoadInst.setOpcode(Mips::LWL); - LeftLoadInst.addOperand(DstRegOp); - LeftLoadInst.addOperand(MCOperand::createReg(FinalSrcReg)); - LeftLoadInst.addOperand(MCOperand::createImm(LeftLoadOffset)); - Instructions.push_back(LeftLoadInst); + emitRRI(Mips::LWL, DstRegOp.getReg(), FinalSrcReg, LeftLoadOffset, IDLoc, + Instructions); - MCInst RightLoadInst; - RightLoadInst.setOpcode(Mips::LWR); - RightLoadInst.addOperand(DstRegOp); - RightLoadInst.addOperand(MCOperand::createReg(FinalSrcReg)); - RightLoadInst.addOperand(MCOperand::createImm(RightLoadOffset )); - Instructions.push_back(RightLoadInst); + emitRRI(Mips::LWR, DstRegOp.getReg(), FinalSrcReg, RightLoadOffset, IDLoc, + Instructions); return false; } +bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + + assert (Inst.getNumOperands() == 3 && "Invalid operand count"); + assert (Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isImm() && "Invalid instruction operand."); + + unsigned ATReg = Mips::NoRegister; + unsigned FinalDstReg = Mips::NoRegister; + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); + int64_t ImmValue = Inst.getOperand(2).getImm(); + + bool Is32Bit = isInt<32>(ImmValue) || isUInt<32>(ImmValue); + + unsigned FinalOpcode = Inst.getOpcode(); + + if (DstReg == SrcReg) { + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + FinalDstReg = DstReg; + DstReg = ATReg; + } + + if (!loadImmediate(ImmValue, DstReg, Mips::NoRegister, Is32Bit, false, Inst.getLoc(), Instructions)) { + switch (FinalOpcode) { + default: + llvm_unreachable("unimplemented expansion"); + case (Mips::ADDi): + FinalOpcode = Mips::ADD; + break; + case (Mips::ADDiu): + FinalOpcode = Mips::ADDu; + break; + case (Mips::ANDi): + FinalOpcode = Mips::AND; + break; + case (Mips::NORImm): + FinalOpcode = Mips::NOR; + break; + case (Mips::ORi): + FinalOpcode = Mips::OR; + break; + case (Mips::SLTi): + FinalOpcode = Mips::SLT; + break; + case (Mips::SLTiu): + FinalOpcode = Mips::SLTu; + break; + case (Mips::XORi): + FinalOpcode = Mips::XOR; + break; + } + + if (FinalDstReg == Mips::NoRegister) + emitRRR(FinalOpcode, DstReg, DstReg, SrcReg, IDLoc, Instructions); + else + emitRRR(FinalOpcode, FinalDstReg, FinalDstReg, DstReg, IDLoc, + Instructions); + return false; + } + return true; +} + +bool MipsAsmParser::expandRotation(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + unsigned ATReg = Mips::NoRegister; + unsigned DReg = Inst.getOperand(0).getReg(); + unsigned SReg = Inst.getOperand(1).getReg(); + unsigned TReg = Inst.getOperand(2).getReg(); + unsigned TmpReg = DReg; + + unsigned FirstShift = Mips::NOP; + unsigned SecondShift = Mips::NOP; + + if (hasMips32r2()) { + + if (DReg == SReg) { + TmpReg = getATReg(Inst.getLoc()); + if (!TmpReg) + return true; + } + + if (Inst.getOpcode() == Mips::ROL) { + emitRRR(Mips::SUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions); + emitRRR(Mips::ROTRV, DReg, SReg, TmpReg, Inst.getLoc(), Instructions); + return false; + } + + if (Inst.getOpcode() == Mips::ROR) { + emitRRR(Mips::ROTRV, DReg, SReg, TReg, Inst.getLoc(), Instructions); + return false; + } + + return true; + } + + if (hasMips32()) { + + switch (Inst.getOpcode()) { + default: + llvm_unreachable("unexpected instruction opcode"); + case Mips::ROL: + FirstShift = Mips::SRLV; + SecondShift = Mips::SLLV; + break; + case Mips::ROR: + FirstShift = 
Mips::SLLV; + SecondShift = Mips::SRLV; + break; + } + + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + + emitRRR(Mips::SUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions); + emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), Instructions); + emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), Instructions); + emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions); + + return false; + } + + return true; +} + +bool MipsAsmParser::expandRotationImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + + unsigned ATReg = Mips::NoRegister; + unsigned DReg = Inst.getOperand(0).getReg(); + unsigned SReg = Inst.getOperand(1).getReg(); + int64_t ImmValue = Inst.getOperand(2).getImm(); + + unsigned FirstShift = Mips::NOP; + unsigned SecondShift = Mips::NOP; + + if (hasMips32r2()) { + + if (Inst.getOpcode() == Mips::ROLImm) { + uint64_t MaxShift = 32; + uint64_t ShiftValue = ImmValue; + if (ImmValue != 0) + ShiftValue = MaxShift - ImmValue; + emitRRI(Mips::ROTR, DReg, SReg, ShiftValue, Inst.getLoc(), Instructions); + return false; + } + + if (Inst.getOpcode() == Mips::RORImm) { + emitRRI(Mips::ROTR, DReg, SReg, ImmValue, Inst.getLoc(), Instructions); + return false; + } + + return true; + } + + if (hasMips32()) { + + if (ImmValue == 0) { + emitRRI(Mips::SRL, DReg, SReg, 0, Inst.getLoc(), Instructions); + return false; + } + + switch (Inst.getOpcode()) { + default: + llvm_unreachable("unexpected instruction opcode"); + case Mips::ROLImm: + FirstShift = Mips::SLL; + SecondShift = Mips::SRL; + break; + case Mips::RORImm: + FirstShift = Mips::SRL; + SecondShift = Mips::SLL; + break; + } + + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + + emitRRI(FirstShift, ATReg, SReg, ImmValue, Inst.getLoc(), Instructions); + emitRRI(SecondShift, DReg, SReg, 32 - ImmValue, Inst.getLoc(), Instructions); + emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions); + + return false; + } + + return true; +} + +bool MipsAsmParser::expandDRotation(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + + unsigned ATReg = Mips::NoRegister; + unsigned DReg = Inst.getOperand(0).getReg(); + unsigned SReg = Inst.getOperand(1).getReg(); + unsigned TReg = Inst.getOperand(2).getReg(); + unsigned TmpReg = DReg; + + unsigned FirstShift = Mips::NOP; + unsigned SecondShift = Mips::NOP; + + if (hasMips64r2()) { + + if (TmpReg == SReg) { + TmpReg = getATReg(Inst.getLoc()); + if (!TmpReg) + return true; + } + + if (Inst.getOpcode() == Mips::DROL) { + emitRRR(Mips::DSUBu, TmpReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions); + emitRRR(Mips::DROTRV, DReg, SReg, TmpReg, Inst.getLoc(), Instructions); + return false; + } + + if (Inst.getOpcode() == Mips::DROR) { + emitRRR(Mips::DROTRV, DReg, SReg, TReg, Inst.getLoc(), Instructions); + return false; + } + + return true; + } + + if (hasMips64()) { + + switch (Inst.getOpcode()) { + default: + llvm_unreachable("unexpected instruction opcode"); + case Mips::DROL: + FirstShift = Mips::DSRLV; + SecondShift = Mips::DSLLV; + break; + case Mips::DROR: + FirstShift = Mips::DSLLV; + SecondShift = Mips::DSRLV; + break; + } + + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + + emitRRR(Mips::DSUBu, ATReg, Mips::ZERO, TReg, Inst.getLoc(), Instructions); + emitRRR(FirstShift, ATReg, SReg, ATReg, Inst.getLoc(), Instructions); + emitRRR(SecondShift, DReg, SReg, TReg, Inst.getLoc(), Instructions); + emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions); + + return false; + } + + 
return true; +} + +bool MipsAsmParser::expandDRotationImm(MCInst &Inst, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + + unsigned ATReg = Mips::NoRegister; + unsigned DReg = Inst.getOperand(0).getReg(); + unsigned SReg = Inst.getOperand(1).getReg(); + int64_t ImmValue = Inst.getOperand(2).getImm() % 64; + + unsigned FirstShift = Mips::NOP; + unsigned SecondShift = Mips::NOP; + + MCInst TmpInst; + + if (hasMips64r2()) { + + unsigned FinalOpcode = Mips::NOP; + if (ImmValue == 0) + FinalOpcode = Mips::DROTR; + else if (ImmValue % 32 == 0) + FinalOpcode = Mips::DROTR32; + else if ((ImmValue >= 1) && (ImmValue <= 32)) { + if (Inst.getOpcode() == Mips::DROLImm) + FinalOpcode = Mips::DROTR32; + else + FinalOpcode = Mips::DROTR; + } else if (ImmValue >= 33) { + if (Inst.getOpcode() == Mips::DROLImm) + FinalOpcode = Mips::DROTR; + else + FinalOpcode = Mips::DROTR32; + } + + uint64_t ShiftValue = ImmValue % 32; + if (Inst.getOpcode() == Mips::DROLImm) + ShiftValue = (32 - ImmValue % 32) % 32; + + emitRRI(FinalOpcode, DReg, SReg, ShiftValue, Inst.getLoc(), Instructions); + + return false; + } + + if (hasMips64()) { + + if (ImmValue == 0) { + emitRRI(Mips::DSRL, DReg, SReg, 0, Inst.getLoc(), Instructions); + return false; + } + + switch (Inst.getOpcode()) { + default: + llvm_unreachable("unexpected instruction opcode"); + case Mips::DROLImm: + if ((ImmValue >= 1) && (ImmValue <= 31)) { + FirstShift = Mips::DSLL; + SecondShift = Mips::DSRL32; + } + if (ImmValue == 32) { + FirstShift = Mips::DSLL32; + SecondShift = Mips::DSRL32; + } + if ((ImmValue >= 33) && (ImmValue <= 63)) { + FirstShift = Mips::DSLL32; + SecondShift = Mips::DSRL; + } + break; + case Mips::DRORImm: + if ((ImmValue >= 1) && (ImmValue <= 31)) { + FirstShift = Mips::DSRL; + SecondShift = Mips::DSLL32; + } + if (ImmValue == 32) { + FirstShift = Mips::DSRL32; + SecondShift = Mips::DSLL32; + } + if ((ImmValue >= 33) && (ImmValue <= 63)) { + FirstShift = Mips::DSRL32; + SecondShift = Mips::DSLL; + } + break; + } + + ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + + emitRRI(FirstShift, ATReg, SReg, ImmValue % 32, Inst.getLoc(), Instructions); + emitRRI(SecondShift, DReg, SReg, (32 - ImmValue % 32) % 32, Inst.getLoc(), Instructions); + emitRRR(Mips::OR, DReg, DReg, ATReg, Inst.getLoc(), Instructions); + + return false; + } + + return true; +} + void MipsAsmParser::createNop(bool hasShortDelaySlot, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions) { - MCInst NopInst; - if (hasShortDelaySlot) { - NopInst.setOpcode(Mips::MOVE16_MM); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - } else { - NopInst.setOpcode(Mips::SLL); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createReg(Mips::ZERO)); - NopInst.addOperand(MCOperand::createImm(0)); - } - Instructions.push_back(NopInst); + if (hasShortDelaySlot) + emitRR(Mips::MOVE16_MM, Mips::ZERO, Mips::ZERO, IDLoc, Instructions); + else + emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, Instructions); } void MipsAsmParser::createAddu(unsigned DstReg, unsigned SrcReg, @@ -2717,6 +3536,24 @@ void MipsAsmParser::createAddu(unsigned DstReg, unsigned SrcReg, Instructions); } +void MipsAsmParser::createCpRestoreMemOp( + bool IsLoad, int StackOffset, SMLoc IDLoc, + SmallVectorImpl<MCInst> &Instructions) { + // If the offset can not fit into 16 bits, we need to expand. + if (!isInt<16>(StackOffset)) { + MCInst MemInst; + MemInst.setOpcode(IsLoad ? 
Mips::LW : Mips::SW); + MemInst.addOperand(MCOperand::createReg(Mips::GP)); + MemInst.addOperand(MCOperand::createReg(Mips::SP)); + MemInst.addOperand(MCOperand::createImm(StackOffset)); + expandMemInst(MemInst, IDLoc, Instructions, IsLoad, true /*HasImmOpnd*/); + return; + } + + emitRRI(IsLoad ? Mips::LW : Mips::SW, Mips::GP, Mips::SP, StackOffset, IDLoc, + Instructions); +} + unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) { // As described by the Mips32r2 spec, the registers Rd and Rs for // jalr.hb must be different. @@ -2729,6 +3566,17 @@ unsigned MipsAsmParser::checkTargetMatchPredicate(MCInst &Inst) { return Match_Success; } +static SMLoc RefineErrorLoc(const SMLoc Loc, const OperandVector &Operands, + uint64_t ErrorInfo) { + if (ErrorInfo != ~0ULL && ErrorInfo < Operands.size()) { + SMLoc ErrorLoc = Operands[ErrorInfo]->getStartLoc(); + if (ErrorLoc == SMLoc()) + return Loc; + return ErrorLoc; + } + return Loc; +} + bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -2745,7 +3593,7 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (processInstruction(Inst, IDLoc, Instructions)) return true; for (unsigned i = 0; i < Instructions.size(); i++) - Out.EmitInstruction(Instructions[i], STI); + Out.EmitInstruction(Instructions[i], getSTI()); return false; } case Match_MissingFeature: @@ -2757,7 +3605,7 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, if (ErrorInfo >= Operands.size()) return Error(IDLoc, "too few operands for instruction"); - ErrorLoc = ((MipsOperand &)*Operands[ErrorInfo]).getStartLoc(); + ErrorLoc = Operands[ErrorInfo]->getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; } @@ -2768,6 +3616,58 @@ bool MipsAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, "invalid instruction"); case Match_RequiresDifferentSrcAndDst: return Error(IDLoc, "source and destination must be different"); + case Match_Immz: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), "expected '0'"); + case Match_UImm1_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 1-bit unsigned immediate"); + case Match_UImm2_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 2-bit unsigned immediate"); + case Match_UImm2_1: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range 1 .. 4"); + case Match_UImm3_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 3-bit unsigned immediate"); + case Match_UImm4_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 4-bit unsigned immediate"); + case Match_UImm5_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 5-bit unsigned immediate"); + case Match_UImm5_1: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range 1 .. 32"); + case Match_UImm5_32: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range 32 .. 63"); + case Match_UImm5_33: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected immediate in range 33 .. 64"); + case Match_UImm5_0_Report_UImm6: + // This is used on UImm5 operands that have a corresponding UImm5_32 + // operand to avoid confusing the user. 
+ return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 6-bit unsigned immediate"); + case Match_UImm5_Lsl2: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected both 7-bit unsigned immediate and multiple of 4"); + case Match_UImm6_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 6-bit unsigned immediate"); + case Match_SImm6: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 6-bit signed immediate"); + case Match_UImm7_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 7-bit unsigned immediate"); + case Match_UImm8_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 8-bit unsigned immediate"); + case Match_UImm10_0: + return Error(RefineErrorLoc(IDLoc, Operands, ErrorInfo), + "expected 10-bit unsigned immediate"); } llvm_unreachable("Implement any new match types added!"); @@ -3264,7 +4164,7 @@ MipsAsmParser::parseMemOperand(OperandVector &Operands) { const AsmToken &Tok = Parser.getTok(); // Get the next token. if (Tok.isNot(AsmToken::LParen)) { MipsOperand &Mnemonic = static_cast<MipsOperand &>(*Operands[0]); - if (Mnemonic.getToken() == "la") { + if (Mnemonic.getToken() == "la" || Mnemonic.getToken() == "dla") { SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1); Operands.push_back(MipsOperand::CreateImm(IdVal, S, E, *this)); @@ -3598,12 +4498,15 @@ MipsAsmParser::parseRegisterList(OperandVector &Operands) { if (RegRange) { // Remove last register operand because registers from register range // should be inserted first. - if (RegNo == Mips::RA) { + if ((isGP64bit() && RegNo == Mips::RA_64) || + (!isGP64bit() && RegNo == Mips::RA)) { Regs.push_back(RegNo); } else { unsigned TmpReg = PrevReg + 1; while (TmpReg <= RegNo) { - if ((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) { + if ((((TmpReg < Mips::S0) || (TmpReg > Mips::S7)) && !isGP64bit()) || + (((TmpReg < Mips::S0_64) || (TmpReg > Mips::S7_64)) && + isGP64bit())) { Error(E, "invalid register operand"); return MatchOperand_ParseFail; } @@ -3615,16 +4518,23 @@ MipsAsmParser::parseRegisterList(OperandVector &Operands) { RegRange = false; } else { - if ((PrevReg == Mips::NoRegister) && (RegNo != Mips::S0) && - (RegNo != Mips::RA)) { + if ((PrevReg == Mips::NoRegister) && + ((isGP64bit() && (RegNo != Mips::S0_64) && (RegNo != Mips::RA_64)) || + (!isGP64bit() && (RegNo != Mips::S0) && (RegNo != Mips::RA)))) { Error(E, "$16 or $31 expected"); return MatchOperand_ParseFail; - } else if (((RegNo < Mips::S0) || (RegNo > Mips::S7)) && - (RegNo != Mips::FP) && (RegNo != Mips::RA)) { + } else if (!(((RegNo == Mips::FP || RegNo == Mips::RA || + (RegNo >= Mips::S0 && RegNo <= Mips::S7)) && + !isGP64bit()) || + ((RegNo == Mips::FP_64 || RegNo == Mips::RA_64 || + (RegNo >= Mips::S0_64 && RegNo <= Mips::S7_64)) && + isGP64bit()))) { Error(E, "invalid register operand"); return MatchOperand_ParseFail; } else if ((PrevReg != Mips::NoRegister) && (RegNo != PrevReg + 1) && - (RegNo != Mips::FP) && (RegNo != Mips::RA)) { + ((RegNo != Mips::FP && RegNo != Mips::RA && !isGP64bit()) || + (RegNo != Mips::FP_64 && RegNo != Mips::RA_64 && + isGP64bit()))) { Error(E, "consecutive register numbers expected"); return MatchOperand_ParseFail; } @@ -4152,6 +5062,7 @@ bool MipsAsmParser::parseSetPopDirective() { if (AssemblerOptions.size() == 2) return reportParseError(Loc, ".set pop with no .set push"); + MCSubtargetInfo &STI = copySTI(); AssemblerOptions.pop_back(); setAvailableFeatures( 
ComputeAvailableFeatures(AssemblerOptions.back()->getFeatures())); @@ -4225,6 +5136,7 @@ bool MipsAsmParser::parseSetMips0Directive() { return reportParseError("unexpected token, expected end of statement"); // Reset assembler options to their initial values. + MCSubtargetInfo &STI = copySTI(); setAvailableFeatures( ComputeAvailableFeatures(AssemblerOptions.front()->getFeatures())); STI.setFeatureBits(AssemblerOptions.front()->getFeatures()); @@ -4366,6 +5278,14 @@ bool MipsAsmParser::eatComma(StringRef ErrorStr) { return true; } +// Used to determine if .cpload, .cprestore, and .cpsetup have any effect. +// In this class, it is only used for .cprestore. +// FIXME: Only keep track of IsPicEnabled in one place, instead of in both +// MipsTargetELFStreamer and MipsAsmParser. +bool MipsAsmParser::isPicAndNotNxxAbi() { + return inPicMode() && !(isABI_N32() || isABI_N64()); +} + bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) { if (AssemblerOptions.back()->isReorder()) Warning(Loc, ".cpload should be inside a noreorder section"); @@ -4398,6 +5318,54 @@ bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) { return false; } +bool MipsAsmParser::parseDirectiveCpRestore(SMLoc Loc) { + MCAsmParser &Parser = getParser(); + + // Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it + // is used in non-PIC mode. + + if (inMips16Mode()) { + reportParseError(".cprestore is not supported in Mips16 mode"); + return false; + } + + // Get the stack offset value. + const MCExpr *StackOffset; + int64_t StackOffsetVal; + if (Parser.parseExpression(StackOffset)) { + reportParseError("expected stack offset value"); + return false; + } + + if (!StackOffset->evaluateAsAbsolute(StackOffsetVal)) { + reportParseError("stack offset is not an absolute expression"); + return false; + } + + if (StackOffsetVal < 0) { + Warning(Loc, ".cprestore with negative stack offset has no effect"); + IsCpRestoreSet = false; + } else { + IsCpRestoreSet = true; + CpRestoreOffset = StackOffsetVal; + } + + // If this is not the end of the statement, report an error. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token, expected end of statement"); + return false; + } + + // Store the $gp on the stack. + SmallVector<MCInst, 3> StoreInsts; + createCpRestoreMemOp(false /*IsLoad*/, CpRestoreOffset /*StackOffset*/, Loc, + StoreInsts); + + getTargetStreamer().emitDirectiveCpRestore(StoreInsts, CpRestoreOffset); + Parser.Lex(); // Consume the EndOfStatement. 
+ return false; +} + bool MipsAsmParser::parseDirectiveCPSetup() { MCAsmParser &Parser = getParser(); unsigned FuncReg; @@ -4427,16 +5395,19 @@ bool MipsAsmParser::parseDirectiveCPSetup() { ResTy = parseAnyRegister(TmpReg); if (ResTy == MatchOperand_NoMatch) { - const AsmToken &Tok = Parser.getTok(); - if (Tok.is(AsmToken::Integer)) { - Save = Tok.getIntVal(); - SaveIsReg = false; - Parser.Lex(); - } else { - reportParseError("expected save register or stack offset"); + const MCExpr *OffsetExpr; + int64_t OffsetVal; + SMLoc ExprLoc = getLexer().getLoc(); + + if (Parser.parseExpression(OffsetExpr) || + !OffsetExpr->evaluateAsAbsolute(OffsetVal)) { + reportParseError(ExprLoc, "expected save register or stack offset"); Parser.eatToEndOfStatement(); return false; } + + Save = OffsetVal; + SaveIsReg = false; } else { MipsOperand &SaveOpnd = static_cast<MipsOperand &>(*TmpReg[0]); if (!SaveOpnd.isGPRAsmReg()) { @@ -4462,11 +5433,20 @@ bool MipsAsmParser::parseDirectiveCPSetup() { } const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr); + CpSaveLocation = Save; + CpSaveLocationIsRegister = SaveIsReg; + getTargetStreamer().emitDirectiveCpsetup(FuncReg, Save, Ref->getSymbol(), SaveIsReg); return false; } +bool MipsAsmParser::parseDirectiveCPReturn() { + getTargetStreamer().emitDirectiveCpreturn(CpSaveLocation, + CpSaveLocationIsRegister); + return false; +} + bool MipsAsmParser::parseDirectiveNaN() { MCAsmParser &Parser = getParser(); if (getLexer().isNot(AsmToken::EndOfStatement)) { @@ -4655,6 +5635,9 @@ bool MipsAsmParser::parseDirectiveOption() { StringRef Option = Tok.getIdentifier(); if (Option == "pic0") { + // MipsAsmParser needs to know if the current PIC mode changes. + IsPicEnabled = false; + getTargetStreamer().emitDirectiveOptionPic0(); Parser.Lex(); if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { @@ -4666,6 +5649,9 @@ bool MipsAsmParser::parseDirectiveOption() { } if (Option == "pic2") { + // MipsAsmParser needs to know if the current PIC mode changes. 
+ IsPicEnabled = true; + getTargetStreamer().emitDirectiveOptionPic2(); Parser.Lex(); if (Parser.getTok().isNot(AsmToken::EndOfStatement)) { @@ -4924,6 +5910,8 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".cpload") return parseDirectiveCpLoad(DirectiveID.getLoc()); + if (IDVal == ".cprestore") + return parseDirectiveCpRestore(DirectiveID.getLoc()); if (IDVal == ".dword") { parseDataDirective(8, DirectiveID.getLoc()); return false; @@ -4974,6 +5962,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { getTargetStreamer().emitDirectiveEnt(*Sym); CurrentFn = Sym; + IsCpRestoreSet = false; return false; } @@ -5002,6 +5991,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { getTargetStreamer().emitDirectiveEnd(SymbolName); CurrentFn = nullptr; + IsCpRestoreSet = false; return false; } @@ -5073,6 +6063,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { getTargetStreamer().emitFrame(StackReg, FrameSizeVal, ReturnRegOpnd.getGPR32Reg()); + IsCpRestoreSet = false; return false; } @@ -5173,6 +6164,9 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".cpsetup") return parseDirectiveCPSetup(); + if (IDVal == ".cpreturn") + return parseDirectiveCPReturn(); + if (IDVal == ".module") return parseDirectiveModule(); diff --git a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp index a34ba3b..3c1a771 100644 --- a/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp +++ b/contrib/llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp @@ -229,6 +229,13 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, uint64_t Address, const void *Decoder); +// DecodeBranchTarget26MM - Decode microMIPS branch offset, which is +// shifted left by 1 bit. +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder); + // DecodeJumpTargetMM - Decode microMIPS jump target, which is // shifted left by 1 bit. 
static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, @@ -241,17 +248,42 @@ static DecodeStatus DecodeMem(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMemEVA(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeLoadByte9(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeLoadByte15(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeCacheOp(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeCacheOpR6(MCInst &Inst, +static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeCacheOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeCacheOpMM(MCInst &Inst, +static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); @@ -261,6 +293,11 @@ static DecodeStatus DecodeSyncI(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSynciR6(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); @@ -284,6 +321,11 @@ static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeMemMMImm9(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -330,6 +372,11 @@ static DecodeStatus DecodeLiSimm7(MCInst &Inst, uint64_t Address, const void *Decoder); +static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst, + unsigned Value, + uint64_t Address, + const void *Decoder); + static DecodeStatus DecodeSimm4(MCInst &Inst, unsigned Value, uint64_t Address, @@ -340,23 +387,15 @@ static DecodeStatus DecodeSimm16(MCInst &Inst, uint64_t Address, const void *Decoder); -// Decode the immediate field of an LSA instruction which -// is off by one. -static DecodeStatus DecodeLSAImm(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); +template <unsigned Bits, int Offset> +static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeInsSize(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeExtSize(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder); - static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); @@ -830,9 +869,24 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, if (IsMicroMips) { Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian); + if (Result == MCDisassembler::Fail) + return MCDisassembler::Fail; + + if (hasMips32r6()) { + DEBUG(dbgs() << "Trying MicroMipsR616 table (16-bit instructions):\n"); + // Calling the auto-generated decoder function for microMIPS32R6 + // (and microMIPS64R6) 16-bit instructions. 
+ Result = decodeInstruction(DecoderTableMicroMipsR616, Instr, Insn, + Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 2; + return Result; + } + } DEBUG(dbgs() << "Trying MicroMips16 table (16-bit instructions):\n"); - // Calling the auto-generated decoder function. + // Calling the auto-generated decoder function for microMIPS 16-bit + // instructions. Result = decodeInstruction(DecoderTableMicroMips16, Instr, Insn, Address, this, STI); if (Result != MCDisassembler::Fail) { @@ -847,24 +901,33 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, if (hasMips32r6()) { DEBUG(dbgs() << "Trying MicroMips32r632 table (32-bit instructions):\n"); // Calling the auto-generated decoder function. - Result = decodeInstruction(DecoderTableMicroMips32r632, Instr, Insn, Address, - this, STI); - } else { - DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n"); - // Calling the auto-generated decoder function. - Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address, + Result = decodeInstruction(DecoderTableMicroMipsR632, Instr, Insn, Address, this, STI); + if (Result != MCDisassembler::Fail) { + Size = 4; + return Result; + } } + + DEBUG(dbgs() << "Trying MicroMips32 table (32-bit instructions):\n"); + // Calling the auto-generated decoder function. + Result = decodeInstruction(DecoderTableMicroMips32, Instr, Insn, Address, + this, STI); if (Result != MCDisassembler::Fail) { Size = 4; return Result; } + // This is an invalid instruction. Let the disassembler move forward by the + // minimum instruction size. + Size = 2; return MCDisassembler::Fail; } Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false); - if (Result == MCDisassembler::Fail) + if (Result == MCDisassembler::Fail) { + Size = 4; return MCDisassembler::Fail; + } if (hasCOP3()) { DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n"); @@ -925,6 +988,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size, return Result; } + Size = 4; return MCDisassembler::Fail; } @@ -1079,10 +1143,66 @@ static DecodeStatus DecodeMem(MCInst &Inst, Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); Base = getReg(Decoder, Mips::GPR32RegClassID, Base); - if(Inst.getOpcode() == Mips::SC || - Inst.getOpcode() == Mips::SCD){ + if (Inst.getOpcode() == Mips::SC || + Inst.getOpcode() == Mips::SCD) Inst.addOperand(MCOperand::createReg(Reg)); - } + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeMemEVA(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn >> 7); + unsigned Reg = fieldFromInstruction(Insn, 16, 5); + unsigned Base = fieldFromInstruction(Insn, 21, 5); + + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + if (Inst.getOpcode() == Mips::SCE) + Inst.addOperand(MCOperand::createReg(Reg)); + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeLoadByte9(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn & 0x1ff); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned Reg = fieldFromInstruction(Insn, 21, 5); + + Base = getReg(Decoder, 
Mips::GPR32RegClassID, Base); + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeLoadByte15(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<16>(Insn & 0xffff); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned Reg = fieldFromInstruction(Insn, 21, 5); + + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); Inst.addOperand(MCOperand::createReg(Reg)); Inst.addOperand(MCOperand::createReg(Base)); @@ -1125,11 +1245,28 @@ static DecodeStatus DecodeCacheOpMM(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeCacheOpR6(MCInst &Inst, +static DecodeStatus DecodePrefeOpMM(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { - int Offset = fieldFromInstruction(Insn, 7, 9); + int Offset = SignExtend32<9>(Insn & 0x1ff); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + unsigned Hint = fieldFromInstruction(Insn, 21, 5); + + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + Inst.addOperand(MCOperand::createImm(Hint)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeCacheeOp_CacheOpR6(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn >> 7); unsigned Hint = fieldFromInstruction(Insn, 16, 5); unsigned Base = fieldFromInstruction(Insn, 21, 5); @@ -1142,6 +1279,24 @@ static DecodeStatus DecodeCacheOpR6(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeStoreEvaOpMM(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn & 0x1ff); + unsigned Reg = fieldFromInstruction(Insn, 21, 5); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + static DecodeStatus DecodeSyncI(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -1157,6 +1312,21 @@ static DecodeStatus DecodeSyncI(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeSynciR6(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Immediate = SignExtend32<16>(Insn & 0xffff); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Immediate)); + + return MCDisassembler::Success; +} + static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10)); @@ -1220,8 +1390,11 @@ static DecodeStatus DecodeMemMMImm4(MCInst &Inst, return MCDisassembler::Fail; break; case Mips::SB16_MM: + case Mips::SB16_MMR6: case Mips::SH16_MM: + case Mips::SH16_MMR6: case Mips::SW16_MM: + case Mips::SW16_MMR6: if (DecodeGPRMM16ZeroRegisterClass(Inst, Reg, Address, Decoder) == MCDisassembler::Fail) return MCDisassembler::Fail; @@ -1240,14 +1413,17 @@ static DecodeStatus 
DecodeMemMMImm4(MCInst &Inst, Inst.addOperand(MCOperand::createImm(Offset)); break; case Mips::SB16_MM: + case Mips::SB16_MMR6: Inst.addOperand(MCOperand::createImm(Offset)); break; case Mips::LHU16_MM: case Mips::SH16_MM: + case Mips::SH16_MMR6: Inst.addOperand(MCOperand::createImm(Offset << 1)); break; case Mips::LW16_MM: case Mips::SW16_MM: + case Mips::SW16_MMR6: Inst.addOperand(MCOperand::createImm(Offset << 2)); break; } @@ -1291,7 +1467,16 @@ static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { - int Offset = SignExtend32<4>(Insn & 0xf); + int Offset; + switch (Inst.getOpcode()) { + case Mips::LWM16_MMR6: + case Mips::SWM16_MMR6: + Offset = fieldFromInstruction(Insn, 4, 4); + break; + default: + Offset = SignExtend32<4>(Insn & 0xf); + break; + } if (DecodeRegListOperand16(Inst, Insn, Address, Decoder) == MCDisassembler::Fail) @@ -1303,6 +1488,27 @@ static DecodeStatus DecodeMemMMReglistImm4Lsl2(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeMemMMImm9(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder) { + int Offset = SignExtend32<9>(Insn & 0x1ff); + unsigned Reg = fieldFromInstruction(Insn, 21, 5); + unsigned Base = fieldFromInstruction(Insn, 16, 5); + + Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg); + Base = getReg(Decoder, Mips::GPR32RegClassID, Base); + + if (Inst.getOpcode() == Mips::SCE_MM) + Inst.addOperand(MCOperand::createReg(Reg)); + + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createReg(Base)); + Inst.addOperand(MCOperand::createImm(Offset)); + + return MCDisassembler::Success; +} + static DecodeStatus DecodeMemMMImm12(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -1659,6 +1865,16 @@ static DecodeStatus DecodeBranchTargetMM(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeBranchTarget26MM(MCInst &Inst, + unsigned Offset, + uint64_t Address, + const void *Decoder) { + int32_t BranchOffset = SignExtend32<26>(Offset) << 1; + + Inst.addOperand(MCOperand::createImm(BranchOffset)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeJumpTargetMM(MCInst &Inst, unsigned Insn, uint64_t Address, @@ -1700,6 +1916,14 @@ static DecodeStatus DecodeLiSimm7(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodePOOL16BEncodedField(MCInst &Inst, + unsigned Value, + uint64_t Address, + const void *Decoder) { + Inst.addOperand(MCOperand::createImm(Value == 0x0 ? 8 : Value)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeSimm4(MCInst &Inst, unsigned Value, uint64_t Address, @@ -1716,12 +1940,12 @@ static DecodeStatus DecodeSimm16(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeLSAImm(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { - // We add one to the immediate field as it was encoded as 'imm - 1'. 
- Inst.addOperand(MCOperand::createImm(Insn + 1)); +template <unsigned Bits, int Offset> +static DecodeStatus DecodeUImmWithOffset(MCInst &Inst, unsigned Value, + uint64_t Address, + const void *Decoder) { + Value &= ((1 << Bits) - 1); + Inst.addOperand(MCOperand::createImm(Value + Offset)); return MCDisassembler::Success; } @@ -1736,15 +1960,6 @@ static DecodeStatus DecodeInsSize(MCInst &Inst, return MCDisassembler::Success; } -static DecodeStatus DecodeExtSize(MCInst &Inst, - unsigned Insn, - uint64_t Address, - const void *Decoder) { - int Size = (int) Insn + 1; - Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Size))); - return MCDisassembler::Success; -} - static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { Inst.addOperand(MCOperand::createImm(SignExtend32<19>(Insn) * 4)); @@ -1792,15 +2007,21 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, uint64_t Address, const void *Decoder) { unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5, - Mips::S6, Mips::FP}; + Mips::S6, Mips::S7, Mips::FP}; unsigned RegNum; unsigned RegLst = fieldFromInstruction(Insn, 21, 5); + // Empty register lists are not allowed. if (RegLst == 0) return MCDisassembler::Fail; RegNum = RegLst & 0xf; + + // RegLst values 10-15, and 26-31 are reserved. + if (RegNum > 9) + return MCDisassembler::Fail; + for (unsigned i = 0; i < RegNum; i++) Inst.addOperand(MCOperand::createReg(Regs[i])); @@ -1814,7 +2035,16 @@ static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder) { unsigned Regs[] = {Mips::S0, Mips::S1, Mips::S2, Mips::S3}; - unsigned RegLst = fieldFromInstruction(Insn, 4, 2); + unsigned RegLst; + switch(Inst.getOpcode()) { + default: + RegLst = fieldFromInstruction(Insn, 4, 2); + break; + case Mips::LWM16_MMR6: + case Mips::SWM16_MMR6: + RegLst = fieldFromInstruction(Insn, 8, 2); + break; + } unsigned RegNum = RegLst & 0x3; for (unsigned i = 0; i <= RegNum; i++) diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp index a5637b1..a7b7d2e 100644 --- a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp +++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp @@ -235,7 +235,9 @@ printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) { case Mips::SWM32_MM: case Mips::LWM32_MM: case Mips::SWM16_MM: + case Mips::SWM16_MMR6: case Mips::LWM16_MM: + case Mips::LWM16_MMR6: opNum = MI->getNumOperands() - 2; break; } diff --git a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h index 713f35c..0e61ea6 100644 --- a/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h +++ b/contrib/llvm/lib/Target/Mips/InstPrinter/MipsInstPrinter.h @@ -73,8 +73,6 @@ enum CondCode { const char *MipsFCCToString(Mips::CondCode CC); } // end namespace Mips -class TargetMachine; - class MipsInstPrinter : public MCInstPrinter { public: MipsInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp index 8e6c9e6..cdcc392 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp @@ -23,7 +23,7 @@ static const MCPhysReg Mips64IntRegs[8] = { Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64}; } -const ArrayRef<MCPhysReg> 
MipsABIInfo::GetByValArgRegs() const { +ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const { if (IsO32()) return makeArrayRef(O32IntRegs); if (IsN32() || IsN64()) @@ -31,7 +31,7 @@ const ArrayRef<MCPhysReg> MipsABIInfo::GetByValArgRegs() const { llvm_unreachable("Unhandled ABI"); } -const ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const { +ArrayRef<MCPhysReg> MipsABIInfo::GetVarArgRegs() const { if (IsO32()) return makeArrayRef(O32IntRegs); if (IsN32() || IsN64()) @@ -78,7 +78,6 @@ MipsABIInfo MipsABIInfo::computeTargetABI(const Triple &TT, StringRef CPU, .Case("mips32r3", MipsABIInfo::O32()) .Case("mips32r5", MipsABIInfo::O32()) .Case("mips32r6", MipsABIInfo::O32()) - .Case("mips16", MipsABIInfo::O32()) .Case("mips3", MipsABIInfo::N64()) .Case("mips4", MipsABIInfo::N64()) .Case("mips5", MipsABIInfo::N64()) @@ -107,6 +106,10 @@ unsigned MipsABIInfo::GetNullPtr() const { return ArePtrs64bit() ? Mips::ZERO_64 : Mips::ZERO; } +unsigned MipsABIInfo::GetZeroReg() const { + return AreGprs64bit() ? Mips::ZERO_64 : Mips::ZERO; +} + unsigned MipsABIInfo::GetPtrAdduOp() const { return ArePtrs64bit() ? Mips::DADDu : Mips::ADDu; } @@ -115,6 +118,10 @@ unsigned MipsABIInfo::GetPtrAddiuOp() const { return ArePtrs64bit() ? Mips::DADDiu : Mips::ADDiu; } +unsigned MipsABIInfo::GetGPRMoveOp() const { + return ArePtrs64bit() ? Mips::OR64 : Mips::OR; +} + unsigned MipsABIInfo::GetEhDataReg(unsigned I) const { static const unsigned EhDataReg[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h index 40c5681..ffa2c76 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h @@ -47,10 +47,10 @@ public: ABI GetEnumValue() const { return ThisABI; } /// The registers to use for byval arguments. - const ArrayRef<MCPhysReg> GetByValArgRegs() const; + ArrayRef<MCPhysReg> GetByValArgRegs() const; /// The registers to use for the variable argument list. - const ArrayRef<MCPhysReg> GetVarArgRegs() const; + ArrayRef<MCPhysReg> GetVarArgRegs() const; /// Obtain the size of the area allocated by the callee for arguments. /// CallingConv::FastCall affects the value for O32. @@ -67,9 +67,12 @@ public: unsigned GetFramePtr() const; unsigned GetBasePtr() const; unsigned GetNullPtr() const; + unsigned GetZeroReg() const; unsigned GetPtrAdduOp() const; unsigned GetPtrAddiuOp() const; + unsigned GetGPRMoveOp() const; inline bool ArePtrs64bit() const { return IsN64(); } + inline bool AreGprs64bit() const { return IsN32() || IsN64(); } unsigned GetEhDataReg(unsigned I) const; }; diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 328e717..e4865e2 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -63,15 +63,19 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, // address range. Forcing a signed division because Value can be negative. Value = (int64_t)Value / 4; // We now check if Value can be encoded as a 16-bit signed immediate. 
- if (!isIntN(16, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC16 fixup"); + if (!isInt<16>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup"); + return 0; + } break; case Mips::fixup_MIPS_PC19_S2: // Forcing a signed division because Value can be negative. Value = (int64_t)Value / 4; // We now check if Value can be encoded as a 19-bit signed immediate. - if (!isIntN(19, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC19 fixup"); + if (!isInt<19>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC19 fixup"); + return 0; + } break; case Mips::fixup_Mips_26: // So far we are only using this type for jumps. @@ -104,45 +108,57 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, // Forcing a signed division because Value can be negative. Value = (int64_t) Value / 2; // We now check if Value can be encoded as a 7-bit signed immediate. - if (!isIntN(7, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC7 fixup"); + if (!isInt<7>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC7 fixup"); + return 0; + } break; case Mips::fixup_MICROMIPS_PC10_S1: Value -= 2; // Forcing a signed division because Value can be negative. Value = (int64_t) Value / 2; // We now check if Value can be encoded as a 10-bit signed immediate. - if (!isIntN(10, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC10 fixup"); + if (!isInt<10>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC10 fixup"); + return 0; + } break; case Mips::fixup_MICROMIPS_PC16_S1: Value -= 4; // Forcing a signed division because Value can be negative. Value = (int64_t)Value / 2; // We now check if Value can be encoded as a 16-bit signed immediate. - if (!isIntN(16, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC16 fixup"); + if (!isInt<16>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup"); + return 0; + } break; case Mips::fixup_MIPS_PC18_S3: // Forcing a signed division because Value can be negative. Value = (int64_t)Value / 8; // We now check if Value can be encoded as a 18-bit signed immediate. - if (!isIntN(18, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC18 fixup"); + if (!isInt<18>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup"); + return 0; + } break; case Mips::fixup_MIPS_PC21_S2: // Forcing a signed division because Value can be negative. Value = (int64_t) Value / 4; // We now check if Value can be encoded as a 21-bit signed immediate. - if (!isIntN(21, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC21 fixup"); + if (!isInt<21>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup"); + return 0; + } break; case Mips::fixup_MIPS_PC26_S2: // Forcing a signed division because Value can be negative. Value = (int64_t) Value / 4; // We now check if Value can be encoded as a 26-bit signed immediate. 
- if (!isIntN(26, Value) && Ctx) - Ctx->reportFatalError(Fixup.getLoc(), "out of range PC26 fixup"); + if (!isInt<26>(Value) && Ctx) { + Ctx->reportError(Fixup.getLoc(), "out of range PC26 fixup"); + return 0; + } break; } @@ -232,6 +248,18 @@ void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, } } +bool MipsAsmBackend::getFixupKind(StringRef Name, MCFixupKind &MappedKind) const { + if (Name == "R_MIPS_NONE") { + MappedKind = (MCFixupKind)Mips::fixup_Mips_NONE; + return true; + } + if (Name == "R_MIPS_32") { + MappedKind = FK_Data_4; + return true; + } + return MCAsmBackend::getFixupKind(Name, MappedKind); +} + const MCFixupKindInfo &MipsAsmBackend:: getFixupKindInfo(MCFixupKind Kind) const { const static MCFixupKindInfo LittleEndianInfos[Mips::NumTargetFixupKinds] = { @@ -239,6 +267,7 @@ getFixupKindInfo(MCFixupKind Kind) const { // MipsFixupKinds.h. // // name offset bits flags + { "fixup_Mips_NONE", 0, 0, 0 }, { "fixup_Mips_16", 0, 16, 0 }, { "fixup_Mips_32", 0, 32, 0 }, { "fixup_Mips_REL32", 0, 32, 0 }, @@ -304,6 +333,7 @@ getFixupKindInfo(MCFixupKind Kind) const { // MipsFixupKinds.h. // // name offset bits flags + { "fixup_Mips_NONE", 0, 0, 0 }, { "fixup_Mips_16", 16, 16, 0 }, { "fixup_Mips_32", 0, 32, 0 }, { "fixup_Mips_REL32", 0, 32, 0 }, diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index b3d5a49..1c9af92 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -41,6 +41,7 @@ public: void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const override; + bool getFixupKind(StringRef Name, MCFixupKind &MappedKind) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; unsigned getNumFixupKinds() const override { diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 9b29527..5b9f02b 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -68,6 +68,8 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, unsigned Kind = (unsigned)Fixup.getKind(); switch (Kind) { + case Mips::fixup_Mips_NONE: + return ELF::R_MIPS_NONE; case Mips::fixup_Mips_16: case FK_Data_2: return IsPCRel ? ELF::R_MIPS_PC16 : ELF::R_MIPS_16; @@ -325,13 +327,24 @@ static void setMatch(MipsRelocationEntry &Hi, MipsRelocationEntry &Lo) { // matching LO; // - prefer LOs without a pair; // - prefer LOs with higher offset; + +static int cmpRel(const ELFRelocationEntry *AP, const ELFRelocationEntry *BP) { + const ELFRelocationEntry &A = *AP; + const ELFRelocationEntry &B = *BP; + if (A.Offset != B.Offset) + return B.Offset - A.Offset; + if (B.Type != A.Type) + return A.Type - B.Type; + return 0; +} + void MipsELFObjectWriter::sortRelocs(const MCAssembler &Asm, std::vector<ELFRelocationEntry> &Relocs) { if (Relocs.size() < 2) return; - // The default function sorts entries by Offset in descending order. - MCELFObjectTargetWriter::sortRelocs(Asm, Relocs); + // Sorts entries by Offset in descending order. + array_pod_sort(Relocs.begin(), Relocs.end(), cmpRel); // Init MipsRelocs from Relocs. 
std::vector<MipsRelocationEntry> MipsRelocs; diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index b45d9cf..e7d687e 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -63,7 +63,7 @@ void MipsELFStreamer::SwitchSection(MCSection *Section, } void MipsELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) { + SMLoc Loc) { MCELFStreamer::EmitValueImpl(Value, Size, Loc); Labels.clear(); } diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h index af9311f..a241cde 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h @@ -60,8 +60,7 @@ public: /// Overriding this function allows us to dismiss all labels that are /// candidates for marking as microMIPS when .word directive is emitted. - void EmitValueImpl(const MCExpr *Value, unsigned Size, - const SMLoc &Loc) override; + void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override; /// Emits all the option records stored up until the point it's called. void EmitMipsOptionRecords(); diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h index e601963..3652f4b 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h @@ -23,8 +23,11 @@ namespace Mips { // in MipsAsmBackend.cpp. // enum Fixups { + // Branch fixups resulting in R_MIPS_NONE. + fixup_Mips_NONE = FirstTargetFixupKind, + // Branch fixups resulting in R_MIPS_16. - fixup_Mips_16 = FirstTargetFixupKind, + fixup_Mips_16, // Pure 32 bit data fixup resulting in - R_MIPS_32. fixup_Mips_32, diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h index 5d23fcb..d4ccf03 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h @@ -17,13 +17,14 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; +class Triple; - class MipsMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit MipsMCAsmInfo(const Triple &TheTriple); - }; +class MipsMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit MipsMCAsmInfo(const Triple &TheTriple); +}; } // namespace llvm diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index e36263d..4b030eb 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -190,6 +190,10 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, else NewOpcode = Mips::Std2MicroMips(Opcode, Mips::Arch_micromips); + // Check whether it is Dsp instruction. + if (NewOpcode == -1) + NewOpcode = Mips::Dsp2MicroMips(Opcode, Mips::Arch_mmdsp); + if (NewOpcode != -1) { if (Fixups.size() > N) Fixups.pop_back(); @@ -346,6 +350,23 @@ getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo, return 0; } +/// getBranchTarget26OpValueMM - Return binary encoding of the branch +/// target operand. 
If the machine operand requires relocation, +/// record the relocation and return zero. +unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM( + const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + + const MCOperand &MO = MI.getOperand(OpNo); + + // If the destination is an immediate, divide by 2. + if (MO.isImm()) + return MO.getImm() >> 1; + + // TODO: Push 26 PC fixup. + return 0; +} + /// getJumpOffset16OpValue - Return binary encoding of the jump /// target operand. If the machine operand requires relocation, /// record the relocation and return zero. @@ -745,7 +766,8 @@ getMemEncodingMMSPImm5Lsl2(const MCInst &MI, unsigned OpNo, const MCSubtargetInfo &STI) const { // Register is encoded in bits 9-5, offset is encoded in bits 4-0. assert(MI.getOperand(OpNo).isReg() && - MI.getOperand(OpNo).getReg() == Mips::SP && + (MI.getOperand(OpNo).getReg() == Mips::SP || + MI.getOperand(OpNo).getReg() == Mips::SP_64) && "Unexpected base register!"); unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) >> 2; @@ -769,6 +791,19 @@ getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo, } unsigned MipsMCCodeEmitter:: +getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Base register is encoded in bits 20-16, offset is encoded in bits 8-0. + assert(MI.getOperand(OpNo).isReg()); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, + STI) << 16; + unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI); + + return (OffBits & 0x1FF) | RegBits; +} + +unsigned MipsMCCodeEmitter:: getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -792,6 +827,19 @@ getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo, } unsigned MipsMCCodeEmitter:: +getMemEncodingMMImm16(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + // Base register is encoded in bits 20-16, offset is encoded in bits 15-0. 
+ assert(MI.getOperand(OpNo).isReg()); + unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, + STI) << 16; + unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI); + + return (OffBits & 0xFFFF) | RegBits; +} + +unsigned MipsMCCodeEmitter:: getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const { @@ -801,7 +849,9 @@ getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo, default: break; case Mips::SWM16_MM: + case Mips::SWM16_MMR6: case Mips::LWM16_MM: + case Mips::LWM16_MMR6: OpNo = MI.getNumOperands() - 2; break; } @@ -815,15 +865,6 @@ getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo, return ((OffBits >> 2) & 0x0F); } -unsigned -MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - assert(MI.getOperand(OpNo).isImm()); - unsigned SizeEncoding = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI); - return SizeEncoding - 1; -} - // FIXME: should be called getMSBEncoding // unsigned @@ -838,13 +879,15 @@ MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo, return Position + Size - 1; } +template <unsigned Bits, int Offset> unsigned -MipsMCCodeEmitter::getLSAImmEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { +MipsMCCodeEmitter::getUImmWithOffsetEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { assert(MI.getOperand(OpNo).isImm()); - // The immediate is encoded as 'immediate - 1'. - return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) - 1; + unsigned Value = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI); + Value -= Offset; + return Value; } unsigned diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h index 911cc2f..fdacd17 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h @@ -137,6 +137,13 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + // getBranchTarget26OpValueMM - Return binary encoding of the branch + // offset operand. If the machine operand requires relocation, + // record the relocation and return zero. + unsigned getBranchTarget26OpValueMM(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + // getJumpOffset16OpValue - Return binary encoding of the jump // offset operand. If the machine operand requires relocation, // record the relocation and return zero. 
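A note on the MipsMCCodeEmitter changes around this point: the patch retires the LSA-specific immediate hooks (getLSAImmEncoding in the emitter, DecodeLSAImm in the disassembler) in favour of a generic biased-immediate pair, getUImmWithOffsetEncoding and DecodeUImmWithOffset, where the emitter stores Value - Offset in a Bits-wide field and the decoder masks the field back to Bits and re-adds Offset. The following is a minimal standalone sketch of that round trip, assuming a non-negative Offset; it is plain C++ for illustration only, not LLVM code, and the function names are invented here.

#include <cassert>
#include <cstdint>

// Encoder side of the sketch: store (Value - Offset) in a Bits-wide field.
template <unsigned Bits, int Offset>
uint32_t encodeUImmWithOffset(uint32_t Value) {
  assert(Value >= static_cast<uint32_t>(Offset) &&
         Value - Offset < (1u << Bits) && "immediate out of range");
  return Value - Offset;
}

// Decoder side of the sketch: mask the field to Bits and add Offset back.
template <unsigned Bits, int Offset>
uint32_t decodeUImmWithOffset(uint32_t Field) {
  Field &= (1u << Bits) - 1;
  return Field + Offset;
}

int main() {
  // LSA-style case: a shift amount of 1..4 stored as 0..3 in a 2-bit field.
  for (uint32_t Shift = 1; Shift <= 4; ++Shift)
    assert(decodeUImmWithOffset<2, 1>(encodeUImmWithOffset<2, 1>(Shift)) ==
           Shift);
  return 0;
}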
@@ -172,23 +179,27 @@ public: unsigned getMemEncodingMMGPImm7Lsl2(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + unsigned getMemEncodingMMImm9(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; + unsigned getMemEncodingMMImm16(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; unsigned getMemEncodingMMImm4sp(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - unsigned getSizeExtEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; unsigned getSizeInsEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - // getLSAImmEncoding - Return binary encoding of LSA immediate. - unsigned getLSAImmEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; + /// Subtract Offset then encode as a N-bit unsigned integer. + template <unsigned Bits, int Offset> + unsigned getUImmWithOffsetEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; unsigned getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups, diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h index fd2ed17..e889972 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h @@ -51,8 +51,8 @@ public: const MCAsmLayout *Layout, const MCFixup *Fixup) const override; void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override { - return getSubExpr()->findAssociatedSection(); + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); } // There are no TLS MipsMCExprs at the moment. 
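Before moving on to the target-streamer changes, one earlier hunk is worth a worked example: the pre-R2 path of expandRotation in MipsAsmParser.cpp lowers rol d, s, t into subu $at, $zero, t / srlv $at, s, $at / sllv d, s, t / or d, d, $at. This works because MIPS variable shifts read only the low five bits of the shift amount, so the negated register behaves as 32 - t. The sketch below models that arithmetic in plain C++ and checks it against a reference rotate; it is illustrative only, not LLVM code, and the names are invented here.

#include <cassert>
#include <cstdint>

// Model of the pre-R2 ROL expansion: subu / srlv / sllv / or.
// Masking the shift amount with 31 mirrors the hardware, which uses only
// the low 5 bits, so (0 - t) acts like (32 - t).
uint32_t expandedRotateLeft(uint32_t S, uint32_t T) {
  uint32_t At = 0u - T;        // subu $at, $zero, $t
  At = S >> (At & 31);         // srlv $at, $s, $at
  uint32_t D = S << (T & 31);  // sllv $d, $s, $t
  return D | At;               // or   $d, $d, $at
}

// Reference 32-bit rotate-left for comparison.
uint32_t rotl32(uint32_t S, uint32_t T) {
  T &= 31;
  return T == 0 ? S : (S << T) | (S >> (32 - T));
}

int main() {
  // Check all effective shift amounts, including the wrap-around cases.
  for (uint32_t T = 0; T < 64; ++T)
    assert(expandedRotateLeft(0x80000001u, T) == rotl32(0x80000001u, T));
  return 0;
}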
diff --git a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index e4da2df..e5fa755 100644 --- a/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/contrib/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -89,9 +89,15 @@ void MipsTargetStreamer::emitDirectiveSetHardFloat() { void MipsTargetStreamer::emitDirectiveSetDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {} +void MipsTargetStreamer::emitDirectiveCpRestore( + SmallVector<MCInst, 3> &StoreInsts, int Offset) { + forbidModuleDirective(); +} void MipsTargetStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg) { } +void MipsTargetStreamer::emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) {} void MipsTargetStreamer::emitDirectiveModuleFP() {} @@ -358,6 +364,12 @@ void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) { forbidModuleDirective(); } +void MipsTargetAsmStreamer::emitDirectiveCpRestore( + SmallVector<MCInst, 3> &StoreInsts, int Offset) { + MipsTargetStreamer::emitDirectiveCpRestore(StoreInsts, Offset); + OS << "\t.cprestore\t" << Offset << "\n"; +} + void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, @@ -373,7 +385,13 @@ void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo, OS << ", "; - OS << Sym.getName() << "\n"; + OS << Sym.getName(); + forbidModuleDirective(); +} + +void MipsTargetAsmStreamer::emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) { + OS << "\t.cpreturn"; forbidModuleDirective(); } @@ -595,8 +613,9 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { MCSectionELF *Sec = Context.getELFSection(".pdr", ELF::SHT_PROGBITS, ELF::SHF_ALLOC | ELF::SHT_REL); + MCSymbol *Sym = Context.getOrCreateSymbol(Name); const MCSymbolRefExpr *ExprRef = - MCSymbolRefExpr::create(Name, MCSymbolRefExpr::VK_None, Context); + MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, Context); MCA.registerSection(*Sec); Sec->setAlignment(4); @@ -622,10 +641,25 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) { GPRInfoSet = FPRInfoSet = FrameInfoSet = false; OS.PopSection(); + + // .end also implicitly sets the size. 
+ MCSymbol *CurPCSym = Context.createTempSymbol(); + OS.EmitLabel(CurPCSym); + const MCExpr *Size = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(CurPCSym, MCSymbolRefExpr::VK_None, Context), + ExprRef, Context); + int64_t AbsSize; + if (!Size->evaluateAsAbsolute(AbsSize, MCA)) + llvm_unreachable("Function size must be evaluatable as absolute"); + Size = MCConstantExpr::create(AbsSize, Context); + static_cast<MCSymbolELF *>(Sym)->setSize(Size); } void MipsTargetELFStreamer::emitDirectiveEnt(const MCSymbol &Symbol) { GPRInfoSet = FPRInfoSet = FrameInfoSet = false; + + // .ent also acts like an implicit '.type symbol, STT_FUNC' + static_cast<const MCSymbolELF &>(Symbol).setType(ELF::STT_FUNC); } void MipsTargetELFStreamer::emitDirectiveAbiCalls() { @@ -752,6 +786,24 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { forbidModuleDirective(); } +void MipsTargetELFStreamer::emitDirectiveCpRestore( + SmallVector<MCInst, 3> &StoreInsts, int Offset) { + MipsTargetStreamer::emitDirectiveCpRestore(StoreInsts, Offset); + // .cprestore offset + // When PIC mode is enabled and the O32 ABI is used, this directive expands + // to: + // sw $gp, offset($sp) + // and adds a corresponding LW after every JAL. + + // Note that .cprestore is ignored if used with the N32 and N64 ABIs or if it + // is used in non-PIC mode. + if (!Pic || (getABI().IsN32() || getABI().IsN64())) + return; + + for (const MCInst &Inst : StoreInsts) + getStreamer().EmitInstruction(Inst, STI); +} + void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, @@ -766,7 +818,7 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, // Either store the old $gp in a register or on the stack if (IsReg) { // move $save, $gpreg - Inst.setOpcode(Mips::DADDu); + Inst.setOpcode(Mips::OR64); Inst.addOperand(MCOperand::createReg(RegOrOffset)); Inst.addOperand(MCOperand::createReg(Mips::GP)); Inst.addOperand(MCOperand::createReg(Mips::ZERO)); @@ -810,6 +862,30 @@ void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo, forbidModuleDirective(); } +void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) { + // Only N32 and N64 emit anything for .cpreturn iff PIC is set. + if (!Pic || !(getABI().IsN32() || getABI().IsN64())) + return; + + MCInst Inst; + // Either restore the old $gp from a register or on the stack + if (SaveLocationIsRegister) { + Inst.setOpcode(Mips::OR); + Inst.addOperand(MCOperand::createReg(Mips::GP)); + Inst.addOperand(MCOperand::createReg(SaveLocation)); + Inst.addOperand(MCOperand::createReg(Mips::ZERO)); + } else { + Inst.setOpcode(Mips::LD); + Inst.addOperand(MCOperand::createReg(Mips::GP)); + Inst.addOperand(MCOperand::createReg(Mips::SP)); + Inst.addOperand(MCOperand::createImm(SaveLocation)); + } + getStreamer().EmitInstruction(Inst, STI); + + forbidModuleDirective(); +} + void MipsTargetELFStreamer::emitMipsAbiFlags() { MCAssembler &MCA = getStreamer().getAssembler(); MCContext &Context = MCA.getContext(); diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td index 187a022..400f6eef 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td @@ -16,6 +16,64 @@ class MMR6Arch<string opstr> { string BaseOpcode = opstr; } +// Class used for microMIPS32r6 and microMIPS64r6 instructions. 
+class MicroMipsR6Inst16 : PredicateControl { + string DecoderNamespace = "MicroMipsR6"; + let InsnPredicates = [HasMicroMips32r6]; +} + +class BC16_FM_MM16R6 { + bits<10> offset; + + bits<16> Inst; + + let Inst{15-10} = 0x33; + let Inst{9-0} = offset; +} + +class BEQZC_BNEZC_FM_MM16R6<bits<6> op> : MicroMipsR6Inst16 { + bits<3> rs; + bits<7> offset; + + bits<16> Inst; + + let Inst{15-10} = op; + let Inst{9-7} = rs; + let Inst{6-0} = offset; +} + +class POOL16C_JALRC_FM_MM16R6<bits<5> op> { + bits<5> rs; + + bits<16> Inst; + + let Inst{15-10} = 0x11; + let Inst{9-5} = rs; + let Inst{4-0} = op; +} + +class POOL16C_JRCADDIUSP_FM_MM16R6<bits<5> op> { + bits<5> imm; + + bits<16> Inst; + + let Inst{15-10} = 0x11; + let Inst{9-5} = imm; + let Inst{4-0} = op; +} + +class POOL16C_LWM_SWM_FM_MM16R6<bits<4> funct> { + bits<2> rt; + bits<4> addr; + + bits<16> Inst; + + let Inst{15-10} = 0x11; + let Inst{9-8} = rt; + let Inst{7-4} = addr; + let Inst{3-0} = funct; +} + class POOL32A_BITSWAP_FM_MMR6<bits<6> funct> : MipsR6Inst { bits<5> rd; bits<5> rt; @@ -71,6 +129,64 @@ class ADDI_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> { let Inst{15-0} = imm16; } +class POOL32C_ST_EVA_FM_MMR6<bits<6> op, bits<3> funct> : MipsR6Inst { + bits<21> addr; + bits<5> hint; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = hint; + let Inst{20-16} = base; + let Inst{15-12} = 0b1010; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class LB32_FM_MMR6 : MipsR6Inst { + bits<21> addr; + bits<5> rt; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b000111; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-0} = offset; +} + +class LBU32_FM_MMR6 : MipsR6Inst { + bits<21> addr; + bits<5> rt; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b000101; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-0} = offset; +} + +class POOL32C_LB_LBU_FM_MMR6<bits<3> funct> : MipsR6Inst { + bits<21> addr; + bits<5> rt; + + bits<32> Inst; + + let Inst{31-26} = 0b011000; + let Inst{25-21} = rt; + let Inst{20-16} = addr{20-16}; + let Inst{15-12} = 0b0110; + let Inst{11-9} = funct; + let Inst{8-0} = addr{8-0}; +} + class SIGN_EXTEND_FM_MMR6<string instr_asm, bits<10> funct> : MMR6Arch<instr_asm> { bits<5> rd; @@ -124,6 +240,69 @@ class POOL32A_FM_MMR6<bits<10> funct> : MipsR6Inst { let Inst{9-0} = funct; } +class POOL32A_PAUSE_FM_MMR6<string instr_asm, bits<5> op> : MMR6Arch<instr_asm> { + bits<32> Inst; + + let Inst{31-26} = 0; + let Inst{25-21} = 0; + let Inst{20-16} = 0; + let Inst{15-11} = op; + let Inst{10-6} = 0; + let Inst{5-0} = 0; +} + +class POOL32A_RDPGPR_FM_MMR6<bits<10> funct> { + bits<5> rt; + bits<5> rd; + bits<32> Inst; + + let Inst{31-26} = 0; + let Inst{25-21} = rt; + let Inst{20-16} = rd; + let Inst{15-6} = funct; + let Inst{5-0} = 0b111100; +} + +class POOL32A_RDHWR_FM_MMR6 { + bits<5> rt; + bits<5> rs; + bits<3> sel; + bits<32> Inst; + + let Inst{31-26} = 0; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-14} = 0; + let Inst{13-11} = sel; + let Inst{10} = 0; + let Inst{9-0} = 0b0111000000; +} + +class POOL32A_SYNC_FM_MMR6 { + bits<5> stype; + + bits<32> Inst; + + let Inst{31-26} = 0; + let Inst{25-21} = 0; + let Inst{20-16} = stype; + let Inst{15-6} = 0b0110101101; + let Inst{5-0} = 0b111100; +} + +class POOL32I_SYNCI_FM_MMR6 { + bits<21> addr; + bits<5> base = addr{20-16}; + 
bits<16> immediate = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b010000; + let Inst{25-21} = 0b01100; + let Inst{20-16} = base; + let Inst{15-0} = immediate; +} + class POOL32A_2R_FM_MMR6<bits<10> funct> : MipsR6Inst { bits<5> rs; bits<5> rt; @@ -198,6 +377,78 @@ class POOL32A_LSA_FM<bits<6> funct> : MipsR6Inst { let Inst{5-0} = funct; } +class SB32_SH32_STORE_FM_MMR6<bits<6> op> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-0} = offset; +} + +class POOL32C_STORE_EVA_FM_MMR6<bits<3> funct> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b011000; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = 0b1010; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class LOAD_WORD_EVA_FM_MMR6<bits<3> funct> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b011000; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = 0b0110; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class LOAD_WORD_FM_MMR6 { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<16> offset = addr{15-0}; + + bits<32> Inst; + + let Inst{31-26} = 0b111111; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-0} = offset; +} + +class LOAD_UPPER_IMM_FM_MMR6 { + bits<5> rt; + bits<16> imm16; + + bits<32> Inst; + + let Inst{31-26} = 0b000100; + let Inst{25-21} = rt; + let Inst{20-16} = 0; + let Inst{15-0} = imm16; +} + class CMP_BRANCH_1R_RT_OFF16_FM_MMR6<bits<6> funct> : MipsR6Inst { bits<5> rt; bits<16> offset; @@ -222,12 +473,13 @@ class CMP_BRANCH_1R_BOTH_OFF16_FM_MMR6<bits<6> funct> : MipsR6Inst { let Inst{15-0} = offset; } -class ERET_FM_MMR6<string instr_asm> : MMR6Arch<instr_asm> { +class POOL32A_ERET_FM_MMR6<string instr_asm, bits<10> funct> + : MMR6Arch<instr_asm> { bits<32> Inst; let Inst{31-26} = 0x00; let Inst{25-16} = 0x00; - let Inst{15-6} = 0x3cd; + let Inst{15-6} = funct; let Inst{5-0} = 0x3c; } @@ -262,7 +514,8 @@ class BARRIER_MMR6_ENC<string instr_asm, bits<5> op> : MMR6Arch<instr_asm> { let Inst{5-0} = 0x0; } -class EIDI_MMR6_ENC<string instr_asm, bits<10> funct> : MMR6Arch<instr_asm> { +class POOL32A_EIDI_MMR6_ENC<string instr_asm, bits<10> funct> + : MMR6Arch<instr_asm> { bits<32> Inst; bits<5> rt; // Actually rs but we're sharing code with the standard encodings which call it rt @@ -287,3 +540,323 @@ class SHIFT_MMR6_ENC<string instr_asm, bits<10> funct, bit rotate> : MMR6Arch<in let Inst{10} = rotate; let Inst{9-0} = funct; } + +class SW32_FM_MMR6<string instr_asm, bits<6> op> : MMR6Arch<instr_asm> { + bits<5> rt; + bits<21> addr; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = rt; + let Inst{20-16} = addr{20-16}; + let Inst{15-0} = addr{15-0}; +} + +class POOL32C_SWE_FM_MMR6<string instr_asm, bits<6> op, bits<4> fmt, + bits<3> funct> : MMR6Arch<instr_asm> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = fmt; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class POOL32F_ARITH_FM_MMR6<string instr_asm, bits<2> fmt, bits<8> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + 
bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10} = 0; + let Inst{9-8} = fmt; + let Inst{7-0} = funct; +} + +class POOL32F_ARITHF_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} + +class POOL32F_MOV_NEG_FM_MMR6<string instr_asm, bits<2> fmt, bits<7> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14-13} = fmt; + let Inst{12-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_MINMAX_FM<string instr_asm, bits<2> fmt, bits<9> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} + +class POOL32F_CMP_FM<string instr_asm, bits<6> format, FIELD_CMP_COND Cond> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-6} = Cond.Value; + let Inst{5-0} = format; +} + +class POOL32F_CVT_LW_FM<string instr_asm, bit fmt, bits<8> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14} = fmt; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_CVT_DS_FM<string instr_asm, bits<2> fmt, bits<7> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14-13} = fmt; + let Inst{12-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_ABS_FM_MMR6<string instr_asm, bits<2> fmt, bits<7> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14-13} = fmt; + let Inst{12-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_MATH_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14} = fmt; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL16A_ADDU16_FM_MMR6 : MicroMipsR6Inst16 { + bits<3> rs; + bits<3> rt; + bits<3> rd; + + bits<16> Inst; + + let Inst{15-10} = 0b000001; + let Inst{9-7} = rs; + let Inst{6-4} = rt; + let Inst{3-1} = rd; + let Inst{0} = 0; +} + +class POOL16C_AND16_FM_MMR6 : MicroMipsR6Inst16 { + bits<3> rt; + bits<3> rs; + + bits<16> Inst; + + let Inst{15-10} = 0b010001; + let Inst{9-7} = rt; + let Inst{6-4} = rs; + let Inst{3-0} = 0b0001; +} + +class POOL16C_NOT16_FM_MMR6 : MicroMipsR6Inst16 { + bits<3> rt; + bits<3> rs; + + bits<16> Inst; + + let Inst{15-10} = 0x11; + let Inst{9-7} = rt; + let Inst{6-4} = rs; + let Inst{3-0} = 0b0000; +} + +class 
POOL16C_OR16_XOR16_FM_MMR6<bits<4> op> { + bits<3> rt; + bits<3> rs; + + bits<16> Inst; + + let Inst{15-10} = 0b010001; + let Inst{9-7} = rt; + let Inst{6-4} = rs; + let Inst{3-0} = op; +} + +class POOL16C_BREAKPOINT_FM_MMR6<bits<6> op> { + bits<4> code_; + bits<16> Inst; + + let Inst{15-10} = 0b010001; + let Inst{9-6} = code_; + let Inst{5-0} = op; +} + +class POOL16A_SUBU16_FM_MMR6 { + bits<3> rs; + bits<3> rt; + bits<3> rd; + + bits<16> Inst; + + let Inst{15-10} = 0b000001; + let Inst{9-7} = rs; + let Inst{6-4} = rt; + let Inst{3-1} = rd; + let Inst{0} = 0b1; +} + +class POOL32A_WRPGPR_WSBH_FM_MMR6<bits<10> funct> : MipsR6Inst { + bits<5> rt; + bits<5> rs; + + bits<32> Inst; + + let Inst{31-26} = 0x00; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-6} = funct; + let Inst{5-0} = 0x3c; +} + +class POOL32F_RECIP_ROUND_FM_MMR6<string instr_asm, bits<1> fmt, bits<8> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15} = 0; + let Inst{14} = fmt; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111011; +} + +class POOL32F_RINT_FM_MMR6<string instr_asm, bits<2> fmt> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = fs; + let Inst{20-16} = fd; + let Inst{15-11} = 0; + let Inst{10-9} = fmt; + let Inst{8-0} = 0b000100000; +} + +class POOL32F_SEL_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> ft; + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = ft; + let Inst{20-16} = fs; + let Inst{15-11} = fd; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} + +class POOL32F_CLASS_FM_MMR6<string instr_asm, bits<2> fmt, bits<9> funct> + : MMR6Arch<instr_asm>, MipsR6Inst { + bits<5> fs; + bits<5> fd; + + bits<32> Inst; + + let Inst{31-26} = 0b010101; + let Inst{25-21} = fs; + let Inst{20-16} = fd; + let Inst{15-11} = 0b00000; + let Inst{10-9} = fmt; + let Inst{8-0} = funct; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td index 53bde13..31b5db0 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td @@ -11,6 +11,13 @@ // //===----------------------------------------------------------------------===// +def brtarget26_mm : Operand<OtherVT> { + let EncoderMethod = "getBranchTarget26OpValueMM"; + let OperandType = "OPERAND_PCREL"; + let DecoderMethod = "DecodeBranchTarget26MM"; + let ParserMatchClass = MipsJumpTargetAsmOperand; +} + //===----------------------------------------------------------------------===// // // Instruction Encodings @@ -28,6 +35,9 @@ class ALIGN_MMR6_ENC : POOL32A_ALIGN_FM_MMR6<0b011111>; class AUI_MMR6_ENC : AUI_FM_MMR6; class BALC_MMR6_ENC : BRANCH_OFF26_FM<0b101101>; class BC_MMR6_ENC : BRANCH_OFF26_FM<0b100101>; +class BC16_MMR6_ENC : BC16_FM_MM16R6; +class BEQZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x23>; +class BNEZC16_MMR6_ENC : BEQZC_BNEZC_FM_MM16R6<0x2b>; class BITSWAP_MMR6_ENC : POOL32A_BITSWAP_FM_MMR6<0b101100>; class BRK_MMR6_ENC : BREAK_MMR6_ENC<"break">; class BEQZALC_MMR6_ENC : CMP_BRANCH_1R_RT_OFF16_FM_MMR6<0b011101>; @@ -42,13 +52,19 @@ class CLZ_MMR6_ENC : SPECIAL_2R_FM_MMR6<0b010000>; class DIV_MMR6_ENC : ARITH_FM_MMR6<"div", 0x118>; class DIVU_MMR6_ENC : ARITH_FM_MMR6<"divu", 0x198>; class 
EHB_MMR6_ENC : BARRIER_MMR6_ENC<"ehb", 0x3>; -class EI_MMR6_ENC : EIDI_MMR6_ENC<"ei", 0x15d>; -class ERET_MMR6_ENC : ERET_FM_MMR6<"eret">; +class EI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"ei", 0x15d>; +class DI_MMR6_ENC : POOL32A_EIDI_MMR6_ENC<"di", 0b0100011101>; +class ERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"eret", 0x3cd>; +class DERET_MMR6_ENC : POOL32A_ERET_FM_MMR6<"eret", 0b1110001101>; class ERETNC_MMR6_ENC : ERETNC_FM_MMR6<"eretnc">; +class JALRC16_MMR6_ENC : POOL16C_JALRC_FM_MM16R6<0xb>; class JIALC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b100000>; class JIC_MMR6_ENC : JMP_IDX_COMPACT_FM<0b101000>; +class JRC16_MMR6_ENC: POOL16C_JALRC_FM_MM16R6<0x3>; +class JRCADDIUSP_MMR6_ENC : POOL16C_JRCADDIUSP_FM_MM16R6<0x13>; class LSA_MMR6_ENC : POOL32A_LSA_FM<0b001111>; class LWPC_MMR6_ENC : PCREL19_FM_MMR6<0b01>; +class LWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0x2>; class MOD_MMR6_ENC : ARITH_FM_MMR6<"mod", 0x158>; class MODU_MMR6_ENC : ARITH_FM_MMR6<"modu", 0x1d8>; class MUL_MMR6_ENC : ARITH_FM_MMR6<"mul", 0x18>; @@ -59,15 +75,99 @@ class NOR_MMR6_ENC : ARITH_FM_MMR6<"nor", 0x2d0>; class OR_MMR6_ENC : ARITH_FM_MMR6<"or", 0x290>; class ORI_MMR6_ENC : ADDI_FM_MMR6<"ori", 0x14>; class PREF_MMR6_ENC : CACHE_PREF_FM_MMR6<0b011000, 0b0010>; +class SB16_MMR6_ENC : LOAD_STORE_FM_MM16<0x22>; class SEB_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seb", 0b0010101100>; class SEH_MMR6_ENC : SIGN_EXTEND_FM_MMR6<"seh", 0b0011101100>; class SELEQZ_MMR6_ENC : POOL32A_FM_MMR6<0b0101000000>; class SELNEZ_MMR6_ENC : POOL32A_FM_MMR6<0b0110000000>; +class SH16_MMR6_ENC : LOAD_STORE_FM_MM16<0x2a>; class SLL_MMR6_ENC : SHIFT_MMR6_ENC<"sll", 0x00, 0b0>; class SUB_MMR6_ENC : ARITH_FM_MMR6<"sub", 0x190>; class SUBU_MMR6_ENC : ARITH_FM_MMR6<"subu", 0x1d0>; +class SW_MMR6_ENC : SW32_FM_MMR6<"sw", 0x3e>; +class SWE_MMR6_ENC : POOL32C_SWE_FM_MMR6<"swe", 0x18, 0xa, 0x7>; +class SW16_MMR6_ENC : LOAD_STORE_FM_MM16<0x3a>; +class SWM16_MMR6_ENC : POOL16C_LWM_SWM_FM_MM16R6<0xa>; +class SWSP_MMR6_ENC : LOAD_STORE_SP_FM_MM16<0x32>; +class PREFE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b010>; +class CACHEE_MMR6_ENC : POOL32C_ST_EVA_FM_MMR6<0b011000, 0b011>; +class WRPGPR_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x3c5>; +class WSBH_MMR6_ENC : POOL32A_WRPGPR_WSBH_FM_MMR6<0x1ec>; +class LB_MMR6_ENC : LB32_FM_MMR6; +class LBU_MMR6_ENC : LBU32_FM_MMR6; +class LBE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b100>; +class LBUE_MMR6_ENC : POOL32C_LB_LBU_FM_MMR6<0b000>; +class PAUSE_MMR6_ENC : POOL32A_PAUSE_FM_MMR6<"pause", 0b00101>; +class RDHWR_MMR6_ENC : POOL32A_RDHWR_FM_MMR6; +class WAIT_MMR6_ENC : WAIT_FM_MM, MMR6Arch<"wait">; +class SSNOP_MMR6_ENC : BARRIER_FM_MM<0x1>, MMR6Arch<"ssnop">; +class SYNC_MMR6_ENC : POOL32A_SYNC_FM_MMR6; +class SYNCI_MMR6_ENC : POOL32I_SYNCI_FM_MMR6, MMR6Arch<"synci">; +class RDPGPR_MMR6_ENC : POOL32A_RDPGPR_FM_MMR6<0b1110000101>; +class SDBBP_MMR6_ENC : SDBBP_FM_MM, MMR6Arch<"sdbbp">; class XOR_MMR6_ENC : ARITH_FM_MMR6<"xor", 0x310>; class XORI_MMR6_ENC : ADDI_FM_MMR6<"xori", 0x1c>; +class ABS_S_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.s", 0, 0b0001101>; +class ABS_D_MMR6_ENC : POOL32F_ABS_FM_MMR6<"abs.d", 1, 0b0001101>; +class FLOOR_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.l.s", 0, 0b00001100>; +class FLOOR_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.l.d", 1, 0b00001100>; +class FLOOR_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.w.s", 0, 0b00101100>; +class FLOOR_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"floor.w.d", 1, 0b00101100>; +class CEIL_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.l.s", 0, 0b01001100>; +class CEIL_L_D_MMR6_ENC : 
POOL32F_MATH_FM_MMR6<"ceil.l.d", 1, 0b01001100>; +class CEIL_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.w.s", 0, 0b01101100>; +class CEIL_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"ceil.w.d", 1, 0b01101100>; +class TRUNC_L_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.s", 0, 0b10001100>; +class TRUNC_L_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.l.d", 1, 0b10001100>; +class TRUNC_W_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.s", 0, 0b10101100>; +class TRUNC_W_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"trunc.w.d", 1, 0b10101100>; +class SQRT_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.s", 0, 0b00101000>; +class SQRT_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"sqrt.d", 1, 0b00101000>; +class RSQRT_S_MMR6_ENC : POOL32F_MATH_FM_MMR6<"rsqrt.s", 0, 0b00001000>; +class RSQRT_D_MMR6_ENC : POOL32F_MATH_FM_MMR6<"rsqrt.d", 1, 0b00001000>; +class SB_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b000110>; +class SBE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b100>; +class SCE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b110>; +class SH_MMR6_ENC : SB32_SH32_STORE_FM_MMR6<0b001110>; +class SHE_MMR6_ENC : POOL32C_STORE_EVA_FM_MMR6<0b101>; +class LLE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b110>; +class LWE_MMR6_ENC : LOAD_WORD_EVA_FM_MMR6<0b111>; +class LW_MMR6_ENC : LOAD_WORD_FM_MMR6; +class LUI_MMR6_ENC : LOAD_UPPER_IMM_FM_MMR6; +class RECIP_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.s", 0, 0b01001000>; +class RECIP_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"recip.d", 1, 0b01001000>; +class RINT_S_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.s", 0>; +class RINT_D_MMR6_ENC : POOL32F_RINT_FM_MMR6<"rint.d", 1>; +class ROUND_L_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.s", 0, + 0b11001100>; +class ROUND_L_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.l.d", 1, + 0b11001100>; +class ROUND_W_S_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.s", 0, + 0b11101100>; +class ROUND_W_D_MMR6_ENC : POOL32F_RECIP_ROUND_FM_MMR6<"round.w.d", 1, + 0b11101100>; +class SEL_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.s", 0, 0b010111000>; +class SEL_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"sel.d", 1, 0b010111000>; +class SELEQZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.s", 0, 0b000111000>; +class SELEQZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"seleqz.d", 1, 0b000111000>; +class SELENZ_S_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.s", 0, 0b001111000>; +class SELENZ_D_MMR6_ENC : POOL32F_SEL_FM_MMR6<"selenz.d", 1, 0b001111000>; +class CLASS_S_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.s", 0, 0b001100000>; +class CLASS_D_MMR6_ENC : POOL32F_CLASS_FM_MMR6<"class.d", 1, 0b001100000>; + +class ADDU16_MMR6_ENC : POOL16A_ADDU16_FM_MMR6; +class AND16_MMR6_ENC : POOL16C_AND16_FM_MMR6; +class ANDI16_MMR6_ENC : ANDI_FM_MM16<0b001011>, MicroMipsR6Inst16; +class NOT16_MMR6_ENC : POOL16C_NOT16_FM_MMR6; +class OR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1001>; +class SLL16_MMR6_ENC : SHIFT_FM_MM16<0>, MicroMipsR6Inst16; +class SRL16_MMR6_ENC : SHIFT_FM_MM16<1>, MicroMipsR6Inst16; +class BREAK16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b011011>; +class LI16_MMR6_ENC : LI_FM_MM16; +class MOVE16_MMR6_ENC : MOVE_FM_MM16<0b000011>; +class SDBBP16_MMR6_ENC : POOL16C_BREAKPOINT_FM_MMR6<0b111011>; +class SUBU16_MMR6_ENC : POOL16A_SUBU16_FM_MMR6; +class XOR16_MMR6_ENC : POOL16C_OR16_XOR16_FM_MMR6<0b1000>; class CMP_CBR_RT_Z_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd, RegisterOperand GPROpnd> @@ -108,6 +208,43 @@ class BNEZALC_MMR6_DESC : CMP_CBR_RT_Z_MMR6_DESC_BASE<"bnezalc", brtarget_mm, list<Register> Defs = [RA]; } +/// Floating Point Instructions +class FADD_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.s", 0, 0b00110000>; +class 
FADD_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"add.d", 1, 0b00110000>; +class FSUB_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.s", 0, 0b01110000>; +class FSUB_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"sub.d", 1, 0b01110000>; +class FMUL_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.s", 0, 0b10110000>; +class FMUL_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"mul.d", 1, 0b10110000>; +class FDIV_S_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.s", 0, 0b11110000>; +class FDIV_D_MMR6_ENC : POOL32F_ARITH_FM_MMR6<"div.d", 1, 0b11110000>; +class MADDF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.s", 0, 0b110111000>; +class MADDF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"maddf.d", 1, 0b110111000>; +class MSUBF_S_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.s", 0, 0b111111000>; +class MSUBF_D_MMR6_ENC : POOL32F_ARITHF_FM_MMR6<"msubf.d", 1, 0b111111000>; +class FMOV_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.s", 0, 0b0000001>; +class FMOV_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"mov.d", 1, 0b0000001>; +class FNEG_S_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.s", 0, 0b0101101>; +class FNEG_D_MMR6_ENC : POOL32F_MOV_NEG_FM_MMR6<"neg.d", 1, 0b0101101>; +class MAX_S_MMR6_ENC : POOL32F_MINMAX_FM<"max.s", 0, 0b000001011>; +class MAX_D_MMR6_ENC : POOL32F_MINMAX_FM<"max.d", 1, 0b000001011>; +class MAXA_S_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.s", 0, 0b000101011>; +class MAXA_D_MMR6_ENC : POOL32F_MINMAX_FM<"maxa.d", 1, 0b000101011>; +class MIN_S_MMR6_ENC : POOL32F_MINMAX_FM<"min.s", 0, 0b000000011>; +class MIN_D_MMR6_ENC : POOL32F_MINMAX_FM<"min.d", 1, 0b000000011>; +class MINA_S_MMR6_ENC : POOL32F_MINMAX_FM<"mina.s", 0, 0b000100011>; +class MINA_D_MMR6_ENC : POOL32F_MINMAX_FM<"mina.d", 1, 0b000100011>; + +class CVT_L_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.s", 0, 0b00000100>; +class CVT_L_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.l.d", 1, 0b00000100>; +class CVT_W_S_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.s", 0, 0b00100100>; +class CVT_W_D_MMR6_ENC : POOL32F_CVT_LW_FM<"cvt.w.d", 1, 0b00100100>; +class CVT_D_S_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.s", 0, 0b1001101>; +class CVT_D_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.w", 1, 0b1001101>; +class CVT_D_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.d.l", 2, 0b1001101>; +class CVT_S_D_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.d", 0, 0b1101101>; +class CVT_S_W_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.w", 1, 0b1101101>; +class CVT_S_L_MMR6_ENC : POOL32F_CVT_DS_FM<"cvt.s.l", 2, 0b1101101>; + //===----------------------------------------------------------------------===// // // Instruction Descriptions @@ -130,11 +267,34 @@ class BC_MMR6_DESC_BASE<string instr_asm, DAGOperand opnd> bit isBarrier = 1; } -class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26> { +class BALC_MMR6_DESC : BC_MMR6_DESC_BASE<"balc", brtarget26_mm> { bit isCall = 1; list<Register> Defs = [RA]; } -class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26>; +class BC_MMR6_DESC : BC_MMR6_DESC_BASE<"bc", brtarget26_mm>; + +class BC16_MMR6_DESC : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset), + !strconcat("bc16", "\t$offset"), [], + II_BC, FrmI>, + MMR6Arch<"bc16">, MicroMipsR6Inst16 { + let isBranch = 1; + let isTerminator = 1; + let isBarrier = 1; + let hasDelaySlot = 0; + let AdditionalPredicates = [RelocPIC]; + let Defs = [AT]; +} + +class BEQZC_BNEZC_MM16R6_DESC_BASE<string instr_asm> + : CBranchZeroMM<instr_asm, brtarget7_mm, GPRMM16Opnd>, MMR6Arch<instr_asm> { + let isBranch = 1; + let isTerminator = 1; + let hasDelaySlot = 0; + let Defs = [AT]; +} +class BEQZC16_MMR6_DESC : BEQZC_BNEZC_MM16R6_DESC_BASE<"beqzc16">; +class BNEZC16_MMR6_DESC : 
BEQZC_BNEZC_MM16R6_DESC_BASE<"bnezc16">; + class SUB_MMR6_DESC : ArithLogicR<"sub", GPR32Opnd>; class SUBU_MMR6_DESC : ArithLogicR<"subu", GPR32Opnd>; @@ -162,6 +322,35 @@ class CACHE_HINT_MMR6_DESC<string instr_asm, Operand MemOpnd, class CACHE_MMR6_DESC : CACHE_HINT_MMR6_DESC<"cache", mem_mm_12, GPR32Opnd>; class PREF_MMR6_DESC : CACHE_HINT_MMR6_DESC<"pref", mem_mm_12, GPR32Opnd>; +class PREFE_CACHEE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd, + RegisterOperand GPROpnd> : + CACHE_HINT_MMR6_DESC<instr_asm, MemOpnd, + GPROpnd> { + string DecoderMethod = "DecodePrefeOpMM"; +} + +class PREFE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"prefe", mem_mm_9, GPR32Opnd>; +class CACHEE_MMR6_DESC : PREFE_CACHEE_MMR6_DESC_BASE<"cachee", mem_mm_9, GPR32Opnd>; + +class LB_LBU_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd, + RegisterOperand GPROpnd> : MMR6Arch<instr_asm> { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins MemOpnd:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + string DecoderMethod = "DecodeLoadByte15"; + bit mayLoad = 1; +} +class LB_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lb", mem_mm_16, GPR32Opnd>; +class LBU_MMR6_DESC : LB_LBU_MMR6_DESC_BASE<"lbu", mem_mm_16, GPR32Opnd>; + +class LBE_LBUE_MMR6_DESC_BASE<string instr_asm, Operand MemOpnd, + RegisterOperand GPROpnd> + : LB_LBU_MMR6_DESC_BASE<instr_asm, MemOpnd, GPROpnd> { + let DecoderMethod = "DecodeLoadByte9"; +} +class LBE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbe", mem_mm_9, GPR32Opnd>; +class LBUE_MMR6_DESC : LBE_LBUE_MMR6_DESC_BASE<"lbue", mem_mm_9, GPR32Opnd>; + class CLO_CLZ_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> : MMR6Arch<instr_asm> { dag OutOperandList = (outs GPROpnd:$rt); @@ -174,10 +363,22 @@ class CLZ_MMR6_DESC : CLO_CLZ_MMR6_DESC_BASE<"clz", GPR32Opnd>; class EHB_MMR6_DESC : Barrier<"ehb">; class EI_MMR6_DESC : DEI_FT<"ei", GPR32Opnd>; +class DI_MMR6_DESC : DEI_FT<"di", GPR32Opnd>; class ERET_MMR6_DESC : ER_FT<"eret">; +class DERET_MMR6_DESC : ER_FT<"deret">; class ERETNC_MMR6_DESC : ER_FT<"eretnc">; +class JALRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO> + : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), + [(MipsJmpLink RO:$rs)], II_JALR, FrmR>, + MMR6Arch<opstr>, MicroMipsR6Inst16 { + let isCall = 1; + let hasDelaySlot = 0; + let Defs = [RA]; +} +class JALRC16_MMR6_DESC : JALRC16_MMR6_DESC_BASE<"jalr", GPR32Opnd>; + class JMP_MMR6_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd, RegisterOperand GPROpnd> : MMR6Arch<opstr> { @@ -200,6 +401,27 @@ class JIC_MMR6_DESC : JMP_MMR6_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, list<Register> Defs = [AT]; } +class JRC16_MMR6_DESC_BASE<string opstr, RegisterOperand RO> + : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), + [], II_JR, FrmR>, + MMR6Arch<opstr>, MicroMipsR6Inst16 { + let hasDelaySlot = 0; + let isBranch = 1; + let isIndirectBranch = 1; +} +class JRC16_MMR6_DESC : JRC16_MMR6_DESC_BASE<"jrc16", GPR32Opnd>; + +class JRCADDIUSP_MMR6_DESC + : MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jrcaddiusp\t$imm", + [], II_JRADDIUSP, FrmR>, + MMR6Arch<"jrcaddiusp">, MicroMipsR6Inst16 { + let hasDelaySlot = 0; + let isTerminator = 1; + let isBarrier = 1; + let isBranch = 1; + let isIndirectBranch = 1; +} + class ALIGN_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, Operand ImmOpnd> : MMR6Arch<instr_asm> { dag OutOperandList = (outs GPROpnd:$rd); @@ -241,7 +463,7 @@ class LSA_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, list<dag> Pattern = []; } -class 
LSA_MMR6_DESC : LSA_MMR6_DESC_BASE<"lsa", GPR32Opnd, uimm2>; +class LSA_MMR6_DESC : LSA_MMR6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1>; class PCREL_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, Operand ImmOpnd> : MMR6Arch<instr_asm> { @@ -264,6 +486,18 @@ class SELEQNE_Z_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> class SELEQZ_MMR6_DESC : SELEQNE_Z_MMR6_DESC_BASE<"seleqz", GPR32Opnd>; class SELNEZ_MMR6_DESC : SELEQNE_Z_MMR6_DESC_BASE<"selnez", GPR32Opnd>; +class PAUSE_MMR6_DESC : Barrier<"pause">; +class RDHWR_MMR6_DESC : MMR6Arch<"rdhwr">, MipsR6Inst { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins HWRegsOpnd:$rs, uimm3:$sel); + string AsmString = !strconcat("rdhwr", "\t$rt, $rs, $sel"); + list<dag> Pattern = []; + InstrItinClass Itinerary = II_RDHWR; + Format Form = FrmR; +} + +class WAIT_MMR6_DESC : WaitMM<"wait">; +class SSNOP_MMR6_DESC : Barrier<"ssnop">; class SLL_MMR6_DESC : shift_rotate_imm<"sll", uimm5, GPR32Opnd, II_SLL>; class DIV_MMR6_DESC : ArithLogicR<"div", GPR32Opnd>; class DIVU_MMR6_DESC : ArithLogicR<"divu", GPR32Opnd>; @@ -277,13 +511,426 @@ class ORI_MMR6_DESC : ArithLogicI<"ori", simm16, GPR32Opnd>; class XOR_MMR6_DESC : ArithLogicR<"xor", GPR32Opnd>; class XORI_MMR6_DESC : ArithLogicI<"xori", simm16, GPR32Opnd>; +class SWE_MMR6_DESC_BASE<string opstr, DAGOperand RO, DAGOperand MO, + SDPatternOperator OpNode = null_frag, + InstrItinClass Itin = NoItinerary, + ComplexPattern Addr = addr> : + InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"), + [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> { + let DecoderMethod = "DecodeMem"; + let mayStore = 1; +} +class SW_MMR6_DESC : Store<"sw", GPR32Opnd>; +class SWE_MMR6_DESC : SWE_MMR6_DESC_BASE<"swe", GPR32Opnd, mem_simm9>; + +class WRPGPR_WSBH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO> + : MMR6Arch<instr_asm> { + dag InOperandList = (ins RO:$rs); + dag OutOperandList = (outs RO:$rt); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs"); + list<dag> Pattern = []; + Format f = FrmR; + string BaseOpcode = instr_asm; + bit hasSideEffects = 0; +} +class WRPGPR_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wrpgpr", GPR32Opnd>; +class WSBH_MMR6_DESC : WRPGPR_WSBH_MMR6_DESC_BASE<"wsbh", GPR32Opnd>; + +/// Floating Point Instructions +class FARITH_MMR6_DESC_BASE<string instr_asm, RegisterOperand RC, + InstrItinClass Itin, bit isComm, + SDPatternOperator OpNode = null_frag> : HARDFLOAT { + dag OutOperandList = (outs RC:$fd); + dag InOperandList = (ins RC:$ft, RC:$fs); + string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft"); + list<dag> Pattern = [(set RC:$fd, (OpNode RC:$fs, RC:$ft))]; + InstrItinClass Itinerary = Itin; + bit isCommutable = isComm; +} +class FADD_S_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"add.s", FGR32Opnd, II_ADD_S, 1, fadd>; +class FADD_D_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"add.d", AFGR64Opnd, II_ADD_D, 1, fadd>; +class FSUB_S_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>; +class FSUB_D_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"sub.d", AFGR64Opnd, II_SUB_D, 0, fsub>; +class FMUL_S_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"mul.s", FGR32Opnd, II_MUL_S, 1, fmul>; +class FMUL_D_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"mul.d", AFGR64Opnd, II_MUL_D, 1, fmul>; +class FDIV_S_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"div.s", FGR32Opnd, II_DIV_S, 0, fdiv>; +class FDIV_D_MMR6_DESC + : FARITH_MMR6_DESC_BASE<"div.d", AFGR64Opnd, II_DIV_D, 0, fdiv>; +class MADDF_S_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd>, HARDFLOAT; +class 
MADDF_D_MMR6_DESC : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd>, HARDFLOAT; +class MSUBF_S_MMR6_DESC : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd>, HARDFLOAT; +class MSUBF_D_MMR6_DESC : COP1_4R_DESC_BASE<"msubf.d", FGR64Opnd>, HARDFLOAT; + +class FMOV_FNEG_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC, + RegisterOperand SrcRC, InstrItinClass Itin, + SDPatternOperator OpNode = null_frag> + : HARDFLOAT, NeverHasSideEffects { + dag OutOperandList = (outs DstRC:$ft); + dag InOperandList = (ins SrcRC:$fs); + string AsmString = !strconcat(instr_asm, "\t$ft, $fs"); + list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))]; + InstrItinClass Itinerary = Itin; + Format Form = FrmFR; +} +class FMOV_S_MMR6_DESC + : FMOV_FNEG_MMR6_DESC_BASE<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>; +class FMOV_D_MMR6_DESC + : FMOV_FNEG_MMR6_DESC_BASE<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>; +class FNEG_S_MMR6_DESC + : FMOV_FNEG_MMR6_DESC_BASE<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>; +class FNEG_D_MMR6_DESC + : FMOV_FNEG_MMR6_DESC_BASE<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>; + +class MAX_S_MMR6_DESC : MAX_MIN_DESC_BASE<"max.s", FGR32Opnd>, HARDFLOAT; +class MAX_D_MMR6_DESC : MAX_MIN_DESC_BASE<"max.d", FGR64Opnd>, HARDFLOAT; +class MIN_S_MMR6_DESC : MAX_MIN_DESC_BASE<"min.s", FGR32Opnd>, HARDFLOAT; +class MIN_D_MMR6_DESC : MAX_MIN_DESC_BASE<"min.d", FGR64Opnd>, HARDFLOAT; + +class MAXA_S_MMR6_DESC : MAX_MIN_DESC_BASE<"maxa.s", FGR32Opnd>, HARDFLOAT; +class MAXA_D_MMR6_DESC : MAX_MIN_DESC_BASE<"maxa.d", FGR64Opnd>, HARDFLOAT; +class MINA_S_MMR6_DESC : MAX_MIN_DESC_BASE<"mina.s", FGR32Opnd>, HARDFLOAT; +class MINA_D_MMR6_DESC : MAX_MIN_DESC_BASE<"mina.d", FGR64Opnd>, HARDFLOAT; + +class CVT_MMR6_DESC_BASE< + string instr_asm, RegisterOperand DstRC, RegisterOperand SrcRC, + InstrItinClass Itin, SDPatternOperator OpNode = null_frag> + : HARDFLOAT, NeverHasSideEffects { + dag OutOperandList = (outs DstRC:$ft); + dag InOperandList = (ins SrcRC:$fs); + string AsmString = !strconcat(instr_asm, "\t$ft, $fs"); + list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))]; + InstrItinClass Itinerary = Itin; + Format Form = FrmFR; +} + +class CVT_L_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.s", FGR64Opnd, FGR32Opnd, + II_CVT>; +class CVT_L_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.l.d", FGR64Opnd, FGR64Opnd, + II_CVT>; +class CVT_W_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.s", FGR32Opnd, FGR32Opnd, + II_CVT>; +class CVT_W_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.w.d", FGR32Opnd, AFGR64Opnd, + II_CVT>; +class CVT_D_S_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.s", FGR32Opnd, AFGR64Opnd, + II_CVT>; +class CVT_D_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.w", FGR32Opnd, AFGR64Opnd, + II_CVT>; +class CVT_D_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.d.l", FGR64Opnd, FGR64Opnd, + II_CVT>, FGR_64; +class CVT_S_D_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.d", AFGR64Opnd, FGR32Opnd, + II_CVT>; +class CVT_S_W_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.w", FGR32Opnd, FGR32Opnd, + II_CVT>; +class CVT_S_L_MMR6_DESC : CVT_MMR6_DESC_BASE<"cvt.s.l", FGR64Opnd, FGR32Opnd, + II_CVT>, FGR_64; + +multiclass CMP_CC_MMR6<bits<6> format, string Typestr, + RegisterOperand FGROpnd> { + def CMP_AF_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.af.", Typestr), format, FIELD_CMP_COND_AF>, + CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_UN_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.un.", Typestr), format, FIELD_CMP_COND_UN>, + CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_EQ_#NAME : 
POOL32F_CMP_FM< + !strconcat("cmp.eq.", Typestr), format, FIELD_CMP_COND_EQ>, + CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_UEQ_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.ueq.", Typestr), format, FIELD_CMP_COND_UEQ>, + CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_LT_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.lt.", Typestr), format, FIELD_CMP_COND_LT>, + CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_ULT_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.ult.", Typestr), format, FIELD_CMP_COND_ULT>, + CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_LE_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.le.", Typestr), format, FIELD_CMP_COND_LE>, + CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_ULE_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.ule.", Typestr), format, FIELD_CMP_COND_ULE>, + CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SAF_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.saf.", Typestr), format, FIELD_CMP_COND_SAF>, + CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SUN_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sun.", Typestr), format, FIELD_CMP_COND_SUN>, + CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SEQ_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.seq.", Typestr), format, FIELD_CMP_COND_SEQ>, + CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SUEQ_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sueq.", Typestr), format, FIELD_CMP_COND_SUEQ>, + CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SLT_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.slt.", Typestr), format, FIELD_CMP_COND_SLT>, + CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SULT_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sult.", Typestr), format, FIELD_CMP_COND_SULT>, + CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SLE_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sle.", Typestr), format, FIELD_CMP_COND_SLE>, + CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; + def CMP_SULE_#NAME : POOL32F_CMP_FM< + !strconcat("cmp.sule.", Typestr), format, FIELD_CMP_COND_SULE>, + CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, HARDFLOAT, R6MMR6Rel, + ISA_MICROMIPS32R6; +} + +class ABSS_FT_MMR6_DESC_BASE<string instr_asm, RegisterOperand DstRC, + RegisterOperand SrcRC, InstrItinClass Itin, + SDPatternOperator OpNode = null_frag> + : HARDFLOAT, NeverHasSideEffects { + dag OutOperandList = (outs DstRC:$ft); + dag InOperandList = (ins SrcRC:$fs); + string AsmString = !strconcat(instr_asm, "\t$ft, $fs"); + list<dag> Pattern = [(set DstRC:$ft, (OpNode SrcRC:$fs))]; + InstrItinClass Itinerary = Itin; + Format Form = FrmFR; + list<Predicate> EncodingPredicates = [HasStdEnc]; +} + +class ABS_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.s", FGR32Opnd, FGR32Opnd, + II_ABS, fabs>; +class ABS_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"abs.d", AFGR64Opnd, AFGR64Opnd, + II_ABS, fabs>; +class FLOOR_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.s", FGR64Opnd, + FGR32Opnd, II_FLOOR>; +class FLOOR_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.l.d", 
FGR64Opnd, + FGR64Opnd, II_FLOOR>; +class FLOOR_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.w.s", FGR32Opnd, + FGR32Opnd, II_FLOOR>; +class FLOOR_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"floor.w.d", FGR32Opnd, + AFGR64Opnd, II_FLOOR>; +class CEIL_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.l.s", FGR64Opnd, + FGR32Opnd, II_CEIL>; +class CEIL_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.l.d", FGR64Opnd, + FGR64Opnd, II_CEIL>; +class CEIL_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.w.s", FGR32Opnd, + FGR32Opnd, II_CEIL>; +class CEIL_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"ceil.w.d", FGR32Opnd, + AFGR64Opnd, II_CEIL>; +class TRUNC_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.s", FGR64Opnd, + FGR32Opnd, II_TRUNC>; +class TRUNC_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.l.d", FGR64Opnd, + FGR64Opnd, II_TRUNC>; +class TRUNC_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.s", FGR32Opnd, + FGR32Opnd, II_TRUNC>; +class TRUNC_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"trunc.w.d", FGR32Opnd, + AFGR64Opnd, II_TRUNC>; +class SQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.s", FGR32Opnd, FGR32Opnd, + II_SQRT_S, fsqrt>; +class SQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"sqrt.d", AFGR64Opnd, AFGR64Opnd, + II_SQRT_D, fsqrt>; +class RSQRT_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.s", FGR32Opnd, + FGR32Opnd, II_TRUNC>; +class RSQRT_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"rsqrt.d", FGR32Opnd, + AFGR64Opnd, II_TRUNC>; +class RECIP_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.s", FGR32Opnd, + FGR32Opnd, II_ROUND>; +class RECIP_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"recip.d", FGR32Opnd, FGR32Opnd, + II_ROUND>; +class ROUND_L_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.s", FGR64Opnd, + FGR32Opnd, II_ROUND>; +class ROUND_L_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.l.d", FGR64Opnd, + FGR64Opnd, II_ROUND>; +class ROUND_W_S_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.s", FGR32Opnd, + FGR32Opnd, II_ROUND>; +class ROUND_W_D_MMR6_DESC : ABSS_FT_MMR6_DESC_BASE<"round.w.d", FGR64Opnd, + FGR64Opnd, II_ROUND>; + +class SEL_S_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.s", FGR32Opnd>; +class SEL_D_MMR6_DESC : COP1_SEL_DESC_BASE<"sel.d", FGR64Opnd> { + // We must insert a SUBREG_TO_REG around $fd_in + bit usesCustomInserter = 1; +} + +class SELEQZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>; +class SELEQZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>; +class SELENZ_S_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>; +class SELENZ_D_MMR6_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>; +class RINT_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd>; +class RINT_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>; +class CLASS_S_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>; +class CLASS_D_MMR6_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd>; + +class STORE_MMR6_DESC_BASE<string opstr, DAGOperand RO> + : Store<opstr, RO>, MMR6Arch<opstr> { + let DecoderMethod = "DecodeMemMMImm16"; +} +class SB_MMR6_DESC : STORE_MMR6_DESC_BASE<"sb", GPR32Opnd>; + +class STORE_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO> + : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs); + dag InOperandList = (ins RO:$rt, mem_mm_9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + string DecoderMethod = "DecodeStoreEvaOpMM"; + bit mayStore = 1; +} +class SBE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sbe", GPR32Opnd>; +class SCE_MMR6_DESC : STORE_EVA_MMR6_DESC_BASE<"sce", GPR32Opnd>; +class SH_MMR6_DESC : STORE_MMR6_DESC_BASE<"sh", GPR32Opnd>; +class SHE_MMR6_DESC : 
STORE_EVA_MMR6_DESC_BASE<"she", GPR32Opnd>; +class LOAD_WORD_EVA_MMR6_DESC_BASE<string instr_asm, RegisterOperand RO> : + MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs RO:$rt); + dag InOperandList = (ins mem_mm_12:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + string DecoderMethod = "DecodeMemMMImm9"; + bit mayLoad = 1; +} +class LLE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lle", GPR32Opnd>; +class LWE_MMR6_DESC : LOAD_WORD_EVA_MMR6_DESC_BASE<"lwe", GPR32Opnd>; +class ADDU16_MMR6_DESC : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>, + MMR6Arch<"addu16">; +class AND16_MMR6_DESC : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>, + MMR6Arch<"and16">; +class ANDI16_MMR6_DESC : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, + MMR6Arch<"andi16">; +class NOT16_MMR6_DESC : NotMM16<"not16", GPRMM16Opnd>, MMR6Arch<"not16">; +class OR16_MMR6_DESC : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, + MMR6Arch<"or16">; +class SLL16_MMR6_DESC : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>, + MMR6Arch<"sll16">; +class SRL16_MMR6_DESC : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>, + MMR6Arch<"srl16">; +class BREAK16_MMR6_DESC : BrkSdbbp16MM<"break16">, MMR6Arch<"srl16">, + MicroMipsR6Inst16; +class LI16_MMR6_DESC : LoadImmMM16<"li16", li_simm7, GPRMM16Opnd>, + MMR6Arch<"srl16">, MicroMipsR6Inst16, IsAsCheapAsAMove; +class MOVE16_MMR6_DESC : MoveMM16<"move16", GPR32Opnd>, MMR6Arch<"srl16">, + MicroMipsR6Inst16; +class SDBBP16_MMR6_DESC : BrkSdbbp16MM<"sdbbp16">, MMR6Arch<"sdbbp16">, + MicroMipsR6Inst16; +class SUBU16_MMR6_DESC : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>, + MMR6Arch<"sdbbp16">, MicroMipsR6Inst16; +class XOR16_MMR6_DESC : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>, + MMR6Arch<"sdbbp16">, MicroMipsR6Inst16; + +class LW_MMR6_DESC : MMR6Arch<"lw">, MipsR6Inst { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins mem:$addr); + string AsmString = "lw\t$rt, $addr"; + let DecoderMethod = "DecodeMemMMImm16"; + let canFoldAsLoad = 1; + let mayLoad = 1; + list<dag> Pattern = [(set GPR32Opnd:$rt, (load addrDefault:$addr))]; + InstrItinClass Itinerary = II_LW; +} + +class LUI_MMR6_DESC : IsAsCheapAsAMove, MMR6Arch<"lui">, MipsR6Inst{ + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins uimm16:$imm16); + string AsmString = "lui\t$rt, $imm16"; + list<dag> Pattern = []; + bit hasSideEffects = 0; + bit isReMaterializable = 1; + InstrItinClass Itinerary = II_LUI; + Format Form = FrmI; +} + +class SYNC_MMR6_DESC : MMR6Arch<"sync">, MipsR6Inst { + dag OutOperandList = (outs); + dag InOperandList = (ins i32imm:$stype); + string AsmString = !strconcat("sync", "\t$stype"); + list<dag> Pattern = [(MipsSync imm:$stype)]; + InstrItinClass Itinerary = NoItinerary; + bit HasSideEffects = 1; +} + +class SYNCI_MMR6_DESC : SYNCI_FT<"synci"> { + let DecoderMethod = "DecodeSynciR6"; +} + +class RDPGPR_MMR6_DESC : MMR6Arch<"rdpgpr">, MipsR6Inst { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins GPR32Opnd:$rd); + string AsmString = !strconcat("rdpgpr", "\t$rt, $rd"); +} + +class SDBBP_MMR6_DESC : MipsR6Inst { + dag OutOperandList = (outs); + dag InOperandList = (ins uimm20:$code_); + string AsmString = !strconcat("sdbbp", "\t$code_"); + list<dag> Pattern = []; +} + +class LWM16_MMR6_DESC + : MicroMipsInst16<(outs reglist16:$rt), (ins mem_mm_4sp:$addr), + !strconcat("lwm16", "\t$rt, $addr"), [], + NoItinerary, FrmI>, + MMR6Arch<"lwm16">, MicroMipsR6Inst16 { + let DecoderMethod = 
"DecodeMemMMReglistImm4Lsl2"; + let mayLoad = 1; + InstrItinClass Itin = NoItinerary; + ComplexPattern Addr = addr; +} + +class SWM16_MMR6_DESC + : MicroMipsInst16<(outs), (ins reglist16:$rt, mem_mm_4sp:$addr), + !strconcat("swm16", "\t$rt, $addr"), [], + NoItinerary, FrmI>, + MMR6Arch<"swm16">, MicroMipsR6Inst16 { + let DecoderMethod = "DecodeMemMMReglistImm4Lsl2"; + let mayStore = 1; + InstrItinClass Itin = NoItinerary; + ComplexPattern Addr = addr; +} + +class SB16_MMR6_DESC_BASE<string opstr, DAGOperand RTOpnd, DAGOperand RO, + SDPatternOperator OpNode, InstrItinClass Itin, + Operand MemOpnd> + : MicroMipsInst16<(outs), (ins RTOpnd:$rt, MemOpnd:$addr), + !strconcat(opstr, "\t$rt, $addr"), [], Itin, FrmI>, + MMR6Arch<opstr>, MicroMipsR6Inst16 { + let DecoderMethod = "DecodeMemMMImm4"; + let mayStore = 1; +} +class SB16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sb16", GPRMM16OpndZero, GPRMM16Opnd, + truncstorei8, II_SB, mem_mm_4>; +class SH16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sh16", GPRMM16OpndZero, GPRMM16Opnd, + truncstorei16, II_SH, mem_mm_4_lsl1>; +class SW16_MMR6_DESC : SB16_MMR6_DESC_BASE<"sw16", GPRMM16OpndZero, GPRMM16Opnd, + store, II_SW, mem_mm_4_lsl2>; + +class SWSP_MMR6_DESC + : MicroMipsInst16<(outs), (ins GPR32Opnd:$rt, mem_mm_sp_imm5_lsl2:$offset), + !strconcat("sw", "\t$rt, $offset"), [], II_SW, FrmI>, + MMR6Arch<"sw">, MicroMipsR6Inst16 { + let DecoderMethod = "DecodeMemMMSPImm5Lsl2"; + let mayStore = 1; +} + //===----------------------------------------------------------------------===// // // Instruction Definitions // //===----------------------------------------------------------------------===// -let DecoderNamespace = "MicroMips32r6" in { +let DecoderNamespace = "MicroMipsR6" in { def ADD_MMR6 : StdMMR6Rel, ADD_MMR6_DESC, ADD_MMR6_ENC, ISA_MICROMIPS32R6; def ADDIU_MMR6 : StdMMR6Rel, ADDIU_MMR6_DESC, ADDIU_MMR6_ENC, ISA_MICROMIPS32R6; def ADDU_MMR6 : StdMMR6Rel, ADDU_MMR6_DESC, ADDU_MMR6_ENC, ISA_MICROMIPS32R6; @@ -298,6 +945,11 @@ def ALIGN_MMR6 : R6MMR6Rel, ALIGN_MMR6_ENC, ALIGN_MMR6_DESC, ISA_MICROMIPS32R6; def AUI_MMR6 : R6MMR6Rel, AUI_MMR6_ENC, AUI_MMR6_DESC, ISA_MICROMIPS32R6; def BALC_MMR6 : R6MMR6Rel, BALC_MMR6_ENC, BALC_MMR6_DESC, ISA_MICROMIPS32R6; def BC_MMR6 : R6MMR6Rel, BC_MMR6_ENC, BC_MMR6_DESC, ISA_MICROMIPS32R6; +def BC16_MMR6 : StdMMR6Rel, BC16_MMR6_DESC, BC16_MMR6_ENC, ISA_MICROMIPS32R6; +def BEQZC16_MMR6 : StdMMR6Rel, BEQZC16_MMR6_DESC, BEQZC16_MMR6_ENC, + ISA_MICROMIPS32R6; +def BNEZC16_MMR6 : StdMMR6Rel, BNEZC16_MMR6_DESC, BNEZC16_MMR6_ENC, + ISA_MICROMIPS32R6; def BITSWAP_MMR6 : R6MMR6Rel, BITSWAP_MMR6_ENC, BITSWAP_MMR6_DESC, ISA_MICROMIPS32R6; def BEQZALC_MMR6 : R6MMR6Rel, BEQZALC_MMR6_ENC, BEQZALC_MMR6_DESC, @@ -320,13 +972,21 @@ def DIV_MMR6 : R6MMR6Rel, DIV_MMR6_DESC, DIV_MMR6_ENC, ISA_MICROMIPS32R6; def DIVU_MMR6 : R6MMR6Rel, DIVU_MMR6_DESC, DIVU_MMR6_ENC, ISA_MICROMIPS32R6; def EHB_MMR6 : StdMMR6Rel, EHB_MMR6_DESC, EHB_MMR6_ENC, ISA_MICROMIPS32R6; def EI_MMR6 : StdMMR6Rel, EI_MMR6_DESC, EI_MMR6_ENC, ISA_MICROMIPS32R6; -def ERET_MMR6 : R6MMR6Rel, ERET_MMR6_DESC, ERET_MMR6_ENC, ISA_MICROMIPS32R6; +def DI_MMR6 : StdMMR6Rel, DI_MMR6_DESC, DI_MMR6_ENC, ISA_MICROMIPS32R6; +def ERET_MMR6 : StdMMR6Rel, ERET_MMR6_DESC, ERET_MMR6_ENC, ISA_MICROMIPS32R6; +def DERET_MMR6 : StdMMR6Rel, DERET_MMR6_DESC, DERET_MMR6_ENC, ISA_MICROMIPS32R6; def ERETNC_MMR6 : R6MMR6Rel, ERETNC_MMR6_DESC, ERETNC_MMR6_ENC, ISA_MICROMIPS32R6; +def JALRC16_MMR6 : R6MMR6Rel, JALRC16_MMR6_DESC, JALRC16_MMR6_ENC, + ISA_MICROMIPS32R6; def JIALC_MMR6 : R6MMR6Rel, JIALC_MMR6_ENC, 
JIALC_MMR6_DESC, ISA_MICROMIPS32R6; def JIC_MMR6 : R6MMR6Rel, JIC_MMR6_ENC, JIC_MMR6_DESC, ISA_MICROMIPS32R6; +def JRC16_MMR6 : R6MMR6Rel, JRC16_MMR6_DESC, JRC16_MMR6_ENC, ISA_MICROMIPS32R6; +def JRCADDIUSP_MMR6 : R6MMR6Rel, JRCADDIUSP_MMR6_DESC, JRCADDIUSP_MMR6_ENC, + ISA_MICROMIPS32R6; def LSA_MMR6 : R6MMR6Rel, LSA_MMR6_ENC, LSA_MMR6_DESC, ISA_MICROMIPS32R6; def LWPC_MMR6 : R6MMR6Rel, LWPC_MMR6_ENC, LWPC_MMR6_DESC, ISA_MICROMIPS32R6; +def LWM16_MMR6 : StdMMR6Rel, LWM16_MMR6_DESC, LWM16_MMR6_ENC, ISA_MICROMIPS32R6; def MOD_MMR6 : R6MMR6Rel, MOD_MMR6_DESC, MOD_MMR6_ENC, ISA_MICROMIPS32R6; def MODU_MMR6 : R6MMR6Rel, MODU_MMR6_DESC, MODU_MMR6_ENC, ISA_MICROMIPS32R6; def MUL_MMR6 : R6MMR6Rel, MUL_MMR6_DESC, MUL_MMR6_ENC, ISA_MICROMIPS32R6; @@ -337,17 +997,211 @@ def NOR_MMR6 : StdMMR6Rel, NOR_MMR6_DESC, NOR_MMR6_ENC, ISA_MICROMIPS32R6; def OR_MMR6 : StdMMR6Rel, OR_MMR6_DESC, OR_MMR6_ENC, ISA_MICROMIPS32R6; def ORI_MMR6 : StdMMR6Rel, ORI_MMR6_DESC, ORI_MMR6_ENC, ISA_MICROMIPS32R6; def PREF_MMR6 : R6MMR6Rel, PREF_MMR6_ENC, PREF_MMR6_DESC, ISA_MICROMIPS32R6; +def SB16_MMR6 : StdMMR6Rel, SB16_MMR6_DESC, SB16_MMR6_ENC, ISA_MICROMIPS32R6; def SEB_MMR6 : StdMMR6Rel, SEB_MMR6_DESC, SEB_MMR6_ENC, ISA_MICROMIPS32R6; def SEH_MMR6 : StdMMR6Rel, SEH_MMR6_DESC, SEH_MMR6_ENC, ISA_MICROMIPS32R6; def SELEQZ_MMR6 : R6MMR6Rel, SELEQZ_MMR6_ENC, SELEQZ_MMR6_DESC, ISA_MICROMIPS32R6; def SELNEZ_MMR6 : R6MMR6Rel, SELNEZ_MMR6_ENC, SELNEZ_MMR6_DESC, ISA_MICROMIPS32R6; +def SH16_MMR6 : StdMMR6Rel, SH16_MMR6_DESC, SH16_MMR6_ENC, ISA_MICROMIPS32R6; def SLL_MMR6 : StdMMR6Rel, SLL_MMR6_DESC, SLL_MMR6_ENC, ISA_MICROMIPS32R6; def SUB_MMR6 : StdMMR6Rel, SUB_MMR6_DESC, SUB_MMR6_ENC, ISA_MICROMIPS32R6; def SUBU_MMR6 : StdMMR6Rel, SUBU_MMR6_DESC, SUBU_MMR6_ENC, ISA_MICROMIPS32R6; +def SW16_MMR6 : StdMMR6Rel, SW16_MMR6_DESC, SW16_MMR6_ENC, ISA_MICROMIPS32R6; +def SWM16_MMR6 : StdMMR6Rel, SWM16_MMR6_DESC, SWM16_MMR6_ENC, ISA_MICROMIPS32R6; +def SWSP_MMR6 : StdMMR6Rel, SWSP_MMR6_DESC, SWSP_MMR6_ENC, ISA_MICROMIPS32R6; +def PREFE_MMR6 : StdMMR6Rel, PREFE_MMR6_ENC, PREFE_MMR6_DESC, ISA_MICROMIPS32R6; +def CACHEE_MMR6 : StdMMR6Rel, CACHEE_MMR6_ENC, CACHEE_MMR6_DESC, + ISA_MICROMIPS32R6; +def WRPGPR_MMR6 : StdMMR6Rel, WRPGPR_MMR6_ENC, WRPGPR_MMR6_DESC, + ISA_MICROMIPS32R6; +def WSBH_MMR6 : StdMMR6Rel, WSBH_MMR6_ENC, WSBH_MMR6_DESC, ISA_MICROMIPS32R6; +def LB_MMR6 : R6MMR6Rel, LB_MMR6_ENC, LB_MMR6_DESC, ISA_MICROMIPS32R6; +def LBU_MMR6 : R6MMR6Rel, LBU_MMR6_ENC, LBU_MMR6_DESC, ISA_MICROMIPS32R6; +def LBE_MMR6 : R6MMR6Rel, LBE_MMR6_ENC, LBE_MMR6_DESC, ISA_MICROMIPS32R6; +def LBUE_MMR6 : R6MMR6Rel, LBUE_MMR6_ENC, LBUE_MMR6_DESC, ISA_MICROMIPS32R6; +def PAUSE_MMR6 : StdMMR6Rel, PAUSE_MMR6_DESC, PAUSE_MMR6_ENC, ISA_MICROMIPS32R6; +def RDHWR_MMR6 : R6MMR6Rel, RDHWR_MMR6_DESC, RDHWR_MMR6_ENC, ISA_MICROMIPS32R6; +def WAIT_MMR6 : StdMMR6Rel, WAIT_MMR6_DESC, WAIT_MMR6_ENC, ISA_MICROMIPS32R6; +def SSNOP_MMR6 : StdMMR6Rel, SSNOP_MMR6_DESC, SSNOP_MMR6_ENC, ISA_MICROMIPS32R6; +def SYNC_MMR6 : StdMMR6Rel, SYNC_MMR6_DESC, SYNC_MMR6_ENC, ISA_MICROMIPS32R6; +def SYNCI_MMR6 : StdMMR6Rel, SYNCI_MMR6_DESC, SYNCI_MMR6_ENC, ISA_MICROMIPS32R6; +def RDPGPR_MMR6 : R6MMR6Rel, RDPGPR_MMR6_DESC, RDPGPR_MMR6_ENC, + ISA_MICROMIPS32R6; +def SDBBP_MMR6 : R6MMR6Rel, SDBBP_MMR6_DESC, SDBBP_MMR6_ENC, ISA_MICROMIPS32R6; def XOR_MMR6 : StdMMR6Rel, XOR_MMR6_DESC, XOR_MMR6_ENC, ISA_MICROMIPS32R6; def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6; +let DecoderMethod = "DecodeMemMMImm16" in { + def SW_MMR6 : StdMMR6Rel, SW_MMR6_DESC, SW_MMR6_ENC, 
ISA_MICROMIPS32R6; +} +let DecoderMethod = "DecodeMemMMImm9" in { + def SWE_MMR6 : StdMMR6Rel, SWE_MMR6_DESC, SWE_MMR6_ENC, ISA_MICROMIPS32R6; +} +/// Floating Point Instructions +def FADD_S_MMR6 : StdMMR6Rel, FADD_S_MMR6_ENC, FADD_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FADD_D_MMR6 : StdMMR6Rel, FADD_D_MMR6_ENC, FADD_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FSUB_S_MMR6 : StdMMR6Rel, FSUB_S_MMR6_ENC, FSUB_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FSUB_D_MMR6 : StdMMR6Rel, FSUB_D_MMR6_ENC, FSUB_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FMUL_S_MMR6 : StdMMR6Rel, FMUL_S_MMR6_ENC, FMUL_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FMUL_D_MMR6 : StdMMR6Rel, FMUL_D_MMR6_ENC, FMUL_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FDIV_S_MMR6 : StdMMR6Rel, FDIV_S_MMR6_ENC, FDIV_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FDIV_D_MMR6 : StdMMR6Rel, FDIV_D_MMR6_ENC, FDIV_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def MADDF_S_MMR6 : R6MMR6Rel, MADDF_S_MMR6_ENC, MADDF_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def MADDF_D_MMR6 : R6MMR6Rel, MADDF_D_MMR6_ENC, MADDF_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def MSUBF_S_MMR6 : R6MMR6Rel, MSUBF_S_MMR6_ENC, MSUBF_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def MSUBF_D_MMR6 : R6MMR6Rel, MSUBF_D_MMR6_ENC, MSUBF_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FMOV_S_MMR6 : StdMMR6Rel, FMOV_S_MMR6_ENC, FMOV_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FMOV_D_MMR6 : StdMMR6Rel, FMOV_D_MMR6_ENC, FMOV_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FNEG_S_MMR6 : StdMMR6Rel, FNEG_S_MMR6_ENC, FNEG_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FNEG_D_MMR6 : StdMMR6Rel, FNEG_D_MMR6_ENC, FNEG_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def MAX_S_MMR6 : R6MMR6Rel, MAX_S_MMR6_ENC, MAX_S_MMR6_DESC, ISA_MICROMIPS32R6; +def MAX_D_MMR6 : R6MMR6Rel, MAX_D_MMR6_ENC, MAX_D_MMR6_DESC, ISA_MICROMIPS32R6; +def MIN_S_MMR6 : R6MMR6Rel, MIN_S_MMR6_ENC, MIN_S_MMR6_DESC, ISA_MICROMIPS32R6; +def MIN_D_MMR6 : R6MMR6Rel, MIN_D_MMR6_ENC, MIN_D_MMR6_DESC, ISA_MICROMIPS32R6; +def MAXA_S_MMR6 : R6MMR6Rel, MAXA_S_MMR6_ENC, MAXA_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def MAXA_D_MMR6 : R6MMR6Rel, MAXA_D_MMR6_ENC, MAXA_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def MINA_S_MMR6 : R6MMR6Rel, MINA_S_MMR6_ENC, MINA_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def MINA_D_MMR6 : R6MMR6Rel, MINA_D_MMR6_ENC, MINA_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_L_S_MMR6 : StdMMR6Rel, CVT_L_S_MMR6_ENC, CVT_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_L_D_MMR6 : StdMMR6Rel, CVT_L_D_MMR6_ENC, CVT_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_W_S_MMR6 : StdMMR6Rel, CVT_W_S_MMR6_ENC, CVT_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_W_D_MMR6 : StdMMR6Rel, CVT_W_D_MMR6_ENC, CVT_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_D_S_MMR6 : StdMMR6Rel, CVT_D_S_MMR6_ENC, CVT_D_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_D_W_MMR6 : StdMMR6Rel, CVT_D_W_MMR6_ENC, CVT_D_W_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_D_L_MMR6 : StdMMR6Rel, CVT_D_L_MMR6_ENC, CVT_D_L_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_S_D_MMR6 : StdMMR6Rel, CVT_S_D_MMR6_ENC, CVT_S_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_S_W_MMR6 : StdMMR6Rel, CVT_S_W_MMR6_ENC, CVT_S_W_MMR6_DESC, + ISA_MICROMIPS32R6; +def CVT_S_L_MMR6 : StdMMR6Rel, CVT_S_L_MMR6_ENC, CVT_S_L_MMR6_DESC, + ISA_MICROMIPS32R6; +defm S_MMR6 : CMP_CC_MMR6<0b000101, "s", FGR32Opnd>; +defm D_MMR6 : CMP_CC_MMR6<0b010101, "d", FGR64Opnd>; +def ABS_S_MMR6 : StdMMR6Rel, ABS_S_MMR6_ENC, ABS_S_MMR6_DESC, ISA_MICROMIPS32R6; +def ABS_D_MMR6 : StdMMR6Rel, ABS_D_MMR6_ENC, ABS_D_MMR6_DESC, ISA_MICROMIPS32R6; +def FLOOR_L_S_MMR6 : StdMMR6Rel, FLOOR_L_S_MMR6_ENC, FLOOR_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FLOOR_L_D_MMR6 : StdMMR6Rel, 
FLOOR_L_D_MMR6_ENC, FLOOR_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def FLOOR_W_S_MMR6 : StdMMR6Rel, FLOOR_W_S_MMR6_ENC, FLOOR_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def FLOOR_W_D_MMR6 : StdMMR6Rel, FLOOR_W_D_MMR6_ENC, FLOOR_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CEIL_L_S_MMR6 : StdMMR6Rel, CEIL_L_S_MMR6_ENC, CEIL_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CEIL_L_D_MMR6 : StdMMR6Rel, CEIL_L_D_MMR6_ENC, CEIL_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CEIL_W_S_MMR6 : StdMMR6Rel, CEIL_W_S_MMR6_ENC, CEIL_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CEIL_W_D_MMR6 : StdMMR6Rel, CEIL_W_D_MMR6_ENC, CEIL_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def TRUNC_L_S_MMR6 : StdMMR6Rel, TRUNC_L_S_MMR6_ENC, TRUNC_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def TRUNC_L_D_MMR6 : StdMMR6Rel, TRUNC_L_D_MMR6_ENC, TRUNC_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def TRUNC_W_S_MMR6 : StdMMR6Rel, TRUNC_W_S_MMR6_ENC, TRUNC_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def TRUNC_W_D_MMR6 : StdMMR6Rel, TRUNC_W_D_MMR6_ENC, TRUNC_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SQRT_S_MMR6 : StdMMR6Rel, SQRT_S_MMR6_ENC, SQRT_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SQRT_D_MMR6 : StdMMR6Rel, SQRT_D_MMR6_ENC, SQRT_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def RSQRT_S_MMR6 : StdMMR6Rel, RSQRT_S_MMR6_ENC, RSQRT_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RSQRT_D_MMR6 : StdMMR6Rel, RSQRT_D_MMR6_ENC, RSQRT_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SB_MMR6 : StdMMR6Rel, SB_MMR6_DESC, SB_MMR6_ENC, ISA_MICROMIPS32R6; +def SBE_MMR6 : StdMMR6Rel, SBE_MMR6_DESC, SBE_MMR6_ENC, ISA_MICROMIPS32R6; +def SCE_MMR6 : StdMMR6Rel, SCE_MMR6_DESC, SCE_MMR6_ENC, ISA_MICROMIPS32R6; +def SH_MMR6 : StdMMR6Rel, SH_MMR6_DESC, SH_MMR6_ENC, ISA_MICROMIPS32R6; +def SHE_MMR6 : StdMMR6Rel, SHE_MMR6_DESC, SHE_MMR6_ENC, ISA_MICROMIPS32R6; +def LLE_MMR6 : StdMMR6Rel, LLE_MMR6_DESC, LLE_MMR6_ENC, ISA_MICROMIPS32R6; +def LWE_MMR6 : StdMMR6Rel, LWE_MMR6_DESC, LWE_MMR6_ENC, ISA_MICROMIPS32R6; +def LW_MMR6 : StdMMR6Rel, LW_MMR6_DESC, LW_MMR6_ENC, ISA_MICROMIPS32R6; +def LUI_MMR6 : R6MMR6Rel, LUI_MMR6_DESC, LUI_MMR6_ENC, ISA_MICROMIPS32R6; +def ADDU16_MMR6 : StdMMR6Rel, ADDU16_MMR6_DESC, ADDU16_MMR6_ENC, + ISA_MICROMIPS32R6; +def AND16_MMR6 : StdMMR6Rel, AND16_MMR6_DESC, AND16_MMR6_ENC, + ISA_MICROMIPS32R6; +def ANDI16_MMR6 : StdMMR6Rel, ANDI16_MMR6_DESC, ANDI16_MMR6_ENC, + ISA_MICROMIPS32R6; +def NOT16_MMR6 : StdMMR6Rel, NOT16_MMR6_DESC, NOT16_MMR6_ENC, + ISA_MICROMIPS32R6; +def OR16_MMR6 : StdMMR6Rel, OR16_MMR6_DESC, OR16_MMR6_ENC, + ISA_MICROMIPS32R6; +def SLL16_MMR6 : StdMMR6Rel, SLL16_MMR6_DESC, SLL16_MMR6_ENC, + ISA_MICROMIPS32R6; +def SRL16_MMR6 : StdMMR6Rel, SRL16_MMR6_DESC, SRL16_MMR6_ENC, + ISA_MICROMIPS32R6; +def BREAK16_MMR6 : StdMMR6Rel, BREAK16_MMR6_DESC, BREAK16_MMR6_ENC, + ISA_MICROMIPS32R6; +def LI16_MMR6 : StdMMR6Rel, LI16_MMR6_DESC, LI16_MMR6_ENC, + ISA_MICROMIPS32R6; +def MOVE16_MMR6 : StdMMR6Rel, MOVE16_MMR6_DESC, MOVE16_MMR6_ENC, + ISA_MICROMIPS32R6; +def SDBBP16_MMR6 : StdMMR6Rel, SDBBP16_MMR6_DESC, SDBBP16_MMR6_ENC, + ISA_MICROMIPS32R6; +def SUBU16_MMR6 : StdMMR6Rel, SUBU16_MMR6_DESC, SUBU16_MMR6_ENC, + ISA_MICROMIPS32R6; +def XOR16_MMR6 : StdMMR6Rel, XOR16_MMR6_DESC, XOR16_MMR6_ENC, + ISA_MICROMIPS32R6; +def RECIP_S_MMR6 : StdMMR6Rel, RECIP_S_MMR6_ENC, RECIP_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RECIP_D_MMR6 : StdMMR6Rel, RECIP_D_MMR6_ENC, RECIP_D_MMR6_DESC, ISA_MICROMIPS32R6; +def RINT_S_MMR6 : StdMMR6Rel, RINT_S_MMR6_ENC, RINT_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def RINT_D_MMR6 : StdMMR6Rel, RINT_D_MMR6_ENC, RINT_D_MMR6_DESC, ISA_MICROMIPS32R6; +def ROUND_L_S_MMR6 : StdMMR6Rel, ROUND_L_S_MMR6_ENC, 
ROUND_L_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_L_D_MMR6 : StdMMR6Rel, ROUND_L_D_MMR6_ENC, ROUND_L_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_W_S_MMR6 : StdMMR6Rel, ROUND_W_S_MMR6_ENC, ROUND_W_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def ROUND_W_D_MMR6 : StdMMR6Rel, ROUND_W_D_MMR6_ENC, ROUND_W_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SEL_S_MMR6 : StdMMR6Rel, SEL_S_MMR6_ENC, SEL_S_MMR6_DESC, ISA_MICROMIPS32R6; +def SEL_D_MMR6 : StdMMR6Rel, SEL_D_MMR6_ENC, SEL_D_MMR6_DESC, ISA_MICROMIPS32R6; +def SELEQZ_S_MMR6 : StdMMR6Rel, SELEQZ_S_MMR6_ENC, SELEQZ_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELEQZ_D_MMR6 : StdMMR6Rel, SELEQZ_D_MMR6_ENC, SELEQZ_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELENZ_S_MMR6 : StdMMR6Rel, SELENZ_S_MMR6_ENC, SELENZ_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def SELENZ_D_MMR6 : StdMMR6Rel, SELENZ_D_MMR6_ENC, SELENZ_D_MMR6_DESC, + ISA_MICROMIPS32R6; +def CLASS_S_MMR6 : StdMMR6Rel, CLASS_S_MMR6_ENC, CLASS_S_MMR6_DESC, + ISA_MICROMIPS32R6; +def CLASS_D_MMR6 : StdMMR6Rel, CLASS_D_MMR6_ENC, CLASS_D_MMR6_DESC, + ISA_MICROMIPS32R6; } //===----------------------------------------------------------------------===// @@ -357,4 +1211,23 @@ def XORI_MMR6 : StdMMR6Rel, XORI_MMR6_DESC, XORI_MMR6_ENC, ISA_MICROMIPS32R6; //===----------------------------------------------------------------------===// def : MipsInstAlias<"ei", (EI_MMR6 ZERO), 1>, ISA_MICROMIPS32R6; +def : MipsInstAlias<"di", (DI_MMR6 ZERO), 1>, ISA_MICROMIPS32R6; def : MipsInstAlias<"nop", (SLL_MMR6 ZERO, ZERO, 0), 1>, ISA_MICROMIPS32R6; +def B_MMR6_Pseudo : MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset), + !strconcat("b", "\t$offset")> { + string DecoderNamespace = "MicroMipsR6"; +} +def : MipsInstAlias<"sync", (SYNC_MMR6 0), 1>, ISA_MICROMIPS32R6; +def : MipsInstAlias<"sdbbp", (SDBBP_MMR6 0), 1>, ISA_MICROMIPS32R6; +def : MipsInstAlias<"rdhwr $rt, $rs", + (RDHWR_MMR6 GPR32Opnd:$rt, HWRegsOpnd:$rs, 0), 1>, + ISA_MICROMIPS32R6; + +//===----------------------------------------------------------------------===// +// +// MicroMips arbitrary patterns that map to one or more instructions +// +//===----------------------------------------------------------------------===// + +def : MipsPat<(store GPRMM16:$src, addrimm4lsl2:$addr), + (SW16_MMR6 GPRMM16:$src, addrimm4lsl2:$addr)>, ISA_MICROMIPS32R6; diff --git a/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td new file mode 100644 index 0000000..da305a2 --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrFormats.td @@ -0,0 +1,86 @@ +//=- MicroMips64r6InstrFormats.td - Instruction Formats -*- tablegen -* -=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes microMIPS64r6 instruction formats. 
+// +//===----------------------------------------------------------------------===// + +class DAUI_FM_MMR6 { + bits<5> rt; + bits<5> rs; + bits<16> imm; + + bits<32> Inst; + + let Inst{31-26} = 0b111100; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-0} = imm; +} + +class POOL32I_ADD_IMM_FM_MMR6<bits<5> funct> { + bits<5> rs; + bits<16> imm; + + bits<32> Inst; + + let Inst{31-26} = 0b010000; + let Inst{25-21} = funct; + let Inst{20-16} = rs; + let Inst{15-0} = imm; +} + +class POOL32S_EXTBITS_FM_MMR6<bits<6> funct> { + bits<5> rt; + bits<5> rs; + bits<5> size; + bits<5> pos; + + bits<32> Inst; + + let Inst{31-26} = 0b010110; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = size; + let Inst{10-6} = pos; + let Inst{5-0} = funct; +} + +class POOL32S_DALIGN_FM_MMR6 { + bits<5> rs; + bits<5> rt; + bits<5> rd; + bits<3> bp; + + bits<32> Inst; + + let Inst{31-26} = 0b010110; + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-11} = rd; + let Inst{10-8} = bp; + let Inst{7-6} = 0b00; + let Inst{5-0} = 0b011100; +} + +class POOL32A_DIVMOD_FM_MMR6<string instr_asm, bits<9> funct> + : MMR6Arch<instr_asm> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + + bits<32> Inst; + + let Inst{31-26} = 0b010110; + let Inst{25-21} = rd; + let Inst{20-16} = rs; + let Inst{15-11} = rt; + let Inst{10-9} = 0b00; + let Inst{8-0} = funct; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td new file mode 100644 index 0000000..ec1aef8 --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MicroMips64r6InstrInfo.td @@ -0,0 +1,119 @@ +//=- MicroMips64r6InstrInfo.td - Instruction Information -*- tablegen -*- -=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes MicroMips64r6 instructions. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// Instruction Encodings +// +//===----------------------------------------------------------------------===// + +class DAUI_MMR6_ENC : DAUI_FM_MMR6; +class DAHI_MMR6_ENC : POOL32I_ADD_IMM_FM_MMR6<0b10001>; +class DATI_MMR6_ENC : POOL32I_ADD_IMM_FM_MMR6<0b10000>; +class DEXT_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b101100>; +class DEXTM_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b100100>; +class DEXTU_MMR6_ENC : POOL32S_EXTBITS_FM_MMR6<0b010100>; +class DALIGN_MMR6_ENC : POOL32S_DALIGN_FM_MMR6; +class DDIV_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddiv", 0b100011000>; +class DMOD_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmod", 0b101011000>; +class DDIVU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"ddivu", 0b110011000>; +class DMODU_MM64R6_ENC : POOL32A_DIVMOD_FM_MMR6<"dmodu", 0b111011000>; + +//===----------------------------------------------------------------------===// +// +// Instruction Descriptions +// +//===----------------------------------------------------------------------===// + +class DAUI_MMR6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> + : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins GPROpnd:$rs, simm16:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $imm"); + list<dag> Pattern = []; +} +class DAUI_MMR6_DESC : DAUI_MMR6_DESC_BASE<"daui", GPR64Opnd>; + +class DAHI_DATI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> + : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs GPROpnd:$rs); + dag InOperandList = (ins GPROpnd:$rt, simm16:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $imm"); + string Constraints = "$rs = $rt"; +} +class DAHI_MMR6_DESC : DAHI_DATI_DESC_BASE<"dahi", GPR64Opnd>; +class DATI_MMR6_DESC : DAHI_DATI_DESC_BASE<"dati", GPR64Opnd>; + +class EXTBITS_DESC_BASE<string instr_asm, RegisterOperand RO, Operand PosOpnd, + Operand SizeOpnd, SDPatternOperator Op = null_frag> + : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs RO:$rt); + dag InOperandList = (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $pos, $size"); + list<dag> Pattern = [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))]; + InstrItinClass Itinerary = II_EXT; + Format Form = FrmR; + string BaseOpcode = instr_asm; +} +// TODO: Add 'pos + size' constraint check to dext* instructions +// DEXT: 0 < pos + size <= 63 +// DEXTM, DEXTU: 32 < pos + size <= 64 +class DEXT_MMR6_DESC : EXTBITS_DESC_BASE<"dext", GPR64Opnd, uimm5, + uimm5_plus1, MipsExt>; +class DEXTM_MMR6_DESC : EXTBITS_DESC_BASE<"dextm", GPR64Opnd, uimm5, + uimm5_plus33, MipsExt>; +class DEXTU_MMR6_DESC : EXTBITS_DESC_BASE<"dextu", GPR64Opnd, uimm5_plus32, + uimm5_plus1, MipsExt>; + +class DALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, + Operand ImmOpnd> : MMR6Arch<instr_asm>, MipsR6Inst { + dag OutOperandList = (outs GPROpnd:$rd); + dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp); + string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp"); + list<dag> Pattern = []; +} + +class DALIGN_MMR6_DESC : DALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3>; + +class DDIV_MM64R6_DESC : ArithLogicR<"ddiv", GPR32Opnd>; +class DMOD_MM64R6_DESC : ArithLogicR<"dmod", GPR32Opnd>; +class DDIVU_MM64R6_DESC : ArithLogicR<"ddivu", GPR32Opnd>; +class DMODU_MM64R6_DESC : ArithLogicR<"dmodu", GPR32Opnd>; + 
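The TODO above leaves the dext/dextm/dextu operand-range check unimplemented. As a rough sketch only, assuming nothing beyond the ranges quoted in that comment (this is not the in-tree operand-predicate machinery, and the helper name is hypothetical), the check the assembler would eventually need amounts to:

    // Illustration of the constraint from the TODO above; hypothetical helper,
    // not part of the Mips backend.
    #include <cstdint>
    #include <string>

    static bool isValidDextPosSize(const std::string &Mnemonic, uint32_t Pos,
                                   uint32_t Size) {
      uint32_t Sum = Pos + Size;       // extracted field must fit in the source
      if (Mnemonic == "dext")
        return Sum > 0 && Sum <= 63;   // DEXT:         0 < pos + size <= 63
      return Sum > 32 && Sum <= 64;    // DEXTM, DEXTU: 32 < pos + size <= 64
    }
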
+//===----------------------------------------------------------------------===// +// +// Instruction Definitions +// +//===----------------------------------------------------------------------===// + +let DecoderNamespace = "MicroMipsR6" in { + def DAUI_MM64R6 : StdMMR6Rel, DAUI_MMR6_DESC, DAUI_MMR6_ENC, ISA_MICROMIPS64R6; + def DAHI_MM64R6 : StdMMR6Rel, DAHI_MMR6_DESC, DAHI_MMR6_ENC, ISA_MICROMIPS64R6; + def DATI_MM64R6 : StdMMR6Rel, DATI_MMR6_DESC, DATI_MMR6_ENC, ISA_MICROMIPS64R6; + def DEXT_MM64R6 : StdMMR6Rel, DEXT_MMR6_DESC, DEXT_MMR6_ENC, + ISA_MICROMIPS64R6; + def DEXTM_MM64R6 : StdMMR6Rel, DEXTM_MMR6_DESC, DEXTM_MMR6_ENC, + ISA_MICROMIPS64R6; + def DEXTU_MM64R6 : StdMMR6Rel, DEXTU_MMR6_DESC, DEXTU_MMR6_ENC, + ISA_MICROMIPS64R6; + def DALIGN_MM64R6 : StdMMR6Rel, DALIGN_MMR6_DESC, DALIGN_MMR6_ENC, + ISA_MICROMIPS64R6; + def DDIV_MM64R6 : R6MMR6Rel, DDIV_MM64R6_DESC, DDIV_MM64R6_ENC, + ISA_MICROMIPS64R6; + def DMOD_MM64R6 : R6MMR6Rel, DMOD_MM64R6_DESC, DMOD_MM64R6_ENC, + ISA_MICROMIPS64R6; + def DDIVU_MM64R6 : R6MMR6Rel, DDIVU_MM64R6_DESC, DDIVU_MM64R6_ENC, + ISA_MICROMIPS64R6; + def DMODU_MM64R6 : R6MMR6Rel, DMODU_MM64R6_DESC, DMODU_MM64R6_ENC, + ISA_MICROMIPS64R6; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td new file mode 100644 index 0000000..f11c09a --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrFormats.td @@ -0,0 +1,244 @@ +//===-- MicroMipsDSPInstrFormats.td - Instruction Formats --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +class MMDSPInst<string opstr = ""> + : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl { + let InsnPredicates = [HasDSP]; + let AdditionalPredicates = [InMicroMips]; + string BaseOpcode = opstr; + string Arch = "mmdsp"; + let DecoderNamespace = "MicroMips"; +} + +class MMDSPInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, PredicateControl { + let InsnPredicates = [HasDSP]; + let AdditionalPredicates = [InMicroMips]; +} + +class POOL32A_3R_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = rd; + let Inst{10-0} = op; +} + +class POOL32A_2R_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_2RAC_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<2> ac; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-14} = ac; + let Inst{13-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_3RB0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<5> rd; + bits<5> rs; + bits<5> rt; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = rd; + let Inst{10} = 0b0; + let Inst{9-0} = op; +} + +class POOL32A_2RSA4_FMT<string opstr, bits<12> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<4> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-12} = sa; + let Inst{11-0} = op; +} + +class 
POOL32A_2RSA3_FMT<string opstr, bits<7> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<3> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-13} = sa; + let Inst{12-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_2RSA5B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<5> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = sa; + let Inst{10} = 0b0; + let Inst{9-0} = op; +} + +class POOL32A_2RSA4B0_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<4> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-12} = sa; + let Inst{11} = 0b0; + let Inst{10-0} = op; +} + +class POOL32A_2RSA4OP6_FMT<string opstr, bits<6> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<4> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-12} = sa; + let Inst{11-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_1RIMM5AC_FMT<string opstr, bits<8> funct> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> imm; + bits<2> ac; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = imm; + let Inst{15-14} = ac; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111100; +} + +class POOL32A_2RSA5_FMT<string opstr, bits<11> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<5> rs; + bits<5> sa; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-16} = rs; + let Inst{15-11} = sa; + let Inst{10-0} = op; +} + +class POOL32A_1RMEMB0_FMT<string opstr, bits<10> funct> : MMDSPInst<opstr> { + bits<5> index; + bits<5> base; + bits<5> rd; + + let Inst{31-26} = 0; + let Inst{25-21} = index; + let Inst{20-16} = base; + let Inst{15-11} = rd; + let Inst{10} = 0b0; + let Inst{9-0} = funct; +} + +class POOL32A_1RAC_FMT<string instr_asm, bits<8> funct> : MMDSPInst<instr_asm> { + bits<5> rs; + bits<2> ac; + + let Inst{31-26} = 0; + let Inst{25-21} = 0; + let Inst{20-16} = rs; + let Inst{15-14} = ac; + let Inst{13-6} = funct; + let Inst{5-0} = 0b111100; +} + +class POOL32A_1RMASK7_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<7> mask; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = rt; + let Inst{20-14} = mask; + let Inst{13-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_1RIMM10_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<5> rd; + bits<10> imm; + + let Inst{31-26} = 0; + let Inst{25-16} = imm; + let Inst{15-11} = rd; + let Inst{10} = 0; + let Inst{9-0} = op; +} + +class POOL32A_1RIMM8_FMT<string opstr, bits<6> op> : MMDSPInst<opstr> { + bits<5> rt; + bits<8> imm; + + let Inst{31-26} = 0; + let Inst{25-21} = rt; + let Inst{20-13} = imm; + let Inst{12} = 0; + let Inst{11-6} = op; + let Inst{5-0} = 0b111100; +} + +class POOL32A_4B0SHIFT6AC4B0_FMT<string opstr, bits<10> op> : MMDSPInst<opstr> { + bits<6> shift; + bits<2> ac; + + let Inst{31-26} = 0b000000; + let Inst{25-22} = 0b0000; + let Inst{21-16} = shift; + let Inst{15-14} = ac; + let Inst{13-10} = 0b0000; + let Inst{9-0} = op; +} + +class POOL32A_5B01RAC_FMT<string opstr, bits<8> op> : MMDSPInst<opstr> { + bits<5> rs; + bits<2> ac; + + let Inst{31-26} = 0b000000; + let Inst{25-21} = 0b00000; + let Inst{20-16} = rs; + let Inst{15-14} = ac; + let Inst{13-6} = op; + let Inst{5-0} = 0b111100; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td new 
file mode 100644 index 0000000..b342e23 --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MicroMipsDSPInstrInfo.td @@ -0,0 +1,528 @@ +//===- MicroMipsDSPInstrInfo.td - Micromips DSP instructions -*- tablegen *-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes MicroMips DSP instructions. +// +//===----------------------------------------------------------------------===// + +// Instruction encoding. +class ADDQ_PH_MM_ENC : POOL32A_3R_FMT<"addq.ph", 0b00000001101>; +class ADDQ_S_PH_MM_ENC : POOL32A_3R_FMT<"addq_s.ph", 0b10000001101>; +class ADDQ_S_W_MM_ENC : POOL32A_3RB0_FMT<"addq_s.w", 0b1100000101>; +class ADDQH_PH_MMR2_ENC : POOL32A_3R_FMT<"addqh.ph", 0b00001001101>; +class ADDQH_R_PH_MMR2_ENC : POOL32A_3R_FMT<"addqh_r.ph", 0b10001001101>; +class ADDQH_W_MMR2_ENC: POOL32A_3R_FMT<"addqh.w", 0b00010001101>; +class ADDQH_R_W_MMR2_ENC : POOL32A_3R_FMT<"addqh_r.w", 0b10010001101>; +class ADDU_PH_MMR2_ENC : POOL32A_3R_FMT<"addu.ph", 0b00100001101>; +class ADDU_S_PH_MMR2_ENC : POOL32A_3R_FMT<"addu_s.ph", 0b10100001101>; +class ADDU_QB_MM_ENC : POOL32A_3R_FMT<"addu.qb", 0b00011001101>; +class ADDU_S_QB_MM_ENC : POOL32A_3R_FMT<"addu_s.qb", 0b10011001101>; +class ADDUH_QB_MMR2_ENC : POOL32A_3R_FMT<"adduh.qb", 0b00101001101>; +class ADDUH_R_QB_MMR2_ENC : POOL32A_3R_FMT<"adduh_r.qb", 0b10101001101>; +class ADDSC_MM_ENC : POOL32A_3RB0_FMT<"addsc", 0b1110000101>; +class ADDWC_MM_ENC : POOL32A_3RB0_FMT<"addwc", 0b1111000101>; +class DPA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpa.w.ph", 0b00000010>; +class DPAQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"dpaq_s.w.ph", 0b00001010>; +class DPAQ_SA_L_W_MM_ENC : POOL32A_2RAC_FMT<"dpaq_sa.l.w", 0b01001010>; +class DPAQX_S_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpaqx_s.w.ph", 0b10001010>; +class DPAQX_SA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpaqx_sa.w.ph", 0b11001010>; +class DPAU_H_QBL_MM_ENC : POOL32A_2RAC_FMT<"dpau.h.qbl", 0b10000010>; +class DPAU_H_QBR_MM_ENC : POOL32A_2RAC_FMT<"dpau.h.qbr", 0b11000010>; +class DPAX_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpax.w.ph", 0b01000010>; +class ABSQ_S_PH_MM_ENC : POOL32A_2R_FMT<"absq_s.ph", 0b0001000100>; +class ABSQ_S_W_MM_ENC : POOL32A_2R_FMT<"absq_s.w", 0b0010000100>; +class ABSQ_S_QB_MMR2_ENC : POOL32A_2R_FMT<"absq_s.qb", 0b0000000100>; +class INSV_MM_ENC : POOL32A_2R_FMT<"insv", 0b0100000100>; +class MADD_DSP_MM_ENC : POOL32A_2RAC_FMT<"madd", 0b00101010>; +class MADDU_DSP_MM_ENC : POOL32A_2RAC_FMT<"maddu", 0b01101010>; +class MSUB_DSP_MM_ENC : POOL32A_2RAC_FMT<"msub", 0b10101010>; +class MSUBU_DSP_MM_ENC : POOL32A_2RAC_FMT<"msubu", 0b11101010>; +class MULT_DSP_MM_ENC : POOL32A_2RAC_FMT<"mult", 0b00110010>; +class MULTU_DSP_MM_ENC : POOL32A_2RAC_FMT<"multu", 0b01110010>; +class SHLL_PH_MM_ENC : POOL32A_2RSA4_FMT<"shll.ph", 0b001110110101>; +class SHLL_S_PH_MM_ENC : POOL32A_2RSA4_FMT<"shll_s.ph", 0b101110110101>; +class SHLL_QB_MM_ENC : POOL32A_2RSA3_FMT<"shll.qb", 0b0100001>; +class SHLLV_PH_MM_ENC : POOL32A_3R_FMT<"shllv.ph", 0b00000001110>; +class SHLLV_S_PH_MM_ENC : POOL32A_3R_FMT<"shllv_s.ph", 0b10000001110>; +class SHLLV_QB_MM_ENC : POOL32A_3RB0_FMT<"shllv.qb", 0b1110010101>; +class SHLLV_S_W_MM_ENC : POOL32A_3RB0_FMT<"shllv_s.w", 0b1111010101>; +class SHLL_S_W_MM_ENC : POOL32A_2RSA5B0_FMT<"shll_s.w", 0b1111110101>; +class SHRA_QB_MMR2_ENC : POOL32A_2RSA3_FMT<"shra.qb", 0b0000111>; +class SHRA_R_QB_MMR2_ENC : 
POOL32A_2RSA3_FMT<"shra_r.qb", 0b1000111>; +class SHRA_PH_MM_ENC : POOL32A_2RSA4B0_FMT<"shra.ph", 0b01100110101>; +class SHRA_R_PH_MM_ENC : POOL32A_2RSA4B0_FMT<"shra_r.ph", 0b11100110101>; +class SHRAV_PH_MM_ENC : POOL32A_3R_FMT<"shrav.ph", 0b00110001101>; +class SHRAV_R_PH_MM_ENC : POOL32A_3R_FMT<"shrav_r.ph", 0b10110001101>; +class SHRAV_QB_MMR2_ENC : POOL32A_3R_FMT<"shrav.qb", 0b00111001101>; +class SHRAV_R_QB_MMR2_ENC : POOL32A_3R_FMT<"shrav_r.qb", 0b10111001101>; +class SHRAV_R_W_MM_ENC : POOL32A_3RB0_FMT<"shrav_r.w", 0b1011010101>; +class SHRA_R_W_MM_ENC : POOL32A_2RSA5B0_FMT<"shra_r.w", 0b1011110101>; +class SHRL_PH_MMR2_ENC : POOL32A_2RSA4OP6_FMT<"shrl.ph", 0b001111>; +class SHRL_QB_MM_ENC : POOL32A_2RSA3_FMT<"shrl.qb", 0b1100001>; +class SHRLV_PH_MMR2_ENC : POOL32A_3RB0_FMT<"shrlv.ph", 0b1100010101>; +class SHRLV_QB_MM_ENC : POOL32A_3RB0_FMT<"shrlv.qb", 0b1101010101>; +class PRECEQ_W_PHL_MM_ENC : POOL32A_2R_FMT<"preceq.w.phl", 0b0101000100>; +class PRECEQ_W_PHR_MM_ENC : POOL32A_2R_FMT<"preceq.w.phr", 0b0110000100>; +class PRECEQU_PH_QBL_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbl", 0b0111000100>; +class PRECEQU_PH_QBLA_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbla", 0b0111001100>; +class PRECEQU_PH_QBR_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbr", 0b1001000100>; +class PRECEQU_PH_QBRA_MM_ENC : POOL32A_2R_FMT<"precequ.ph.qbra", 0b1001001100>; +class PRECEU_PH_QBL_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbl", 0b1011000100>; +class PRECEU_PH_QBLA_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbla", 0b1011001100>; +class PRECEU_PH_QBR_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbr", 0b1101000100>; +class PRECEU_PH_QBRA_MM_ENC : POOL32A_2R_FMT<"preceu.ph.qbra", 0b1101001100>; +class SUBQ_PH_MM_ENC : POOL32A_3R_FMT<"subq.ph", 0b01000001101>; +class SUBQ_S_PH_MM_ENC : POOL32A_3R_FMT<"subq_s.ph", 0b11000001101>; +class SUBQ_S_W_MM_ENC : POOL32A_3RB0_FMT<"subq_s.w", 0b1101000101>; +class SUBQH_PH_MMR2_ENC : POOL32A_3R_FMT<"subqh.ph", 0b01001001101>; +class SUBQH_R_PH_MMR2_ENC : POOL32A_3R_FMT<"subqh_r.ph", 0b11001001101>; +class SUBQH_W_MMR2_ENC : POOL32A_3R_FMT<"subqh.w", 0b01010001101>; +class SUBQH_R_W_MMR2_ENC : POOL32A_3R_FMT<"subqh_r.w", 0b11010001101>; +class SUBU_PH_MMR2_ENC : POOL32A_3R_FMT<"subu.ph", 0b01100001101>; +class SUBU_S_PH_MMR2_ENC : POOL32A_3R_FMT<"subu_s.ph", 0b11100001101>; +class SUBU_QB_MM_ENC : POOL32A_3R_FMT<"subu.qb", 0b01011001101>; +class SUBU_S_QB_MM_ENC : POOL32A_3R_FMT<"subu_s.qb", 0b11011001101>; +class SUBUH_QB_MMR2_ENC : POOL32A_3R_FMT<"subuh.qb", 0b01101001101>; +class SUBUH_R_QB_MMR2_ENC : POOL32A_3R_FMT<"subuh_r.qb", 0b11101001101>; +class EXTP_MM_ENC : POOL32A_1RIMM5AC_FMT<"extp", 0b10011001>; +class EXTPDP_MM_ENC : POOL32A_1RIMM5AC_FMT<"extpdp", 0b11011001>; +class EXTPDPV_MM_ENC : POOL32A_2RAC_FMT<"extpdpv", 0b11100010>; +class EXTPV_MM_ENC : POOL32A_2RAC_FMT<"extpv", 0b10100010>; +class EXTR_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr.w", 0b00111001>; +class EXTR_R_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_r.w", 0b01111001>; +class EXTR_RS_W_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_rs.w", 0b10111001>; +class EXTR_S_H_MM_ENC : POOL32A_1RIMM5AC_FMT<"extr_s.h", 0b11111001>; +class EXTRV_W_MM_ENC : POOL32A_2RAC_FMT<"extrv.w", 0b00111010>; +class EXTRV_R_W_MM_ENC : POOL32A_2RAC_FMT<"extrv_r.w", 0b01111010>; +class EXTRV_RS_W_MM_ENC : POOL32A_2RAC_FMT<"extrv_rs.w", 0b10111010>; +class EXTRV_S_H_MM_ENC : POOL32A_2RAC_FMT<"extrv_s.h", 0b11111010>; +class DPS_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dps.w.ph", 0b00010010>; +class DPSQ_S_W_PH_MM_ENC : POOL32A_2RAC_FMT<"dpsq_s.w.ph", 0b00011010>; +class 
DPSQ_SA_L_W_MM_ENC : POOL32A_2RAC_FMT<"dpsq_sa.l.w", 0b01011010>; +class DPSQX_S_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsqx_s.w.ph", 0b10011010>; +class DPSQX_SA_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsqx_sa.w.ph", 0b11011010>; +class DPSU_H_QBL_MM_ENC : POOL32A_2RAC_FMT<"dpsu.h.qbl", 0b10010010>; +class DPSU_H_QBR_MM_ENC : POOL32A_2RAC_FMT<"dpsu.h.qbr", 0b11010010>; +class DPSX_W_PH_MMR2_ENC : POOL32A_2RAC_FMT<"dpsx.w.ph", 0b01010010>; +class MUL_PH_MMR2_ENC : POOL32A_3R_FMT<"mul.ph", 0b00000101101>; +class MUL_S_PH_MMR2_ENC : POOL32A_3R_FMT<"mul_s.ph", 0b10000101101>; +class MULEQ_S_W_PHL_MM_ENC : POOL32A_3RB0_FMT<"muleq_s.w.phl", 0b0000100101>; +class MULEQ_S_W_PHR_MM_ENC : POOL32A_3RB0_FMT<"muleq_s.w.phr", 0b0001100101>; +class MULEU_S_PH_QBL_MM_ENC : POOL32A_3RB0_FMT<"muleu_s.ph.qbl", 0b0010010101>; +class MULEU_S_PH_QBR_MM_ENC : POOL32A_3RB0_FMT<"muleu_s.ph.qbr", 0b0011010101>; +class MULQ_RS_PH_MM_ENC : POOL32A_3RB0_FMT<"mulq_rs.ph", 0b0100010101>; +class MULQ_RS_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_rs.w", 0b0110010101>; +class MULQ_S_PH_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.ph", 0b0101010101>; +class MULQ_S_W_MMR2_ENC : POOL32A_3RB0_FMT<"mulq_s.w", 0b0111010101>; +class PRECR_QB_PH_MMR2_ENC : POOL32A_3RB0_FMT<"precr.qb.ph", 0b0001101101>; +class PRECR_SRA_PH_W_MMR2_ENC + : POOL32A_2RSA5_FMT<"precr_sra.ph.w", 0b01111001101>; +class PRECR_SRA_R_PH_W_MMR2_ENC + : POOL32A_2RSA5_FMT<"precr_sra_r.ph.w", 0b11111001101>; +class PRECRQ_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq.ph.w", 0b0011101101>; +class PRECRQ_QB_PH_MM_ENC : POOL32A_3RB0_FMT<"precrq.qb.ph", 0b0010101101>; +class PRECRQU_S_QB_PH_MM_ENC + : POOL32A_3RB0_FMT<"precrqu_s.qb.ph", 0b0101101101>; +class PRECRQ_RS_PH_W_MM_ENC : POOL32A_3RB0_FMT<"precrq_rs.ph.w", 0b0100101101>; +class LBUX_MM_ENC : POOL32A_1RMEMB0_FMT<"lbux", 0b1000100101>; +class LHX_MM_ENC : POOL32A_1RMEMB0_FMT<"lhx", 0b0101100101>; +class LWX_MM_ENC : POOL32A_1RMEMB0_FMT<"lwx", 0b0110100101>; +class MAQ_S_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phl", 0b01101001>; +class MAQ_SA_W_PHL_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phl", 0b11101001>; +class MAQ_S_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_s.w.phr", 0b00101001>; +class MAQ_SA_W_PHR_MM_ENC : POOL32A_2RAC_FMT<"maq_sa.w.phr", 0b10101001>; +class MFHI_MM_ENC : POOL32A_1RAC_FMT<"mfhi", 0b00000001>; +class MFLO_MM_ENC : POOL32A_1RAC_FMT<"mflo", 0b01000001>; +class MTHI_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b10000001>; +class MTLO_MM_ENC : POOL32A_1RAC_FMT<"mthi", 0b11000001>; +class PREPEND_MMR2_ENC : POOL32A_2RSA5B0_FMT<"prepend", 0b1001010101>; +class RADDU_W_QB_MM_ENC : POOL32A_2R_FMT<"raddu.w.qb", 0b1111000100>; +class RDDSP_MM_ENC : POOL32A_1RMASK7_FMT<"rddsp", 0b00011001>; +class REPL_PH_MM_ENC : POOL32A_1RIMM10_FMT<"repl.ph", 0b0000111101>; +class REPL_QB_MM_ENC : POOL32A_1RIMM8_FMT<"repl.qb", 0b010111>; +class REPLV_PH_MM_ENC : POOL32A_2R_FMT<"replv.ph", 0b0000001100>; +class REPLV_QB_MM_ENC : POOL32A_2R_FMT<"replv.qb", 0b0001001100>; +class MTHLIP_MM_ENC : POOL32A_1RAC_FMT<"mthlip", 0b00001001>; +class PACKRL_PH_MM_ENC : POOL32A_3RB0_FMT<"packrl.ph", 0b0110101101>; +class PICK_PH_MM_ENC : POOL32A_3RB0_FMT<"pick.ph", 0b1000101101>; +class PICK_QB_MM_ENC : POOL32A_3RB0_FMT<"pick.qb", 0b0111101101>; +class SHILO_MM_ENC : POOL32A_4B0SHIFT6AC4B0_FMT<"shilo", 0b0000011101>; +class SHILOV_MM_ENC : POOL32A_5B01RAC_FMT<"shilov", 0b01001001>; +class WRDSP_MM_ENC : POOL32A_1RMASK7_FMT<"wrdsp", 0b01011001>; + +// Instruction desc. 
+class ABSQ_S_PH_MM_R2_DESC_BASE<string opstr, SDPatternOperator OpNode, + InstrItinClass itin, RegisterOperand ROD, + RegisterOperand ROS = ROD> { + dag OutOperandList = (outs ROD:$rt); + dag InOperandList = (ins ROS:$rs); + string AsmString = !strconcat(opstr, "\t$rt, $rs"); + list<dag> Pattern = [(set ROD:$rt, (OpNode ROS:$rs))]; + InstrItinClass Itinerary = itin; +} +class ABSQ_S_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "absq_s.ph", int_mips_absq_s_ph, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag20]>; +class ABSQ_S_W_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "absq_s.w", int_mips_absq_s_w, NoItinerary, GPR32Opnd>, Defs<[DSPOutFlag20]>; +class ABSQ_S_QB_MMR2_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "absq_s.qb", int_mips_absq_s_qb, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag20]>; +class PRECEQ_W_PHL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceq.w.phl", int_mips_preceq_w_phl, NoItinerary, GPR32Opnd, DSPROpnd>; +class PRECEQ_W_PHR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceq.w.phr", int_mips_preceq_w_phr, NoItinerary, GPR32Opnd, DSPROpnd>; +class PRECEQU_PH_QBL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "precequ.ph.qbl", int_mips_precequ_ph_qbl, NoItinerary, DSPROpnd>; +class PRECEQU_PH_QBLA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "precequ.ph.qbla", int_mips_precequ_ph_qbla, NoItinerary, DSPROpnd>; +class PRECEQU_PH_QBR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "precequ.ph.qbr", int_mips_precequ_ph_qbr, NoItinerary, DSPROpnd>; +class PRECEQU_PH_QBRA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "precequ.ph.qbra", int_mips_precequ_ph_qbra, NoItinerary, DSPROpnd>; +class PRECEU_PH_QBL_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceu.ph.qbl", int_mips_preceu_ph_qbl, NoItinerary, DSPROpnd>; +class PRECEU_PH_QBLA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceu.ph.qbla", int_mips_preceu_ph_qbla, NoItinerary, DSPROpnd>; +class PRECEU_PH_QBR_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceu.ph.qbr", int_mips_preceu_ph_qbr, NoItinerary, DSPROpnd>; +class PRECEU_PH_QBRA_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE< + "preceu.ph.qbra", int_mips_preceu_ph_qbra, NoItinerary, DSPROpnd>; + +class SHLL_R2_MM_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + SDPatternOperator ImmPat, InstrItinClass itin, + RegisterOperand RO, Operand ImmOpnd> { + dag OutOperandList = (outs RO:$rt); + dag InOperandList = (ins RO:$rs, ImmOpnd:$sa); + string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa"); + list<dag> Pattern = [(set RO:$rt, (OpNode RO:$rs, ImmPat:$sa))]; + InstrItinClass Itinerary = itin; + bit hasSideEffects = 1; +} +class SHLL_PH_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shll.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>, + Defs<[DSPOutFlag22]>; +class SHLL_S_PH_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shll_s.ph", int_mips_shll_s_ph, immZExt4, NoItinerary, DSPROpnd, uimm4>, + Defs<[DSPOutFlag22]>; +class SHLL_QB_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shll.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>, + Defs<[DSPOutFlag22]>; +class SHLL_S_W_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shll_s.w", int_mips_shll_s_w, immZExt5, NoItinerary, GPR32Opnd, uimm5>, + Defs<[DSPOutFlag22]>; +class SHRA_QB_MMR2_DESC : SHLL_R2_MM_DESC_BASE< + "shra.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>; +class SHRA_R_QB_MMR2_DESC : SHLL_R2_MM_DESC_BASE< + "shra_r.qb", int_mips_shra_r_qb, immZExt3, NoItinerary, DSPROpnd, uimm3>; +class SHRA_PH_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shra.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>; +class SHRA_R_PH_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shra_r.ph", int_mips_shra_r_ph, immZExt4, NoItinerary, DSPROpnd, 
uimm4>; +class SHRA_R_W_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shra_r.w", int_mips_shra_r_w, immZExt5, NoItinerary, GPR32Opnd, uimm5>; +class SHRL_QB_MM_DESC : SHLL_R2_MM_DESC_BASE< + "shrl.qb", null_frag, immZExt3, NoItinerary, DSPROpnd, uimm3>; +class SHRL_PH_MMR2_DESC : SHLL_R2_MM_DESC_BASE< + "shrl.ph", null_frag, immZExt4, NoItinerary, DSPROpnd, uimm4>; + +class SHLLV_R3_MM_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + InstrItinClass itin, RegisterOperand RO> { + dag OutOperandList = (outs RO:$rd); + dag InOperandList = (ins RO:$rt, GPR32Opnd:$rs); + string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs"); + list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))]; + InstrItinClass Itinerary = itin; +} +class SHLLV_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shllv.ph", int_mips_shll_ph, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag22]>; +class SHLLV_S_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shllv_s.ph", int_mips_shll_s_ph, NoItinerary, DSPROpnd>, + Defs<[DSPOutFlag22]>; +class SHLLV_QB_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shllv.qb", int_mips_shll_qb, NoItinerary, DSPROpnd>, Defs<[DSPOutFlag22]>; +class SHLLV_S_W_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shllv_s.w", int_mips_shll_s_w, NoItinerary, GPR32Opnd>, Defs<[DSPOutFlag22]>; +class SHRAV_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav.ph", int_mips_shra_ph, NoItinerary, DSPROpnd>; +class SHRAV_R_PH_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav_r.ph", int_mips_shra_r_ph, NoItinerary, DSPROpnd>; +class SHRAV_QB_MMR2_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav.qb", int_mips_shra_qb, NoItinerary, DSPROpnd>; +class SHRAV_R_QB_MMR2_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav_r.qb", int_mips_shra_r_qb, NoItinerary, DSPROpnd>; +class SHRAV_R_W_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shrav_r.w", int_mips_shra_r_w, NoItinerary, GPR32Opnd>; +class SHRLV_PH_MMR2_DESC : SHLLV_R3_MM_DESC_BASE< + "shrlv.ph", int_mips_shrl_ph, NoItinerary, DSPROpnd>; +class SHRLV_QB_MM_DESC : SHLLV_R3_MM_DESC_BASE< + "shrlv.qb", int_mips_shrl_qb, NoItinerary, DSPROpnd>; + +class EXT_MM_2R_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + InstrItinClass itin> { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$rs); + string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $rs"); + InstrItinClass Itinerary = itin; +} +class EXT_MM_1R_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + InstrItinClass itin> { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm5:$imm); + string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $imm"); + InstrItinClass Itinerary = itin; +} + +class EXTP_MM_DESC + : EXT_MM_1R_DESC_BASE<"extp", MipsEXTP, NoItinerary>, + Uses<[DSPPos]>, Defs<[DSPEFI]>; +class EXTPDP_MM_DESC + : EXT_MM_1R_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>, + Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>; +class EXTPDPV_MM_DESC + : EXT_MM_2R_DESC_BASE<"extpdpv", MipsEXTPDP, NoItinerary>, + Uses<[DSPPos]>, Defs<[DSPPos, DSPEFI]>; +class EXTPV_MM_DESC + : EXT_MM_2R_DESC_BASE<"extpv", MipsEXTP, NoItinerary>, + Uses<[DSPPos]>, Defs<[DSPEFI]>; +class EXTR_W_MM_DESC + : EXT_MM_1R_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTR_R_W_MM_DESC + : EXT_MM_1R_DESC_BASE<"extr_r.w", MipsEXTR_R_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTR_RS_W_MM_DESC + : EXT_MM_1R_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTR_S_H_MM_DESC + : EXT_MM_1R_DESC_BASE<"extr_s.h", MipsEXTR_S_H, NoItinerary>, + Defs<[DSPOutFlag23]>; +class 
EXTRV_W_MM_DESC + : EXT_MM_2R_DESC_BASE<"extrv.w", MipsEXTR_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTRV_R_W_MM_DESC + : EXT_MM_2R_DESC_BASE<"extrv_r.w", MipsEXTR_R_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTRV_RS_W_MM_DESC + : EXT_MM_2R_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W, NoItinerary>, + Defs<[DSPOutFlag23]>; +class EXTRV_S_H_MM_DESC + : EXT_MM_2R_DESC_BASE<"extrv_s.h", MipsEXTR_S_H, NoItinerary>, + Defs<[DSPOutFlag23]>; + +class MFHI_MM_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode, + InstrItinClass itin> { + dag OutOperandList = (outs GPR32Opnd:$rs); + dag InOperandList = (ins RO:$ac); + string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); + list<dag> Pattern = [(set GPR32Opnd:$rs, (OpNode RO:$ac))]; + InstrItinClass Itinerary = itin; +} + +class MFHI_MM_DESC : MFHI_MM_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI, + NoItinerary>; +class MFLO_MM_DESC : MFHI_MM_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO, + NoItinerary>; + +class RADDU_W_QB_MM_DESC { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins DSPROpnd:$rs); + string AsmString = !strconcat("raddu.w.qb", "\t$rt, $rs"); + list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_raddu_w_qb DSPROpnd:$rs))]; + InstrItinClass Itinerary = NoItinerary; + string BaseOpcode = "raddu.w.qb"; +} + +class RDDSP_MM_DESC { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins uimm16:$mask); + string AsmString = !strconcat("rddsp", "\t$rt, $mask"); + list<dag> Pattern = [(set GPR32Opnd:$rt, (int_mips_rddsp immZExt10:$mask))]; + InstrItinClass Itinerary = NoItinerary; +} + +class REPL_QB_MM_DESC { + dag OutOperandList = (outs DSPROpnd:$rt); + dag InOperandList = (ins uimm16:$imm); + string AsmString = !strconcat("repl.qb", "\t$rt, $imm"); + list<dag> Pattern = [(set DSPROpnd:$rt, (int_mips_repl_qb immZExt8:$imm))]; + InstrItinClass Itinerary = NoItinerary; +} + +class REPLV_PH_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.ph", int_mips_repl_ph, + NoItinerary, DSPROpnd, + GPR32Opnd>; +class REPLV_QB_MM_DESC : ABSQ_S_PH_MM_R2_DESC_BASE<"replv.qb", int_mips_repl_qb, + NoItinerary, DSPROpnd, + GPR32Opnd>; + +class WRDSP_MM_DESC { + dag OutOperandList = (outs); + dag InOperandList = (ins GPR32Opnd:$rt, uimm7:$mask); + string AsmString = !strconcat("wrdsp", "\t$rt, $mask"); + list<dag> Pattern = [(int_mips_wrdsp GPR32Opnd:$rt, immZExt7:$mask)]; + InstrItinClass Itinerary = NoItinerary; +} + +// Instruction defs. 
+// microMIPS DSP Rev 1 +def ADDQ_PH_MM : DspMMRel, ADDQ_PH_MM_ENC, ADDQ_PH_DESC; +def ADDQ_S_PH_MM : DspMMRel, ADDQ_S_PH_MM_ENC, ADDQ_S_PH_DESC; +def ADDQ_S_W_MM : DspMMRel, ADDQ_S_W_MM_ENC, ADDQ_S_W_DESC; +def ADDU_QB_MM : DspMMRel, ADDU_QB_MM_ENC, ADDU_QB_DESC; +def ADDU_S_QB_MM : DspMMRel, ADDU_S_QB_MM_ENC, ADDU_S_QB_DESC; +def ADDSC_MM : DspMMRel, ADDSC_MM_ENC, ADDSC_DESC; +def ADDWC_MM : DspMMRel, ADDWC_MM_ENC, ADDWC_DESC; +def DPAQ_S_W_PH_MM : DspMMRel, DPAQ_S_W_PH_MM_ENC, DPAQ_S_W_PH_DESC; +def DPAQ_SA_L_W_MM : DspMMRel, DPAQ_SA_L_W_MM_ENC, DPAQ_SA_L_W_DESC; +def DPAU_H_QBL_MM : DspMMRel, DPAU_H_QBL_MM_ENC, DPAU_H_QBL_DESC; +def DPAU_H_QBR_MM : DspMMRel, DPAU_H_QBR_MM_ENC, DPAU_H_QBR_DESC; +def ABSQ_S_PH_MM : DspMMRel, ABSQ_S_PH_MM_ENC, ABSQ_S_PH_MM_DESC; +def ABSQ_S_W_MM : DspMMRel, ABSQ_S_W_MM_ENC, ABSQ_S_W_MM_DESC; +def INSV_MM : DspMMRel, INSV_MM_ENC, INSV_DESC; +def MADD_DSP_MM : DspMMRel, MADD_DSP_MM_ENC, MADD_DSP_DESC; +def MADDU_DSP_MM : DspMMRel, MADDU_DSP_MM_ENC, MADDU_DSP_DESC; +def MSUB_DSP_MM : DspMMRel, MSUB_DSP_MM_ENC, MSUB_DSP_DESC; +def MSUBU_DSP_MM : DspMMRel, MSUBU_DSP_MM_ENC, MSUBU_DSP_DESC; +def MULT_DSP_MM : DspMMRel, MULT_DSP_MM_ENC, MULT_DSP_DESC; +def MULTU_DSP_MM : DspMMRel, MULTU_DSP_MM_ENC, MULTU_DSP_DESC; +def SHLL_PH_MM : DspMMRel, SHLL_PH_MM_ENC, SHLL_PH_MM_DESC; +def SHLL_S_PH_MM : DspMMRel, SHLL_S_PH_MM_ENC, SHLL_S_PH_MM_DESC; +def SHLL_QB_MM : DspMMRel, SHLL_QB_MM_ENC, SHLL_QB_MM_DESC; +def SHLLV_PH_MM : DspMMRel, SHLLV_PH_MM_ENC, SHLLV_PH_MM_DESC; +def SHLLV_S_PH_MM : DspMMRel, SHLLV_S_PH_MM_ENC, SHLLV_S_PH_MM_DESC; +def SHLLV_QB_MM : DspMMRel, SHLLV_QB_MM_ENC, SHLLV_QB_MM_DESC; +def SHLLV_S_W_MM : DspMMRel, SHLLV_S_W_MM_ENC, SHLLV_S_W_MM_DESC; +def SHLL_S_W_MM : DspMMRel, SHLL_S_W_MM_ENC, SHLL_S_W_MM_DESC; +def SHRA_PH_MM : DspMMRel, SHRA_PH_MM_ENC, SHRA_PH_MM_DESC; +def SHRA_R_PH_MM : DspMMRel, SHRA_R_PH_MM_ENC, SHRA_R_PH_MM_DESC; +def SHRAV_PH_MM : DspMMRel, SHRAV_PH_MM_ENC, SHRAV_PH_MM_DESC; +def SHRAV_R_PH_MM : DspMMRel, SHRAV_R_PH_MM_ENC, SHRAV_R_PH_MM_DESC; +def SHRAV_R_W_MM : DspMMRel, SHRAV_R_W_MM_ENC, SHRAV_R_W_MM_DESC; +def SHRA_R_W_MM : DspMMRel, SHRA_R_W_MM_ENC, SHRA_R_W_MM_DESC; +def SHRL_QB_MM : DspMMRel, SHRL_QB_MM_ENC, SHRL_QB_MM_DESC; +def SHRLV_QB_MM : DspMMRel, SHRLV_QB_MM_ENC, SHRLV_QB_MM_DESC; +def PRECEQ_W_PHL_MM : DspMMRel, PRECEQ_W_PHL_MM_ENC, PRECEQ_W_PHL_MM_DESC; +def PRECEQ_W_PHR_MM : DspMMRel, PRECEQ_W_PHR_MM_ENC, PRECEQ_W_PHR_MM_DESC; +def PRECEQU_PH_QBL_MM : DspMMRel, PRECEQU_PH_QBL_MM_ENC, PRECEQU_PH_QBL_MM_DESC; +def PRECEQU_PH_QBLA_MM : DspMMRel, PRECEQU_PH_QBLA_MM_ENC, + PRECEQU_PH_QBLA_MM_DESC; +def PRECEQU_PH_QBR_MM : DspMMRel, PRECEQU_PH_QBR_MM_ENC, PRECEQU_PH_QBR_MM_DESC; +def PRECEQU_PH_QBRA_MM : DspMMRel, PRECEQU_PH_QBRA_MM_ENC, + PRECEQU_PH_QBRA_MM_DESC; +def PRECEU_PH_QBL_MM : DspMMRel, PRECEU_PH_QBL_MM_ENC, PRECEU_PH_QBL_MM_DESC; +def PRECEU_PH_QBLA_MM : DspMMRel, PRECEU_PH_QBLA_MM_ENC, PRECEU_PH_QBLA_MM_DESC; +def PRECEU_PH_QBR_MM : DspMMRel, PRECEU_PH_QBR_MM_ENC, PRECEU_PH_QBR_MM_DESC; +def PRECEU_PH_QBRA_MM : DspMMRel, PRECEU_PH_QBRA_MM_ENC, PRECEU_PH_QBRA_MM_DESC; +def SUBQ_PH_MM : DspMMRel, SUBQ_PH_MM_ENC, SUBQ_PH_DESC; +def SUBQ_S_PH_MM : DspMMRel, SUBQ_S_PH_MM_ENC, SUBQ_S_PH_DESC; +def SUBQ_S_W_MM : DspMMRel, SUBQ_S_W_MM_ENC, SUBQ_S_W_DESC; +def SUBU_QB_MM : DspMMRel, SUBU_QB_MM_ENC, SUBU_QB_DESC; +def SUBU_S_QB_MM : DspMMRel, SUBU_S_QB_MM_ENC, SUBU_S_QB_DESC; +def EXTP_MM : DspMMRel, EXTP_MM_ENC, EXTP_MM_DESC; +def EXTPDP_MM : DspMMRel, EXTPDP_MM_ENC, EXTPDP_MM_DESC; +def EXTPDPV_MM : DspMMRel, 
EXTPDPV_MM_ENC, EXTPDPV_MM_DESC; +def EXTPV_MM : DspMMRel, EXTPV_MM_ENC, EXTPV_MM_DESC; +def EXTR_W_MM : DspMMRel, EXTR_W_MM_ENC, EXTR_W_MM_DESC; +def EXTR_R_W_MM : DspMMRel, EXTR_R_W_MM_ENC, EXTR_R_W_MM_DESC; +def EXTR_RS_W_MM : DspMMRel, EXTR_RS_W_MM_ENC, EXTR_RS_W_MM_DESC; +def EXTR_S_H_MM : DspMMRel, EXTR_S_H_MM_ENC, EXTR_S_H_MM_DESC; +def EXTRV_W_MM : DspMMRel, EXTRV_W_MM_ENC, EXTRV_W_MM_DESC; +def EXTRV_R_W_MM : DspMMRel, EXTRV_R_W_MM_ENC, EXTRV_R_W_MM_DESC; +def EXTRV_RS_W_MM : DspMMRel, EXTRV_RS_W_MM_ENC, EXTRV_RS_W_MM_DESC; +def EXTRV_S_H_MM : DspMMRel, EXTRV_S_H_MM_ENC, EXTRV_S_H_MM_DESC; +def DPSQ_S_W_PH_MM : DspMMRel, DPSQ_S_W_PH_MM_ENC, DPSQ_S_W_PH_DESC; +def DPSQ_SA_L_W_MM : DspMMRel, DPSQ_SA_L_W_MM_ENC, DPSQ_SA_L_W_DESC; +def DPSU_H_QBL_MM : DspMMRel, DPSU_H_QBL_MM_ENC, DPSU_H_QBL_DESC; +def DPSU_H_QBR_MM : DspMMRel, DPSU_H_QBR_MM_ENC, DPSU_H_QBR_DESC; +def MULEQ_S_W_PHL_MM : DspMMRel, MULEQ_S_W_PHL_MM_ENC, MULEQ_S_W_PHL_DESC; +def MULEQ_S_W_PHR_MM : DspMMRel, MULEQ_S_W_PHR_MM_ENC, MULEQ_S_W_PHR_DESC; +def MULEU_S_PH_QBL_MM : DspMMRel, MULEU_S_PH_QBL_MM_ENC, MULEU_S_PH_QBL_DESC; +def MULEU_S_PH_QBR_MM : DspMMRel, MULEU_S_PH_QBR_MM_ENC, MULEU_S_PH_QBR_DESC; +def MULQ_RS_PH_MM : DspMMRel, MULQ_RS_PH_MM_ENC, MULQ_RS_PH_DESC; +def PRECRQ_PH_W_MM : DspMMRel, PRECRQ_PH_W_MM_ENC, PRECRQ_PH_W_DESC; +def PRECRQ_QB_PH_MM : DspMMRel, PRECRQ_QB_PH_MM_ENC, PRECRQ_QB_PH_DESC; +def PRECRQU_S_QB_PH_MM : DspMMRel, PRECRQU_S_QB_PH_MM_ENC, PRECRQU_S_QB_PH_DESC; +def PRECRQ_RS_PH_W_MM : DspMMRel, PRECRQ_RS_PH_W_MM_ENC, PRECRQ_RS_PH_W_DESC; +def LBUX_MM : DspMMRel, LBUX_MM_ENC, LBUX_DESC; +def LHX_MM : DspMMRel, LHX_MM_ENC, LHX_DESC; +def LWX_MM : DspMMRel, LWX_MM_ENC, LWX_DESC; +def MAQ_S_W_PHL_MM : DspMMRel, MAQ_S_W_PHL_MM_ENC, MAQ_S_W_PHL_DESC; +def MAQ_SA_W_PHL_MM : DspMMRel, MAQ_SA_W_PHL_MM_ENC, MAQ_SA_W_PHL_DESC; +def MAQ_S_W_PHR_MM : DspMMRel, MAQ_S_W_PHR_MM_ENC, MAQ_S_W_PHR_DESC; +def MAQ_SA_W_PHR_MM : DspMMRel, MAQ_SA_W_PHR_MM_ENC, MAQ_SA_W_PHR_DESC; +def MFHI_DSP_MM : DspMMRel, MFHI_MM_ENC, MFHI_MM_DESC; +def MFLO_DSP_MM : DspMMRel, MFLO_MM_ENC, MFLO_MM_DESC; +def MTHI_DSP_MM : DspMMRel, MTHI_MM_ENC, MTHI_DESC; +def MTLO_DSP_MM : DspMMRel, MTLO_MM_ENC, MTLO_DESC; +def RADDU_W_QB_MM : DspMMRel, RADDU_W_QB_MM_ENC, RADDU_W_QB_MM_DESC; +def RDDSP_MM : DspMMRel, RDDSP_MM_ENC, RDDSP_MM_DESC; +def REPL_PH_MM : DspMMRel, REPL_PH_MM_ENC, REPL_PH_DESC; +def REPL_QB_MM : DspMMRel, REPL_QB_MM_ENC, REPL_QB_MM_DESC; +def REPLV_PH_MM : DspMMRel, REPLV_PH_MM_ENC, REPLV_PH_MM_DESC; +def REPLV_QB_MM : DspMMRel, REPLV_QB_MM_ENC, REPLV_QB_MM_DESC; +def MTHLIP_MM : DspMMRel, MTHLIP_MM_ENC, MTHLIP_DESC; +def PACKRL_PH_MM : DspMMRel, PACKRL_PH_MM_ENC, PACKRL_PH_DESC; +def PICK_PH_MM : DspMMRel, PICK_PH_MM_ENC, PICK_PH_DESC; +def PICK_QB_MM : DspMMRel, PICK_QB_MM_ENC, PICK_QB_DESC; +def SHILO_MM : DspMMRel, SHILO_MM_ENC, SHILO_DESC; +def SHILOV_MM : DspMMRel, SHILOV_MM_ENC, SHILOV_DESC; +def WRDSP_MM : DspMMRel, WRDSP_MM_ENC, WRDSP_MM_DESC; +// microMIPS DSP Rev 2 +def ABSQ_S_QB_MMR2 : DspMMRel, ABSQ_S_QB_MMR2_ENC, ABSQ_S_QB_MMR2_DESC, + ISA_DSPR2; +def ADDQH_PH_MMR2 : DspMMRel, ADDQH_PH_MMR2_ENC, ADDQH_PH_DESC, ISA_DSPR2; +def ADDQH_R_PH_MMR2 : DspMMRel, ADDQH_R_PH_MMR2_ENC, ADDQH_R_PH_DESC, ISA_DSPR2; +def ADDQH_W_MMR2 : DspMMRel, ADDQH_W_MMR2_ENC, ADDQH_W_DESC, ISA_DSPR2; +def ADDQH_R_W_MMR2 : DspMMRel, ADDQH_R_W_MMR2_ENC, ADDQH_R_W_DESC, ISA_DSPR2; +def ADDU_PH_MMR2 : DspMMRel, ADDU_PH_MMR2_ENC, ADDU_PH_DESC, ISA_DSPR2; +def ADDU_S_PH_MMR2 : DspMMRel, ADDU_S_PH_MMR2_ENC, ADDU_S_PH_DESC, ISA_DSPR2; +def 
ADDUH_QB_MMR2 : DspMMRel, ADDUH_QB_MMR2_ENC, ADDUH_QB_DESC, ISA_DSPR2; +def ADDUH_R_QB_MMR2 : DspMMRel, ADDUH_R_QB_MMR2_ENC, ADDUH_R_QB_DESC, ISA_DSPR2; +def DPA_W_PH_MMR2 : DspMMRel, DPA_W_PH_MMR2_ENC, DPA_W_PH_DESC, ISA_DSPR2; +def DPAQX_S_W_PH_MMR2 : DspMMRel, DPAQX_S_W_PH_MMR2_ENC, DPAQX_S_W_PH_DESC, + ISA_DSPR2; +def DPAQX_SA_W_PH_MMR2 : DspMMRel, DPAQX_SA_W_PH_MMR2_ENC, DPAQX_SA_W_PH_DESC, + ISA_DSPR2; +def DPAX_W_PH_MMR2 : DspMMRel, DPAX_W_PH_MMR2_ENC, DPAX_W_PH_DESC, ISA_DSPR2; +def SHRA_QB_MMR2 : DspMMRel, SHRA_QB_MMR2_ENC, SHRA_QB_MMR2_DESC, ISA_DSPR2; +def SHRA_R_QB_MMR2 : DspMMRel, SHRA_R_QB_MMR2_ENC, SHRA_R_QB_MMR2_DESC, + ISA_DSPR2; +def SHRAV_QB_MMR2 : DspMMRel, SHRAV_QB_MMR2_ENC, SHRAV_QB_MMR2_DESC, ISA_DSPR2; +def SHRAV_R_QB_MMR2 : DspMMRel, SHRAV_R_QB_MMR2_ENC, SHRAV_R_QB_MMR2_DESC, + ISA_DSPR2; +def SHRL_PH_MMR2 : DspMMRel, SHRL_PH_MMR2_ENC, SHRL_PH_MMR2_DESC, ISA_DSPR2; +def SHRLV_PH_MMR2 : DspMMRel, SHRLV_PH_MMR2_ENC, SHRLV_PH_MMR2_DESC, ISA_DSPR2; +def SUBQH_PH_MMR2 : DspMMRel, SUBQH_PH_MMR2_ENC, SUBQH_PH_DESC, ISA_DSPR2; +def SUBQH_R_PH_MMR2 : DspMMRel, SUBQH_R_PH_MMR2_ENC, SUBQH_R_PH_DESC, ISA_DSPR2; +def SUBQH_W_MMR2 : DspMMRel, SUBQH_W_MMR2_ENC, SUBQH_W_DESC, ISA_DSPR2; +def SUBQH_R_W_MMR2 : DspMMRel, SUBQH_R_W_MMR2_ENC, SUBQH_R_W_DESC, ISA_DSPR2; +def SUBU_PH_MMR2 : DspMMRel, SUBU_PH_MMR2_ENC, SUBU_PH_DESC, ISA_DSPR2; +def SUBU_S_PH_MMR2 : DspMMRel, SUBU_S_PH_MMR2_ENC, SUBU_S_PH_DESC, ISA_DSPR2; +def SUBUH_QB_MMR2 : DspMMRel, SUBUH_QB_MMR2_ENC, SUBUH_QB_DESC, ISA_DSPR2; +def SUBUH_R_QB_MMR2 : DspMMRel, SUBUH_R_QB_MMR2_ENC, SUBUH_R_QB_DESC, ISA_DSPR2; +def DPS_W_PH_MMR2 : DspMMRel, DPS_W_PH_MMR2_ENC, DPS_W_PH_DESC, ISA_DSPR2; +def DPSQX_S_W_PH_MMR2 : DspMMRel, DPSQX_S_W_PH_MMR2_ENC, DPSQX_S_W_PH_DESC, + ISA_DSPR2; +def DPSQX_SA_W_PH_MMR2 : DspMMRel, DPSQX_SA_W_PH_MMR2_ENC, DPSQX_SA_W_PH_DESC, + ISA_DSPR2; +def DPSX_W_PH_MMR2 : DspMMRel, DPSX_W_PH_MMR2_ENC, DPSX_W_PH_DESC, ISA_DSPR2; +def MUL_PH_MMR2 : DspMMRel, MUL_PH_MMR2_ENC, MUL_PH_DESC, ISA_DSPR2; +def MUL_S_PH_MMR2 : DspMMRel, MUL_S_PH_MMR2_ENC, MUL_S_PH_DESC, ISA_DSPR2; +def MULQ_RS_W_MMR2 : DspMMRel, MULQ_RS_W_MMR2_ENC, MULQ_RS_W_DESC, ISA_DSPR2; +def MULQ_S_PH_MMR2 : DspMMRel, MULQ_S_PH_MMR2_ENC, MULQ_S_PH_DESC, ISA_DSPR2; +def MULQ_S_W_MMR2 : DspMMRel, MULQ_S_W_MMR2_ENC, MULQ_S_W_DESC, ISA_DSPR2; +def PRECR_QB_PH_MMR2 : DspMMRel, PRECR_QB_PH_MMR2_ENC, PRECR_QB_PH_DESC, + ISA_DSPR2; +def PRECR_SRA_PH_W_MMR2 : DspMMRel, PRECR_SRA_PH_W_MMR2_ENC, + PRECR_SRA_PH_W_DESC, ISA_DSPR2; +def PRECR_SRA_R_PH_W_MMR2 : DspMMRel, PRECR_SRA_R_PH_W_MMR2_ENC, + PRECR_SRA_R_PH_W_DESC, ISA_DSPR2; +def PREPEND_MMR2 : DspMMRel, PREPEND_MMR2_ENC, PREPEND_DESC, ISA_DSPR2; + +// Instruction alias. 
+def : MMDSPInstAlias<"wrdsp $rt", (WRDSP_MM GPR32Opnd:$rt, 0x1F), 1>; diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td index 004b0d5..756e6c9 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td +++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFPU.td @@ -37,23 +37,14 @@ def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, def FCMP_D32_MM : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM_MM<1>; -def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, IIBranch, MIPS_BRANCH_F>, +def BC1F_MM : MMRel, BC1F_FT<"bc1f", brtarget_mm, II_BC1F, MIPS_BRANCH_F>, BC1F_FM_MM<0x1c>, ISA_MIPS1_NOT_32R6_64R6; -def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, IIBranch, MIPS_BRANCH_T>, +def BC1T_MM : MMRel, BC1F_FT<"bc1t", brtarget_mm, II_BC1T, MIPS_BRANCH_T>, BC1F_FM_MM<0x1d>, ISA_MIPS1_NOT_32R6_64R6; - -def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, - ROUND_W_FM_MM<0, 0x6c>; def CVT_W_S_MM : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>, ROUND_W_FM_MM<0, 0x24>; -def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>, - ROUND_W_FM_MM<0, 0x2c>; -def ROUND_W_S_MM : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, +def ROUND_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, ROUND_W_FM_MM<0, 0xec>; -def TRUNC_W_S_MM : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>, - ROUND_W_FM_MM<0, 0xac>; -def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, - fsqrt>, ROUND_W_FM_MM<0, 0x28>; def CEIL_W_MM : MMRel, ABSS_FT<"ceil.w.d", FGR32Opnd, AFGR64Opnd, II_CEIL>, ROUND_W_FM_MM<1, 0x6c>; @@ -61,7 +52,7 @@ def CVT_W_MM : MMRel, ABSS_FT<"cvt.w.d", FGR32Opnd, AFGR64Opnd, II_CVT>, ROUND_W_FM_MM<1, 0x24>; def FLOOR_W_MM : MMRel, ABSS_FT<"floor.w.d", FGR32Opnd, AFGR64Opnd, II_FLOOR>, ROUND_W_FM_MM<1, 0x2c>; -def ROUND_W_MM : MMRel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>, +def ROUND_W_MM : MMRel, StdMMR6Rel, ABSS_FT<"round.w.d", FGR32Opnd, AFGR64Opnd, II_ROUND>, ROUND_W_FM_MM<1, 0xec>; def TRUNC_W_MM : MMRel, ABSS_FT<"trunc.w.d", FGR32Opnd, AFGR64Opnd, II_TRUNC>, ROUND_W_FM_MM<1, 0xac>; @@ -146,3 +137,14 @@ def NMADD_D32_MM : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>, def NMSUB_D32_MM : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>, MADDS_FM_MM<0x2a>; } + +let AdditionalPredicates = [InMicroMips] in { + def FLOOR_W_S_MM : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, + II_FLOOR>, ROUND_W_FM_MM<0, 0x2c>; + def TRUNC_W_S_MM : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, + FGR32Opnd, II_TRUNC>, ROUND_W_FM_MM<0, 0xac>; + def CEIL_W_S_MM : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, + ROUND_W_FM_MM<0, 0x6c>; + def FSQRT_S_MM : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, + fsqrt>, ROUND_W_FM_MM<0, 0x28>; +} diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td index 560afa4..b736367 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrFormats.td @@ -389,6 +389,22 @@ class LW_FM_MM<bits<6> op> : MMArch { let Inst{15-0} = addr{15-0}; } +class POOL32C_LHUE_FM_MM<bits<6> op, bits<4> fmt, bits<3> funct> : MMArch { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = rt; + let Inst{20-16} = base; 
+ let Inst{15-12} = fmt; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + class LWL_FM_MM<bits<4> funct> { bits<5> rt; bits<21> addr; @@ -402,6 +418,22 @@ class LWL_FM_MM<bits<4> funct> { let Inst{11-0} = addr{11-0}; } +class POOL32C_STEVA_LDEVA_FM_MM<bits<4> type, bits<3> funct> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = 0x18; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = type; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + class CMov_F_I_FM_MM<bits<7> func> : MMArch { bits<5> rd; bits<5> rs; @@ -655,6 +687,22 @@ class LL_FM_MM<bits<4> funct> { let Inst{11-0} = addr{11-0}; } +class LLE_FM_MM<bits<4> funct> { + bits<5> rt; + bits<21> addr; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = 0x18; + let Inst{25-21} = rt; + let Inst{20-16} = base; + let Inst{15-12} = funct; + let Inst{11-9} = 0x6; + let Inst{8-0} = offset; +} + class ADDS_FM_MM<bits<2> fmt, bits<8> funct> : MMArch { bits<5> ft; bits<5> fs; @@ -895,7 +943,7 @@ class LWM_FM_MM<bits<4> funct> : MMArch { let Inst{11-0} = addr{11-0}; } -class LWM_FM_MM16<bits<4> funct> : MMArch { +class LWM_FM_MM16<bits<4> funct> : MMArch, PredicateControl { bits<2> rt; bits<4> addr; @@ -922,6 +970,37 @@ class CACHE_PREF_FM_MM<bits<6> op, bits<4> funct> : MMArch { let Inst{11-0} = offset; } +class CACHE_PREFE_FM_MM<bits<6> op, bits<3> funct> : MMArch { + bits<21> addr; + bits<5> hint; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = hint; + let Inst{20-16} = base; + let Inst{15-12} = 0xA; + let Inst{11-9} = funct; + let Inst{8-0} = offset; +} + +class POOL32F_PREFX_FM_MM<bits<6> op, bits<9> funct> : MMArch { + bits<5> index; + bits<5> base; + bits<5> hint; + + bits<32> Inst; + + let Inst{31-26} = op; + let Inst{25-21} = index; + let Inst{20-16} = base; + let Inst{15-11} = hint; + let Inst{10-9} = 0x0; + let Inst{8-0} = funct; +} + class BARRIER_FM_MM<bits<5> op> : MMArch { bits<32> Inst; diff --git a/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td index 3939384..99f0f44 100644 --- a/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MicroMipsInstrInfo.td @@ -13,11 +13,6 @@ def simm12 : Operand<i32> { let DecoderMethod = "DecodeSimm12"; } -def uimm5_lsl2 : Operand<OtherVT> { - let EncoderMethod = "getUImm5Lsl2Encoding"; - let DecoderMethod = "DecodeUImm5lsl2"; -} - def uimm6_lsl2 : Operand<i32> { let EncoderMethod = "getUImm6Lsl2Encoding"; let DecoderMethod = "DecodeUImm6Lsl2"; @@ -30,6 +25,7 @@ def simm9_addiusp : Operand<i32> { def uimm3_shift : Operand<i32> { let EncoderMethod = "getUImm3Mod8Encoding"; + let DecoderMethod = "DecodePOOL16BEncodedField"; } def simm3_lsa2 : Operand<i32> { @@ -105,6 +101,14 @@ def mem_mm_gp_imm7_lsl2 : Operand<i32> { let EncoderMethod = "getMemEncodingMMGPImm7Lsl2"; } +def mem_mm_9 : Operand<i32> { + let PrintMethod = "printMemOperand"; + let MIOperandInfo = (ops GPR32, simm9); + let EncoderMethod = "getMemEncodingMMImm9"; + let ParserMatchClass = MipsMemAsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + def mem_mm_12 : Operand<i32> { let PrintMethod = "printMemOperand"; let MIOperandInfo = (ops GPR32, simm12); @@ -113,6 +117,14 @@ def mem_mm_12 : Operand<i32> { let OperandType = "OPERAND_MEMORY"; } +def mem_mm_16 : Operand<i32> { + let PrintMethod = "printMemOperand"; 
+ let MIOperandInfo = (ops GPR32, simm16); + let EncoderMethod = "getMemEncodingMMImm16"; + let ParserMatchClass = MipsMemAsmOperand; + let OperandType = "OPERAND_MEMORY"; +} + def MipsMemUimm4AsmOperand : AsmOperandClass { let Name = "MemOffsetUimm4"; let SuperClasses = [MipsMemAsmOperand]; @@ -166,7 +178,7 @@ def simm23_lsl2 : Operand<i32> { class CompactBranchMM<string opstr, DAGOperand opnd, PatFrag cond_op, RegisterOperand RO> : InstSE<(outs), (ins RO:$rs, opnd:$offset), - !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI> { + !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZC, FrmI> { let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 0; @@ -251,6 +263,13 @@ class LLBaseMM<string opstr, RegisterOperand RO> : let mayLoad = 1; } +class LLEBaseMM<string opstr, RegisterOperand RO> : + InstSE<(outs RO:$rt), (ins mem_mm_12:$addr), + !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> { + let DecoderMethod = "DecodeMemMMImm9"; + let mayLoad = 1; +} + class SCBaseMM<string opstr, RegisterOperand RO> : InstSE<(outs RO:$dst), (ins RO:$rt, mem_mm_12:$addr), !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> { @@ -259,6 +278,14 @@ class SCBaseMM<string opstr, RegisterOperand RO> : let Constraints = "$rt = $dst"; } +class SCEBaseMM<string opstr, RegisterOperand RO> : + InstSE<(outs RO:$dst), (ins RO:$rt, mem_mm_12:$addr), + !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> { + let DecoderMethod = "DecodeMemMMImm9"; + let mayStore = 1; + let Constraints = "$rt = $dst"; +} + class LoadMM<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag, InstrItinClass Itin = NoItinerary> : InstSE<(outs RO:$rt), (ins mem_mm_12:$addr), @@ -392,7 +419,7 @@ class LoadImmMM16<string opstr, Operand Od, RegisterOperand RO> : // 16-bit Jump and Link (Call) class JumpLinkRegMM16<string opstr, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), - [(MipsJmpLink RO:$rs)], IIBranch, FrmR> { + [(MipsJmpLink RO:$rs)], II_JALR, FrmR>, PredicateControl { let isCall = 1; let hasDelaySlot = 1; let Defs = [RA]; @@ -401,7 +428,7 @@ class JumpLinkRegMM16<string opstr, RegisterOperand RO> : // 16-bit Jump Reg class JumpRegMM16<string opstr, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), - [], IIBranch, FrmR> { + [], II_JR, FrmR> { let hasDelaySlot = 1; let isBranch = 1; let isIndirectBranch = 1; @@ -410,7 +437,7 @@ class JumpRegMM16<string opstr, RegisterOperand RO> : // Base class for JRADDIUSP instruction. 
class JumpRAddiuStackMM16 : MicroMipsInst16<(outs), (ins uimm5_lsl2:$imm), "jraddiusp\t$imm", - [], IIBranch, FrmR> { + [], II_JRADDIUSP, FrmR> { let isTerminator = 1; let isBarrier = 1; let isBranch = 1; @@ -420,7 +447,7 @@ class JumpRAddiuStackMM16 : // 16-bit Jump and Link (Call) - Short Delay Slot class JumpLinkRegSMM16<string opstr, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), - [], IIBranch, FrmR> { + [], II_JALRS, FrmR> { let isCall = 1; let hasDelaySlot = 1; let Defs = [RA]; @@ -429,7 +456,7 @@ class JumpLinkRegSMM16<string opstr, RegisterOperand RO> : // 16-bit Jump Register Compact - No delay slot class JumpRegCMM16<string opstr, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), - [], IIBranch, FrmR> { + [], II_JRC, FrmR> { let isTerminator = 1; let isBarrier = 1; let isBranch = 1; @@ -444,7 +471,7 @@ class BrkSdbbp16MM<string opstr> : class CBranchZeroMM<string opstr, DAGOperand opnd, RegisterOperand RO> : MicroMipsInst16<(outs), (ins RO:$rs, opnd:$offset), - !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI> { + !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZ, FrmI> { let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 1; @@ -455,18 +482,18 @@ class CBranchZeroMM<string opstr, DAGOperand opnd, RegisterOperand RO> : let isCall = 1, hasDelaySlot = 1, Defs = [RA] in { class JumpLinkMM<string opstr, DAGOperand opnd> : InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"), - [], IIBranch, FrmJ, opstr> { + [], II_JALS, FrmJ, opstr> { let DecoderMethod = "DecodeJumpTargetMM"; } class JumpLinkRegMM<string opstr, RegisterOperand RO>: InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"), - [], IIBranch, FrmR>; + [], II_JALRS, FrmR>; class BranchCompareToZeroLinkMM<string opstr, DAGOperand opnd, RegisterOperand RO> : InstSE<(outs), (ins RO:$rs, opnd:$offset), - !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr>; + !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZALS, FrmI, opstr>; } class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO, @@ -475,6 +502,10 @@ class LoadWordIndexedScaledMM<string opstr, RegisterOperand RO, InstSE<(outs RO:$rd), (ins PtrRC:$base, PtrRC:$index), !strconcat(opstr, "\t$rd, ${index}(${base})"), [], Itin, FrmFI>; +class PrefetchIndexed<string opstr> : + InstSE<(outs), (ins PtrRC:$base, PtrRC:$index, uimm5:$hint), + !strconcat(opstr, "\t$hint, ${index}(${base})"), [], NoItinerary, FrmOther>; + class AddImmUPC<string opstr, RegisterOperand RO> : InstSE<(outs RO:$rs), (ins simm23_lsl2:$imm), !strconcat(opstr, "\t$rs, $imm"), [], NoItinerary, FrmR>; @@ -543,7 +574,7 @@ class LoadMultMM16<string opstr, class UncondBranchMM16<string opstr> : MicroMipsInst16<(outs), (ins brtarget10_mm:$offset), !strconcat(opstr, "\t$offset"), - [], IIBranch, FrmI> { + [], II_B, FrmI> { let isBranch = 1; let isTerminator = 1; let isBarrier = 1; @@ -553,21 +584,24 @@ class UncondBranchMM16<string opstr> : } def ADDU16_MM : ArithRMM16<"addu16", GPRMM16Opnd, 1, II_ADDU, add>, - ARITH_FM_MM16<0>; -def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>, - ARITH_FM_MM16<1>; -def ANDI16_MM : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, ANDI_FM_MM16<0x0b>; + ARITH_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6_64R6; def AND16_MM : LogicRMM16<"and16", GPRMM16Opnd, II_AND, and>, - LOGIC_FM_MM16<0x2>; -def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, - LOGIC_FM_MM16<0x3>; -def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>, - 
LOGIC_FM_MM16<0x1>; -def NOT16_MM : NotMM16<"not16", GPRMM16Opnd>, LOGIC_FM_MM16<0x0>; + LOGIC_FM_MM16<0x2>, ISA_MICROMIPS_NOT_32R6_64R6; +def ANDI16_MM : AndImmMM16<"andi16", GPRMM16Opnd, II_AND>, ANDI_FM_MM16<0x0b>, + ISA_MICROMIPS_NOT_32R6_64R6; +def NOT16_MM : NotMM16<"not16", GPRMM16Opnd>, LOGIC_FM_MM16<0x0>, + ISA_MICROMIPS_NOT_32R6_64R6; +def OR16_MM : LogicRMM16<"or16", GPRMM16Opnd, II_OR, or>, LOGIC_FM_MM16<0x3>, + ISA_MICROMIPS_NOT_32R6_64R6; def SLL16_MM : ShiftIMM16<"sll16", uimm3_shift, GPRMM16Opnd, II_SLL>, - SHIFT_FM_MM16<0>; + SHIFT_FM_MM16<0>, ISA_MICROMIPS_NOT_32R6_64R6; def SRL16_MM : ShiftIMM16<"srl16", uimm3_shift, GPRMM16Opnd, II_SRL>, - SHIFT_FM_MM16<1>; + SHIFT_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6_64R6; + +def SUBU16_MM : ArithRMM16<"subu16", GPRMM16Opnd, 0, II_SUBU, sub>, + ARITH_FM_MM16<1>, ISA_MICROMIPS_NOT_32R6_64R6; +def XOR16_MM : LogicRMM16<"xor16", GPRMM16Opnd, II_XOR, xor>, + LOGIC_FM_MM16<0x1>, ISA_MICROMIPS_NOT_32R6_64R6; def LBU16_MM : LoadMM16<"lbu16", GPRMM16Opnd, zextloadi8, II_LBU, mem_mm_4>, LOAD_STORE_FM_MM16<0x02>; def LHU16_MM : LoadMM16<"lhu16", GPRMM16Opnd, zextloadi16, II_LHU, @@ -597,7 +631,8 @@ def MOVE16_MM : MoveMM16<"move", GPR32Opnd>, MOVE_FM_MM16<0x03>; def MOVEP_MM : MovePMM16<"movep", GPRMM16OpndMoveP>, MOVEP_FM_MM16; def LI16_MM : LoadImmMM16<"li16", li_simm7, GPRMM16Opnd>, LI_FM_MM16, IsAsCheapAsAMove; -def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>; +def JALR16_MM : JumpLinkRegMM16<"jalr", GPR32Opnd>, JALR_FM_MM16<0x0e>, + ISA_MICROMIPS32_NOT_MIPS32R6; def JALRS16_MM : JumpLinkRegSMM16<"jalrs16", GPR32Opnd>, JALR_FM_MM16<0x0f>; def JRC16_MM : JumpRegCMM16<"jrc", GPR32Opnd>, JALR_FM_MM16<0x0d>; def JRADDIUSP : JumpRAddiuStackMM16, JRADDIUSP_FM_MM16<0x18>; @@ -607,8 +642,18 @@ def BEQZ16_MM : CBranchZeroMM<"beqz16", brtarget7_mm, GPRMM16Opnd>, def BNEZ16_MM : CBranchZeroMM<"bnez16", brtarget7_mm, GPRMM16Opnd>, BEQNEZ_FM_MM16<0x2b>; def B16_MM : UncondBranchMM16<"b16">, B16_FM; -def BREAK16_MM : BrkSdbbp16MM<"break16">, BRKSDBBP16_FM_MM<0x28>; -def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16">, BRKSDBBP16_FM_MM<0x2C>; +def BREAK16_MM : BrkSdbbp16MM<"break16">, BRKSDBBP16_FM_MM<0x28>, + ISA_MICROMIPS_NOT_32R6_64R6; +def SDBBP16_MM : BrkSdbbp16MM<"sdbbp16">, BRKSDBBP16_FM_MM<0x2C>, + ISA_MICROMIPS_NOT_32R6_64R6; + +let DecoderNamespace = "MicroMips" in { + /// Load and Store Instructions - multiple + def SWM16_MM : StoreMultMM16<"swm16">, LWM_FM_MM16<0x5>, + ISA_MICROMIPS32_NOT_MIPS32R6; + def LWM16_MM : LoadMultMM16<"lwm16">, LWM_FM_MM16<0x4>, + ISA_MICROMIPS32_NOT_MIPS32R6; +} class WaitMM<string opstr> : InstSE<(outs), (ins uimm10:$code_), !strconcat(opstr, "\t$code_"), [], @@ -701,6 +746,18 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { def SW_MM : Store<"sw", GPR32Opnd>, MMRel, LW_FM_MM<0x3e>; } + let DecoderMethod = "DecodeMemMMImm9" in { + def LBE_MM : Load<"lbe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x4>; + def LBuE_MM : Load<"lbue", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x0>; + def LHE_MM : Load<"lhe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x5>; + def LHuE_MM : Load<"lhue", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x1>; + def LWE_MM : Load<"lwe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0x6, 0x7>; + def SBE_MM : Store<"sbe", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0xa, 0x4>; + def SHE_MM : Store<"she", GPR32Opnd>, POOL32C_LHUE_FM_MM<0x18, 0xa, 0x5>; + def SWE_MM : StoreMemory<"swe", GPR32Opnd, mem_simm9gpr>, + POOL32C_LHUE_FM_MM<0x18, 0xa, 0x7>; + } + def LWXS_MM : 
LoadWordIndexedScaledMM<"lwxs", GPR32Opnd>, LWXS_FM_MM<0x118>; def LWU_MM : LoadMM<"lwu", GPR32Opnd, zextloadi32, II_LWU>, LL_FM_MM<0xe>; @@ -714,12 +771,20 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { LWL_FM_MM<0x8>; def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12>, LWL_FM_MM<0x9>; + let DecoderMethod = "DecodeMemMMImm9" in { + def LWLE_MM : LoadLeftRightMM<"lwle", MipsLWL, GPR32Opnd, mem_mm_12>, + POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x2>; + def LWRE_MM : LoadLeftRightMM<"lwre", MipsLWR, GPR32Opnd, mem_mm_12>, + POOL32C_STEVA_LDEVA_FM_MM<0x6, 0x3>; + def SWLE_MM : StoreLeftRightMM<"swle", MipsSWL, GPR32Opnd, mem_mm_12>, + POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x0>; + def SWRE_MM : StoreLeftRightMM<"swre", MipsSWR, GPR32Opnd, mem_mm_12>, + POOL32C_STEVA_LDEVA_FM_MM<0xa, 0x1>, ISA_MIPS1_NOT_32R6_64R6; + } /// Load and Store Instructions - multiple def SWM32_MM : StoreMultMM<"swm32">, LWM_FM_MM<0xd>; def LWM32_MM : LoadMultMM<"lwm32">, LWM_FM_MM<0x5>; - def SWM16_MM : StoreMultMM16<"swm16">, LWM_FM_MM16<0x5>; - def LWM16_MM : LoadMultMM16<"lwm16">, LWM_FM_MM16<0x4>; /// Load and Store Pair Instructions def SWP_MM : StorePairMM<"swp">, LWM_FM_MM<0x9>; @@ -777,11 +842,11 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { SEB_FM_MM<0x0ec>, ISA_MIPS32R2; /// Word Swap Bytes Within Halfwords - def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM_MM<0x1ec>, - ISA_MIPS32R2; - - def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, - EXT_FM_MM<0x2c>; + def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>, + SEB_FM_MM<0x1ec>, ISA_MIPS32R2; + // TODO: Add '0 < pos+size <= 32' constraint check to ext instruction + def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, + MipsExt>, EXT_FM_MM<0x2c>; def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM_MM<0x0c>; @@ -854,12 +919,22 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { def LL_MM : LLBaseMM<"ll", GPR32Opnd>, LL_FM_MM<0x3>; def SC_MM : SCBaseMM<"sc", GPR32Opnd>, LL_FM_MM<0xb>; + def LLE_MM : LLEBaseMM<"lle", GPR32Opnd>, LLE_FM_MM<0x6>; + def SCE_MM : SCEBaseMM<"sce", GPR32Opnd>, LLE_FM_MM<0xA>; + let DecoderMethod = "DecodeCacheOpMM" in { def CACHE_MM : MMRel, CacheOp<"cache", mem_mm_12>, CACHE_PREF_FM_MM<0x08, 0x6>; def PREF_MM : MMRel, CacheOp<"pref", mem_mm_12>, CACHE_PREF_FM_MM<0x18, 0x2>; } + + let DecoderMethod = "DecodePrefeOpMM" in { + def PREFE_MM : MMRel, CacheOp<"prefe", mem_mm_9>, + CACHE_PREFE_FM_MM<0x18, 0x2>; + def CACHEE_MM : MMRel, CacheOp<"cachee", mem_mm_9>, + CACHE_PREFE_FM_MM<0x18, 0x3>; + } def SSNOP_MM : MMRel, Barrier<"ssnop">, BARRIER_FM_MM<0x1>; def EHB_MM : MMRel, Barrier<"ehb">, BARRIER_FM_MM<0x3>; def PAUSE_MM : MMRel, Barrier<"pause">, BARRIER_FM_MM<0x5>; @@ -870,7 +945,13 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in { def TLBWR_MM : MMRel, TLB<"tlbwr">, COP0_TLB_FM_MM<0xcd>; def SDBBP_MM : MMRel, SYS_FT<"sdbbp">, SDBBP_FM_MM; - def RDHWR_MM : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM_MM; + + def PREFX_MM : PrefetchIndexed<"prefx">, POOL32F_PREFX_FM_MM<0x15, 0x1A0>; +} + +let DecoderNamespace = "MicroMips" in { + def RDHWR_MM : MMRel, R6MMR6Rel, ReadHardware<GPR32Opnd, HWRegsOpnd>, + RDHWR_FM_MM, ISA_MICROMIPS32_NOT_MIPS32R6; } let Predicates = [InMicroMips] in { @@ -928,7 +1009,7 @@ class UncondBranchMMPseudo<string opstr> : MipsAsmPseudoInst<(outs), (ins brtarget_mm:$offset), !strconcat(opstr, "\t$offset")>; - def B_MM_Pseudo : UncondBranchMMPseudo<"b">; 
+def B_MM_Pseudo : UncondBranchMMPseudo<"b">, ISA_MICROMIPS; def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>; def : MipsInstAlias<"nop", (SLL_MM ZERO, ZERO, 0), 1>; @@ -937,4 +1018,17 @@ class UncondBranchMMPseudo<string opstr> : let Predicates = [InMicroMips] in { def : MipsInstAlias<"ei", (EI_MM ZERO), 1>, ISA_MIPS32R2; +def : MipsInstAlias<"di", (DI_MM ZERO), 1>, ISA_MIPS32R2; +def : MipsInstAlias<"teq $rs, $rt", + (TEQ_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tge $rs, $rt", + (TGE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tgeu $rs, $rt", + (TGEU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tlt $rs, $rt", + (TLT_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tltu $rs, $rt", + (TLTU_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; +def : MipsInstAlias<"tne $rs, $rt", + (TNE_MM GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>; } diff --git a/contrib/llvm/lib/Target/Mips/Mips.td b/contrib/llvm/lib/Target/Mips/Mips.td index dbb5f7b..35352b6 100644 --- a/contrib/llvm/lib/Target/Mips/Mips.td +++ b/contrib/llvm/lib/Target/Mips/Mips.td @@ -154,9 +154,14 @@ def FeatureMips16 : SubtargetFeature<"mips16", "InMips16Mode", "true", def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">; def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true", "Mips DSP-R2 ASE", [FeatureDSP]>; +def FeatureDSPR3 + : SubtargetFeature<"dspr3", "HasDSPR3", "true", "Mips DSP-R3 ASE", + [ FeatureDSP, FeatureDSPR2 ]>; def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">; +def FeatureEVA : SubtargetFeature<"eva", "HasEVA", "true", "Mips EVA ASE">; + def FeatureMicroMips : SubtargetFeature<"micromips", "InMicroMipsMode", "true", "microMips mode">; @@ -164,10 +169,19 @@ def FeatureCnMips : SubtargetFeature<"cnmips", "HasCnMips", "true", "Octeon cnMIPS Support", [FeatureMips64r2]>; +def FeatureUseTCCInDIV : SubtargetFeature< + "use-tcc-in-div", + "UseTCCInDIV", "false", + "Force the assembler to use trapping">; + //===----------------------------------------------------------------------===// // Mips processors supported. //===----------------------------------------------------------------------===// +def ImplP5600 : SubtargetFeature<"p5600", "ProcImpl", + "MipsSubtarget::CPU::P5600", + "The P5600 Processor", [FeatureMips32r5]>; + class Proc<string Name, list<SubtargetFeature> Features> : Processor<Name, MipsGenericItineraries, Features>; @@ -187,12 +201,11 @@ def : Proc<"mips64r2", [FeatureMips64r2]>; def : Proc<"mips64r3", [FeatureMips64r3]>; def : Proc<"mips64r5", [FeatureMips64r5]>; def : Proc<"mips64r6", [FeatureMips64r6]>; -def : Proc<"mips16", [FeatureMips16]>; def : Proc<"octeon", [FeatureMips64r2, FeatureCnMips]>; +def : ProcessorModel<"p5600", MipsP5600Model, [ImplP5600]>; def MipsAsmParser : AsmParser { let ShouldEmitMatchRegisterName = 0; - let MnemonicContainsDot = 1; } def MipsAsmParserVariant : AsmParserVariant { diff --git a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp index 46cc99c..26426c0 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16FrameLowering.cpp @@ -39,7 +39,11 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF, const Mips16InstrInfo &TII = *static_cast<const Mips16InstrInfo *>(STI.getInstrInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); - DebugLoc dl = MBBI != MBB.end() ? 
MBBI->getDebugLoc() : DebugLoc(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; + uint64_t StackSize = MFI->getStackSize(); // No need to allocate space on the stack. @@ -107,7 +111,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); - MachineBasicBlock *EntryBlock = MF->begin(); + MachineBasicBlock *EntryBlock = &MF->front(); // // Registers RA, S0,S1 are the callee saved registers and they diff --git a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp index 893fc7c..b2bc7e7 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16HardFloat.cpp @@ -40,26 +40,17 @@ namespace { const MipsTargetMachine &TM; }; - class InlineAsmHelper { - LLVMContext &C; - BasicBlock *BB; - public: - InlineAsmHelper(LLVMContext &C_, BasicBlock *BB_) : - C(C_), BB(BB_) { - } - - void Out(StringRef AsmString) { - std::vector<llvm::Type *> AsmArgTypes; - std::vector<llvm::Value*> AsmArgs; - - llvm::FunctionType *AsmFTy = llvm::FunctionType::get(Type::getVoidTy(C), - AsmArgTypes, false); - llvm::InlineAsm *IA = llvm::InlineAsm::get(AsmFTy, AsmString, "", true, - /* IsAlignStack */ false, - llvm::InlineAsm::AD_ATT); - CallInst::Create(IA, AsmArgs, "", BB); - } - }; + static void EmitInlineAsm(LLVMContext &C, BasicBlock *BB, StringRef AsmText) { + std::vector<llvm::Type *> AsmArgTypes; + std::vector<llvm::Value *> AsmArgs; + + llvm::FunctionType *AsmFTy = + llvm::FunctionType::get(Type::getVoidTy(C), AsmArgTypes, false); + llvm::InlineAsm *IA = + llvm::InlineAsm::get(AsmFTy, AsmText, "", true, + /* IsAlignStack */ false, llvm::InlineAsm::AD_ATT); + CallInst::Create(IA, AsmArgs, "", BB); + } char Mips16HardFloat::ID = 0; } @@ -182,7 +173,7 @@ static bool needsFPReturnHelper(Function &F) { return whichFPReturnVariant(RetType) != NoFPRet; } -static bool needsFPReturnHelper(const FunctionType &FT) { +static bool needsFPReturnHelper(FunctionType &FT) { Type* RetType = FT.getReturnType(); return whichFPReturnVariant(RetType) != NoFPRet; } @@ -195,63 +186,72 @@ static bool needsFPHelperFromSig(Function &F) { // We swap between FP and Integer registers to allow Mips16 and Mips32 to // interoperate // -static void swapFPIntParams(FPParamVariant PV, Module *M, InlineAsmHelper &IAH, - bool LE, bool ToFP) { - //LLVMContext &Context = M->getContext(); - std::string MI = ToFP? "mtc1 ": "mfc1 "; +static std::string swapFPIntParams(FPParamVariant PV, Module *M, bool LE, + bool ToFP) { + std::string MI = ToFP ? 
"mtc1 ": "mfc1 "; + std::string AsmText; + switch (PV) { case FSig: - IAH.Out(MI + "$$4,$$f12"); + AsmText += MI + "$$4, $$f12\n"; break; + case FFSig: - IAH.Out(MI +"$$4,$$f12"); - IAH.Out(MI + "$$5,$$f14"); + AsmText += MI + "$$4, $$f12\n"; + AsmText += MI + "$$5, $$f14\n"; break; + case FDSig: - IAH.Out(MI + "$$4,$$f12"); + AsmText += MI + "$$4, $$f12\n"; if (LE) { - IAH.Out(MI + "$$6,$$f14"); - IAH.Out(MI + "$$7,$$f15"); + AsmText += MI + "$$6, $$f14\n"; + AsmText += MI + "$$7, $$f15\n"; } else { - IAH.Out(MI + "$$7,$$f14"); - IAH.Out(MI + "$$6,$$f15"); + AsmText += MI + "$$7, $$f14\n"; + AsmText += MI + "$$6, $$f15\n"; } break; + case DSig: if (LE) { - IAH.Out(MI + "$$4,$$f12"); - IAH.Out(MI + "$$5,$$f13"); + AsmText += MI + "$$4, $$f12\n"; + AsmText += MI + "$$5, $$f13\n"; } else { - IAH.Out(MI + "$$5,$$f12"); - IAH.Out(MI + "$$4,$$f13"); + AsmText += MI + "$$5, $$f12\n"; + AsmText += MI + "$$4, $$f13\n"; } break; + case DDSig: if (LE) { - IAH.Out(MI + "$$4,$$f12"); - IAH.Out(MI + "$$5,$$f13"); - IAH.Out(MI + "$$6,$$f14"); - IAH.Out(MI + "$$7,$$f15"); + AsmText += MI + "$$4, $$f12\n"; + AsmText += MI + "$$5, $$f13\n"; + AsmText += MI + "$$6, $$f14\n"; + AsmText += MI + "$$7, $$f15\n"; } else { - IAH.Out(MI + "$$5,$$f12"); - IAH.Out(MI + "$$4,$$f13"); - IAH.Out(MI + "$$7,$$f14"); - IAH.Out(MI + "$$6,$$f15"); + AsmText += MI + "$$5, $$f12\n"; + AsmText += MI + "$$4, $$f13\n"; + AsmText += MI + "$$7, $$f14\n"; + AsmText += MI + "$$6, $$f15\n"; } break; + case DFSig: if (LE) { - IAH.Out(MI + "$$4,$$f12"); - IAH.Out(MI + "$$5,$$f13"); + AsmText += MI + "$$4, $$f12\n"; + AsmText += MI + "$$5, $$f13\n"; } else { - IAH.Out(MI + "$$5,$$f12"); - IAH.Out(MI + "$$4,$$f13"); + AsmText += MI + "$$5, $$f12\n"; + AsmText += MI + "$$4, $$f13\n"; } - IAH.Out(MI + "$$6,$$f14"); + AsmText += MI + "$$6, $$f14\n"; break; + case NoSig: - return; + break; } + + return AsmText; } // @@ -282,68 +282,77 @@ static void assureFPCallStub(Function &F, Module *M, FStub->addFnAttr("nomips16"); FStub->setSection(SectionName); BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub); - InlineAsmHelper IAH(Context, BB); - IAH.Out(".set reorder"); FPReturnVariant RV = whichFPReturnVariant(FStub->getReturnType()); FPParamVariant PV = whichFPParamVariantNeeded(F); - swapFPIntParams(PV, M, IAH, LE, true); + + std::string AsmText; + AsmText += ".set reorder\n"; + AsmText += swapFPIntParams(PV, M, LE, true); if (RV != NoFPRet) { - IAH.Out("move $$18, $$31"); - IAH.Out("jal " + Name); + AsmText += "move $$18, $$31\n"; + AsmText += "jal " + Name + "\n"; } else { - IAH.Out("lui $$25,%hi(" + Name + ")"); - IAH.Out("addiu $$25,$$25,%lo(" + Name + ")" ); + AsmText += "lui $$25, %hi(" + Name + ")\n"; + AsmText += "addiu $$25, $$25, %lo(" + Name + ")\n"; } + switch (RV) { case FRet: - IAH.Out("mfc1 $$2,$$f0"); + AsmText += "mfc1 $$2, $$f0\n"; break; + case DRet: if (LE) { - IAH.Out("mfc1 $$2,$$f0"); - IAH.Out("mfc1 $$3,$$f1"); + AsmText += "mfc1 $$2, $$f0\n"; + AsmText += "mfc1 $$3, $$f1\n"; } else { - IAH.Out("mfc1 $$3,$$f0"); - IAH.Out("mfc1 $$2,$$f1"); + AsmText += "mfc1 $$3, $$f0\n"; + AsmText += "mfc1 $$2, $$f1\n"; } break; + case CFRet: if (LE) { - IAH.Out("mfc1 $$2,$$f0"); - IAH.Out("mfc1 $$3,$$f2"); + AsmText += "mfc1 $$2, $$f0\n"; + AsmText += "mfc1 $$3, $$f2\n"; } else { - IAH.Out("mfc1 $$3,$$f0"); - IAH.Out("mfc1 $$3,$$f2"); + AsmText += "mfc1 $$3, $$f0\n"; + AsmText += "mfc1 $$3, $$f2\n"; } break; + case CDRet: if (LE) { - IAH.Out("mfc1 $$4,$$f2"); - IAH.Out("mfc1 $$5,$$f3"); - IAH.Out("mfc1 $$2,$$f0"); - 
IAH.Out("mfc1 $$3,$$f1"); + AsmText += "mfc1 $$4, $$f2\n"; + AsmText += "mfc1 $$5, $$f3\n"; + AsmText += "mfc1 $$2, $$f0\n"; + AsmText += "mfc1 $$3, $$f1\n"; } else { - IAH.Out("mfc1 $$5,$$f2"); - IAH.Out("mfc1 $$4,$$f3"); - IAH.Out("mfc1 $$3,$$f0"); - IAH.Out("mfc1 $$2,$$f1"); + AsmText += "mfc1 $$5, $$f2\n"; + AsmText += "mfc1 $$4, $$f3\n"; + AsmText += "mfc1 $$3, $$f0\n"; + AsmText += "mfc1 $$2, $$f1\n"; } break; + case NoFPRet: break; } + if (RV != NoFPRet) - IAH.Out("jr $$18"); + AsmText += "jr $$18\n"; else - IAH.Out("jr $$25"); + AsmText += "jr $$25\n"; + EmitInlineAsm(Context, BB, AsmText); + new UnreachableInst(Context, BB); } // // Functions that are llvm intrinsics and don't need helpers. // -static const char *IntrinsicInline[] = { +static const char *const IntrinsicInline[] = { "fabs", "fabsf", "llvm.ceil.f32", "llvm.ceil.f64", "llvm.copysign.f32", "llvm.copysign.f64", @@ -395,7 +404,7 @@ static bool fixupFPReturnAndCall(Function &F, Module *M, Type *T = RVal->getType(); FPReturnVariant RV = whichFPReturnVariant(T); if (RV == NoFPRet) continue; - static const char* Helper[NoFPRet] = { + static const char *const Helper[NoFPRet] = { "__mips16_ret_sf", "__mips16_ret_df", "__mips16_ret_sc", "__mips16_ret_dc" }; @@ -419,11 +428,11 @@ static bool fixupFPReturnAndCall(Function &F, Module *M, CallInst::Create(F, Params, "", &Inst ); } else if (const CallInst *CI = dyn_cast<CallInst>(I)) { const Value* V = CI->getCalledValue(); - const Type* T = nullptr; + Type* T = nullptr; if (V) T = V->getType(); - const PointerType *PFT=nullptr; + PointerType *PFT = nullptr; if (T) PFT = dyn_cast<PointerType>(T); - const FunctionType *FT=nullptr; + FunctionType *FT = nullptr; if (PFT) FT = dyn_cast<FunctionType>(PFT->getElementType()); Function *F_ = CI->getCalledFunction(); if (FT && needsFPReturnHelper(*FT) && @@ -469,20 +478,21 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV, FStub->addFnAttr("nomips16"); FStub->setSection(SectionName); BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub); - InlineAsmHelper IAH(Context, BB); + + std::string AsmText; if (PicMode) { - IAH.Out(".set noreorder"); - IAH.Out(".cpload $$25"); - IAH.Out(".set reorder"); - IAH.Out(".reloc 0,R_MIPS_NONE," + Name); - IAH.Out("la $$25," + LocalName); - } - else { - IAH.Out("la $$25," + Name); - } - swapFPIntParams(PV, M, IAH, LE, false); - IAH.Out("jr $$25"); - IAH.Out(LocalName + " = " + Name); + AsmText += ".set noreorder\n"; + AsmText += ".cpload $$25\n"; + AsmText += ".set reorder\n"; + AsmText += ".reloc 0, R_MIPS_NONE, " + Name + "\n"; + AsmText += "la $$25, " + LocalName + "\n"; + } else + AsmText += "la $$25, " + Name + "\n"; + AsmText += swapFPIntParams(PV, M, LE, false); + AsmText += "jr $$25\n"; + AsmText += LocalName + " = " + Name + "\n"; + EmitInlineAsm(Context, BB, AsmText); + new UnreachableInst(FStub->getContext(), BB); } @@ -535,7 +545,7 @@ bool Mips16HardFloat::runOnModule(Module &M) { FPParamVariant V = whichFPParamVariantNeeded(*F); if (V != NoSig) { Modified = true; - createFPFnStub(F, &M, V, TM); + createFPFnStub(&*F, &M, V, TM); } } return Modified; diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp index bce2c1e..5a1c2c67 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp @@ -73,7 +73,7 @@ void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) { MachineBasicBlock::iterator I = MBB.begin(); MachineRegisterInfo &RegInfo = 
MF.getRegInfo(); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; unsigned V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg(); const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass; diff --git a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp index 3522cbb..e748325 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16ISelLowering.cpp @@ -530,8 +530,7 @@ emitSel16(unsigned Opc, MachineInstr *MI, MachineBasicBlock *BB) const { // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -592,8 +591,7 @@ Mips16TargetLowering::emitSelT16(unsigned Opc1, unsigned Opc2, MachineInstr *MI, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -657,8 +655,7 @@ Mips16TargetLowering::emitSeliT16(unsigned Opc1, unsigned Opc2, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp index a49572e..da8ada4 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.cpp @@ -196,7 +196,7 @@ static void addSaveRestoreRegs(MachineInstrBuilder &MIB, void Mips16InstrInfo::makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); const BitVector Reserved = RI.getReservedRegs(MF); @@ -263,7 +263,7 @@ void Mips16InstrInfo::adjustStackPtrBig(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Reg1, unsigned Reg2) const { - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; // // li reg1, constant // move reg2, sp @@ -446,7 +446,7 @@ const MCInstrDesc &Mips16InstrInfo::AddiuSpImm(int64_t Imm) const { void Mips16InstrInfo::BuildAddiuSpImm (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const { - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; BuildMI(MBB, I, DL, AddiuSpImm(Imm)).addImm(Imm); } diff --git a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td index 10fff03..dad6ea4 100644 --- a/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips16InstrInfo.td @@ -530,19 +530,19 @@ class MayStore { // Purpose: Add Immediate Unsigned Word (2-Operand, Extended) // To add a constant to a 32-bit integer. 
// -def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIAlu>; +def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIM16Alu>; -def AddiuRxRxImm16: F2RI16_ins<0b01001, "addiu", IIAlu>, +def AddiuRxRxImm16: F2RI16_ins<0b01001, "addiu", IIM16Alu>, ArithLogic16Defs<0> { let AddedComplexity = 5; } -def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIAlu>, +def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIM16Alu>, ArithLogic16Defs<0> { let isCodeGenOnly = 1; } def AddiuRxRyOffMemX16: - FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIAlu>; + FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIM16Alu>; // @@ -550,7 +550,7 @@ def AddiuRxRyOffMemX16: // Purpose: Add Immediate Unsigned Word (3-Operand, PC-Relative, Extended) // To add a constant to the program counter. // -def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>; +def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIM16Alu>; // // Format: ADDIU sp, immediate MIPS16e @@ -558,14 +558,14 @@ def AddiuRxPcImmX16: FEXT_RI16_PC_ins<0b00001, "addiu", IIAlu>; // To add a constant to the stack pointer. // def AddiuSpImm16 - : FI816_SP_ins<0b011, "addiu", IIAlu> { + : FI816_SP_ins<0b011, "addiu", IIM16Alu> { let Defs = [SP]; let Uses = [SP]; let AddedComplexity = 5; } def AddiuSpImmX16 - : FEXT_I816_SP_ins<0b011, "addiu", IIAlu> { + : FEXT_I816_SP_ins<0b011, "addiu", IIM16Alu> { let Defs = [SP]; let Uses = [SP]; } @@ -576,14 +576,14 @@ def AddiuSpImmX16 // To add 32-bit integers. // -def AdduRxRyRz16: FRRR16_ins<01, "addu", IIAlu>, ArithLogic16Defs<1>; +def AdduRxRyRz16: FRRR16_ins<01, "addu", IIM16Alu>, ArithLogic16Defs<1>; // // Format: AND rx, ry MIPS16e // Purpose: AND // To do a bitwise logical AND. -def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>; +def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIM16Alu>, ArithLogic16Defs<1>; // @@ -591,7 +591,7 @@ def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>; // Purpose: Branch on Equal to Zero // To test a GPR then do a PC-relative conditional branch. // -def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; +def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIM16Alu>, cbranch16; // @@ -599,7 +599,7 @@ def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; // Purpose: Branch on Equal to Zero (Extended) // To test a GPR then do a PC-relative conditional branch. // -def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; +def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIM16Alu>, cbranch16; // // Format: B offset MIPS16e @@ -607,27 +607,27 @@ def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16; // To do an unconditional PC-relative branch. // -def Bimm16: FI16_ins<0b00010, "b", IIAlu>, branch16; +def Bimm16: FI16_ins<0b00010, "b", IIM16Alu>, branch16; // Format: B offset MIPS16e // Purpose: Unconditional Branch // To do an unconditional PC-relative branch. // -def BimmX16: FEXT_I16_ins<0b00010, "b", IIAlu>, branch16; +def BimmX16: FEXT_I16_ins<0b00010, "b", IIM16Alu>, branch16; // // Format: BNEZ rx, offset MIPS16e // Purpose: Branch on Not Equal to Zero // To test a GPR then do a PC-relative conditional branch. // -def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16; +def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIM16Alu>, cbranch16; // // Format: BNEZ rx, offset MIPS16e // Purpose: Branch on Not Equal to Zero (Extended) // To test a GPR then do a PC-relative conditional branch. 
// -def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16; +def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIM16Alu>, cbranch16; // @@ -641,11 +641,11 @@ def Break16: FRRBreakNull16_ins<"break 0", NoItinerary>; // Purpose: Branch on T Equal to Zero (Extended) // To test special register T then do a PC-relative conditional branch. // -def Bteqz16: FI816_ins<0b000, "bteqz", IIAlu>, cbranch16 { +def Bteqz16: FI816_ins<0b000, "bteqz", IIM16Alu>, cbranch16 { let Uses = [T8]; } -def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16 { +def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIM16Alu>, cbranch16 { let Uses = [T8]; } @@ -669,11 +669,11 @@ def BteqzT8SltiuX16: FEXT_T8I8I16_ins<"bteqz", "sltiu">, // To test special register T then do a PC-relative conditional branch. // -def Btnez16: FI816_ins<0b001, "btnez", IIAlu>, cbranch16 { +def Btnez16: FI816_ins<0b001, "btnez", IIM16Alu>, cbranch16 { let Uses = [T8]; } -def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16 { +def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIM16Alu> ,cbranch16 { let Uses = [T8]; } @@ -695,7 +695,7 @@ def BtnezT8SltiuX16: FEXT_T8I8I16_ins<"btnez", "sltiu">, // Purpose: Compare // To compare the contents of two GPRs. // -def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIAlu> { +def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIM16Alu> { let Defs = [T8]; } @@ -704,7 +704,7 @@ def CmpRxRy16: FRR16R_ins<0b01010, "cmp", IIAlu> { // Purpose: Compare Immediate // To compare a constant with the contents of a GPR. // -def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIAlu> { +def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIM16Alu> { let Defs = [T8]; } @@ -713,7 +713,7 @@ def CmpiRxImm16: FRI16R_ins<0b01110, "cmpi", IIAlu> { // Purpose: Compare Immediate (Extended) // To compare a constant with the contents of a GPR. // -def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> { +def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIM16Alu> { let Defs = [T8]; } @@ -723,7 +723,7 @@ def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> { // Purpose: Divide Word // To divide 32-bit signed integers. // -def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> { +def DivRxRy16: FRR16_div_ins<0b11010, "div", IIM16Alu> { let Defs = [HI0, LO0]; } @@ -732,7 +732,7 @@ def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> { // Purpose: Divide Unsigned Word // To divide 32-bit unsigned integers. // -def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> { +def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIM16Alu> { let Defs = [HI0, LO0]; } // @@ -742,13 +742,13 @@ def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> { // region and preserve the current ISA. // -def Jal16 : FJAL16_ins<0b0, "jal", IIAlu> { +def Jal16 : FJAL16_ins<0b0, "jal", IIM16Alu> { let hasDelaySlot = 0; // not true, but we add the nop for now let isCall=1; let Defs = [RA]; } -def JalB16 : FJALB16_ins<0b0, "jal", IIAlu>, branch16 { +def JalB16 : FJALB16_ins<0b0, "jal", IIM16Alu>, branch16 { let hasDelaySlot = 0; // not true, but we add the nop for now let isBranch=1; let Defs = [RA]; @@ -761,7 +761,7 @@ def JalB16 : FJALB16_ins<0b0, "jal", IIAlu>, branch16 { // address register. 
// -def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu> { +def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIM16Alu> { let isBranch = 1; let isIndirectBranch = 1; let hasDelaySlot = 1; @@ -769,14 +769,14 @@ def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu> { let isBarrier=1; } -def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIAlu> { +def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> { let isBranch = 1; let isIndirectBranch = 1; let isTerminator=1; let isBarrier=1; } -def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIAlu> { +def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIM16Alu> { let isBranch = 1; let isIndirectBranch = 1; let isTerminator=1; @@ -825,16 +825,16 @@ def LhuRxRyOffMemX16: // Purpose: Load Immediate // To load a constant into a GPR. // -def LiRxImm16: FRI16_ins<0b01101, "li", IIAlu>; +def LiRxImm16: FRI16_ins<0b01101, "li", IIM16Alu>; // // Format: LI rx, immediate MIPS16e // Purpose: Load Immediate (Extended) // To load a constant into a GPR. // -def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>; +def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIM16Alu>; -def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIAlu> { +def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIM16Alu> { let isCodeGenOnly = 1; } @@ -863,21 +863,21 @@ def LwRxPcTcpX16: FEXT_RI16_TCP_ins<0b10110, "lw", II_LW>, MayLoad; // Purpose: Move // To move the contents of a GPR to a GPR. // -def Move32R16: FI8_MOV32R16_ins<"move", IIAlu>; +def Move32R16: FI8_MOV32R16_ins<"move", IIM16Alu>; // // Format: MOVE ry, r32 MIPS16e //Purpose: Move // To move the contents of a GPR to a GPR. // -def MoveR3216: FI8_MOVR3216_ins<"move", IIAlu>; +def MoveR3216: FI8_MOVR3216_ins<"move", IIM16Alu>; // // Format: MFHI rx MIPS16e // Purpose: Move From HI Register // To copy the special purpose HI register to a GPR. // -def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> { +def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIM16Alu> { let Uses = [HI0]; let hasSideEffects = 0; } @@ -887,7 +887,7 @@ def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> { // Purpose: Move From LO Register // To copy the special purpose LO register to a GPR. // -def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> { +def Mflo16: FRR16_M_ins<0b10010, "mflo", IIM16Alu> { let Uses = [LO0]; let hasSideEffects = 0; } @@ -895,13 +895,13 @@ def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> { // // Pseudo Instruction for mult // -def MultRxRy16: FMULT16_ins<"mult", IIAlu> { +def MultRxRy16: FMULT16_ins<"mult", IIM16Alu> { let isCommutable = 1; let hasSideEffects = 0; let Defs = [HI0, LO0]; } -def MultuRxRy16: FMULT16_ins<"multu", IIAlu> { +def MultuRxRy16: FMULT16_ins<"multu", IIM16Alu> { let isCommutable = 1; let hasSideEffects = 0; let Defs = [HI0, LO0]; @@ -912,7 +912,7 @@ def MultuRxRy16: FMULT16_ins<"multu", IIAlu> { // Purpose: Multiply Word // To multiply 32-bit signed integers. // -def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> { +def MultRxRyRz16: FMULT16_LO_ins<"mult", IIM16Alu> { let isCommutable = 1; let hasSideEffects = 0; let Defs = [HI0, LO0]; @@ -923,7 +923,7 @@ def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> { // Purpose: Multiply Unsigned Word // To multiply 32-bit unsigned integers. // -def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> { +def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIM16Alu> { let isCommutable = 1; let hasSideEffects = 0; let Defs = [HI0, LO0]; @@ -934,21 +934,21 @@ def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> { // Purpose: Negate // To negate an integer value. 
// -def NegRxRy16: FUnaryRR16_ins<0b11101, "neg", IIAlu>; +def NegRxRy16: FUnaryRR16_ins<0b11101, "neg", IIM16Alu>; // // Format: NOT rx, ry MIPS16e // Purpose: Not // To complement an integer value // -def NotRxRy16: FUnaryRR16_ins<0b01111, "not", IIAlu>; +def NotRxRy16: FUnaryRR16_ins<0b01111, "not", IIM16Alu>; // // Format: OR rx, ry MIPS16e // Purpose: Or // To do a bitwise logical OR. // -def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIAlu>, ArithLogic16Defs<1>; +def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIM16Alu>, ArithLogic16Defs<1>; // // Format: RESTORE {ra,}{s0/s1/s0-1,}{framesize} @@ -1012,7 +1012,7 @@ def SbRxRyOffMemX16: // Sign-extend least significant byte in register rx. // def SebRx16 - : FRR_SF16_ins<0b10001, 0b100, "seb", IIAlu>; + : FRR_SF16_ins<0b10001, 0b100, "seb", IIM16Alu>; // // Format: SEH rx MIPS16e @@ -1020,7 +1020,7 @@ def SebRx16 // Sign-extend least significant word in register rx. // def SehRx16 - : FRR_SF16_ins<0b10001, 0b101, "seh", IIAlu>; + : FRR_SF16_ins<0b10001, 0b101, "seh", IIM16Alu>; // // The Sel(T) instructions are pseudos @@ -1149,21 +1149,21 @@ def ShRxRyOffMemX16: // Purpose: Shift Word Left Logical (Extended) // To execute a left-shift of a word by a fixed number of bits-0 to 31 bits. // -def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>; +def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIM16Alu>; // // Format: SLLV ry, rx MIPS16e // Purpose: Shift Word Left Logical Variable // To execute a left-shift of a word by a variable number of bits. // -def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIAlu>; +def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIM16Alu>; // Format: SLTI rx, immediate MIPS16e // Purpose: Set on Less Than Immediate // To record the result of a less-than comparison with a constant. // // -def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIAlu> { +def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIM16Alu> { let Defs = [T8]; } @@ -1173,7 +1173,7 @@ def SltiRxImm16: FRI16R_ins<0b01010, "slti", IIAlu> { // To record the result of a less-than comparison with a constant. // // -def SltiRxImmX16: FEXT_RI16R_ins<0b01010, "slti", IIAlu> { +def SltiRxImmX16: FEXT_RI16R_ins<0b01010, "slti", IIM16Alu> { let Defs = [T8]; } @@ -1184,7 +1184,7 @@ def SltiCCRxImmX16: FEXT_CCRXI16_ins<"slti">; // To record the result of a less-than comparison with a constant. // // -def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIAlu> { +def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIM16Alu> { let Defs = [T8]; } @@ -1194,7 +1194,7 @@ def SltiuRxImm16: FRI16R_ins<0b01011, "sltiu", IIAlu> { // To record the result of a less-than comparison with a constant. // // -def SltiuRxImmX16: FEXT_RI16R_ins<0b01011, "sltiu", IIAlu> { +def SltiuRxImmX16: FEXT_RI16R_ins<0b01011, "sltiu", IIM16Alu> { let Defs = [T8]; } // @@ -1209,7 +1209,7 @@ def SltiuCCRxImmX16: FEXT_CCRXI16_ins<"sltiu">; // Purpose: Set on Less Than // To record the result of a less-than comparison. // -def SltRxRy16: FRR16R_ins<0b00010, "slt", IIAlu>{ +def SltRxRy16: FRR16R_ins<0b00010, "slt", IIM16Alu>{ let Defs = [T8]; } @@ -1219,7 +1219,7 @@ def SltCCRxRy16: FCCRR16_ins<"slt">; // Purpose: Set on Less Than Unsigned // To record the result of an unsigned less-than comparison. // -def SltuRxRy16: FRR16R_ins<0b00011, "sltu", IIAlu>{ +def SltuRxRy16: FRR16R_ins<0b00011, "sltu", IIM16Alu>{ let Defs = [T8]; } @@ -1236,7 +1236,7 @@ def SltuCCRxRy16: FCCRR16_ins<"sltu">; // To execute an arithmetic right-shift of a word by a variable // number of bits. 
// -def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>; +def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIM16Alu>; // @@ -1245,7 +1245,7 @@ def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>; // To execute an arithmetic right-shift of a word by a fixed // number of bits-1 to 8 bits. // -def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>; +def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIM16Alu>; // @@ -1254,7 +1254,7 @@ def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>; // To execute a logical right-shift of a word by a variable // number of bits. // -def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>; +def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIM16Alu>; // @@ -1263,14 +1263,14 @@ def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>; // To execute a logical right-shift of a word by a fixed // number of bits-1 to 31 bits. // -def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIAlu>; +def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIM16Alu>; // // Format: SUBU rz, rx, ry MIPS16e // Purpose: Subtract Unsigned Word // To subtract 32-bit integers // -def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIAlu>, ArithLogic16Defs<0>; +def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIM16Alu>, ArithLogic16Defs<0>; // // Format: SW ry, offset(rx) MIPS16e @@ -1294,7 +1294,7 @@ def SwRxSpImmX16: FEXT_RI16_SP_Store_explicit_ins // Purpose: Xor // To do a bitwise logical XOR. // -def XorRxRxRy16: FRxRxRy16_ins<0b01110, "xor", IIAlu>, ArithLogic16Defs<1>; +def XorRxRxRy16: FRxRxRy16_ins<0b01110, "xor", IIM16Alu>, ArithLogic16Defs<1>; class Mips16Pat<dag pattern, dag result> : Pat<pattern, result> { let Predicates = [InMips16Mode]; @@ -1380,7 +1380,7 @@ def: Mips16Pat<(brind CPU16Regs:$rs), (JrcRx16 CPU16Regs:$rs)> { let isCall=1, hasDelaySlot=0 in def JumpLinkReg16: FRR16_JALRC<0, 0, 0, (outs), (ins CPU16Regs:$rs), - "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], IIBranch> { + "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], II_JALRC> { let Defs = [RA]; } diff --git a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td index d6ab8a6..82d2c8e 100644 --- a/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips32r6InstrInfo.td @@ -186,54 +186,56 @@ class CMP_CONDN_DESC_BASE<string CondStr, string Typestr, multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string Typestr, RegisterOperand FGROpnd>{ - def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>, - CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_UN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>, - CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_EQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>, - CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UEQ>, - CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>, + let AdditionalPredicates = [NotInMicroMips] in { + def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_AF>, + CMP_CONDN_DESC_BASE<"af", Typestr, FGROpnd>, ISA_MIPS32R6, HARDFLOAT; - def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>, - CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_ULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULT>, - CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>, - CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>, - 
ISA_MIPS32R6, HARDFLOAT; - def CMP_ULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULE>, - CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SAF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SAF>, - CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SUN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUN>, - CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SEQ>, - CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SUEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUEQ>, - CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, + def CMP_UN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>, + CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd, setuo>, ISA_MIPS32R6, HARDFLOAT; - def CMP_SLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLT>, - CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULT>, - CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, + def CMP_EQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>, + CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd, setoeq>, ISA_MIPS32R6, HARDFLOAT; - def CMP_SLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLE>, - CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, - ISA_MIPS32R6, HARDFLOAT; - def CMP_SULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULE>, - CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, + def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UEQ>, + CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd, setueq>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>, + CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd, setolt>, ISA_MIPS32R6, HARDFLOAT; + def CMP_ULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULT>, + CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd, setult>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>, + CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd, setole>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_ULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULE>, + CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd, setule>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SAF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SAF>, + CMP_CONDN_DESC_BASE<"saf", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SUN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUN>, + CMP_CONDN_DESC_BASE<"sun", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SEQ>, + CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SUEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SUEQ>, + CMP_CONDN_DESC_BASE<"sueq", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLT>, + CMP_CONDN_DESC_BASE<"slt", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULT>, + CMP_CONDN_DESC_BASE<"sult", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SLE>, + CMP_CONDN_DESC_BASE<"sle", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + def CMP_SULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SULE>, + CMP_CONDN_DESC_BASE<"sule", Typestr, FGROpnd>, + ISA_MIPS32R6, HARDFLOAT; + } } //===----------------------------------------------------------------------===// @@ 
-557,7 +559,7 @@ class CACHE_HINT_DESC<string instr_asm, Operand MemOpnd, dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint); string AsmString = !strconcat(instr_asm, "\t$hint, $addr"); list<dag> Pattern = []; - string DecoderMethod = "DecodeCacheOpR6"; + string DecoderMethod = "DecodeCacheeOp_CacheOpR6"; } class CACHE_DESC : CACHE_HINT_DESC<"cache", mem_simm9, GPR32Opnd>; @@ -595,7 +597,7 @@ class LSA_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, list<dag> Pattern = []; } -class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2>; +class LSA_R6_DESC : LSA_R6_DESC_BASE<"lsa", GPR32Opnd, uimm2_plus1>; class LL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { dag OutOperandList = (outs GPROpnd:$rt); @@ -685,8 +687,10 @@ def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6; def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6; def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6; def CACHE_R6 : R6MMR6Rel, CACHE_ENC, CACHE_DESC, ISA_MIPS32R6; -def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def CLO_R6 : R6MMR6Rel, CLO_R6_ENC, CLO_R6_DESC, ISA_MIPS32R6; def CLZ_R6 : R6MMR6Rel, CLZ_R6_ENC, CLZ_R6_DESC, ISA_MIPS32R6; defm S : CMP_CC_M<FIELD_CMP_FORMAT_S, "s", FGR32Opnd>; @@ -702,39 +706,51 @@ def LSA_R6 : R6MMR6Rel, LSA_R6_ENC, LSA_R6_DESC, ISA_MIPS32R6; def LWC2_R6 : LWC2_R6_ENC, LWC2_R6_DESC, ISA_MIPS32R6; def LWPC : R6MMR6Rel, LWPC_ENC, LWPC_DESC, ISA_MIPS32R6; def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6; -def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def MOD : R6MMR6Rel, MOD_ENC, MOD_DESC, ISA_MIPS32R6; def MODU : R6MMR6Rel, MODU_ENC, MODU_DESC, ISA_MIPS32R6; -def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6, HARDFLOAT; +} def MUH : R6MMR6Rel, MUH_ENC, MUH_DESC, ISA_MIPS32R6; def MUHU : R6MMR6Rel, 
MUHU_ENC, MUHU_DESC, ISA_MIPS32R6; def MUL_R6 : R6MMR6Rel, MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6; def MULU : R6MMR6Rel, MULU_ENC, MULU_DESC, ISA_MIPS32R6; def NAL; // BAL with rd=0 def PREF_R6 : R6MMR6Rel, PREF_ENC, PREF_DESC, ISA_MIPS32R6; -def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SC_R6 : SC_R6_ENC, SC_R6_DESC, ISA_MIPS32R6; +let AdditionalPredicates = [NotInMicroMips] in { def SDBBP_R6 : SDBBP_R6_ENC, SDBBP_R6_DESC, ISA_MIPS32R6; +} def SDC2_R6 : SDC2_R6_ENC, SDC2_R6_DESC, ISA_MIPS32R6; def SELEQZ : R6MMR6Rel, SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6, GPR_32; -def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SELNEZ : R6MMR6Rel, SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6, GPR_32; -def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; -def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT; -def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT; +let AdditionalPredicates = [NotInMicroMips] in { + def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6, HARDFLOAT; + def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6, HARDFLOAT; + def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6, HARDFLOAT; +} def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6; //===----------------------------------------------------------------------===// @@ -743,7 +759,9 @@ def SWC2_R6 : SWC2_R6_ENC, SWC2_R6_DESC, ISA_MIPS32R6; // //===----------------------------------------------------------------------===// +let AdditionalPredicates = [NotInMicroMips] in { def : MipsInstAlias<"sdbbp", (SDBBP_R6 0)>, ISA_MIPS32R6; +} def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6; //===----------------------------------------------------------------------===// @@ -752,84 +770,78 @@ def : MipsInstAlias<"jr $rs", (JALR ZERO, GPR32Opnd:$rs), 1>, ISA_MIPS32R6; // //===----------------------------------------------------------------------===// -// f32 comparisons supported via another comparison -def : MipsPat<(setone f32:$lhs, f32:$rhs), - (NOR (CMP_UEQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seto f32:$lhs, f32:$rhs), - (NOR (CMP_UN_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(setune f32:$lhs, f32:$rhs), - (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seteq f32:$lhs, f32:$rhs), (CMP_EQ_S f32:$lhs, f32:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setgt f32:$lhs, f32:$rhs), (CMP_LE_S f32:$rhs, f32:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setge f32:$lhs, f32:$rhs), (CMP_LT_S f32:$rhs, f32:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setlt f32:$lhs, f32:$rhs), (CMP_LT_S f32:$lhs, f32:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setle f32:$lhs, f32:$rhs), (CMP_LE_S f32:$lhs, f32:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setne f32:$lhs, f32:$rhs), - (NOR (CMP_EQ_S f32:$lhs, f32:$rhs), ZERO)>, ISA_MIPS32R6; - -// f64 comparisons supported via another comparison -def : 
MipsPat<(setone f64:$lhs, f64:$rhs), - (NOR (CMP_UEQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seto f64:$lhs, f64:$rhs), - (NOR (CMP_UN_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(setune f64:$lhs, f64:$rhs), - (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; -def : MipsPat<(seteq f64:$lhs, f64:$rhs), (CMP_EQ_D f64:$lhs, f64:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setgt f64:$lhs, f64:$rhs), (CMP_LE_D f64:$rhs, f64:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setge f64:$lhs, f64:$rhs), (CMP_LT_D f64:$rhs, f64:$lhs)>, - ISA_MIPS32R6; -def : MipsPat<(setlt f64:$lhs, f64:$rhs), (CMP_LT_D f64:$lhs, f64:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setle f64:$lhs, f64:$rhs), (CMP_LE_D f64:$lhs, f64:$rhs)>, - ISA_MIPS32R6; -def : MipsPat<(setne f64:$lhs, f64:$rhs), - (NOR (CMP_EQ_D f64:$lhs, f64:$rhs), ZERO)>, ISA_MIPS32R6; +// comparisons supported via another comparison +multiclass Cmp_Pats<ValueType VT, Instruction NOROp, Register ZEROReg> { +def : MipsPat<(setone VT:$lhs, VT:$rhs), + (NOROp (!cast<Instruction>("CMP_UEQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +def : MipsPat<(seto VT:$lhs, VT:$rhs), + (NOROp (!cast<Instruction>("CMP_UN_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +def : MipsPat<(setune VT:$lhs, VT:$rhs), + (NOROp (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +def : MipsPat<(seteq VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs)>; +def : MipsPat<(setgt VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_LE_"#NAME) VT:$rhs, VT:$lhs)>; +def : MipsPat<(setge VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_LT_"#NAME) VT:$rhs, VT:$lhs)>; +def : MipsPat<(setlt VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_LT_"#NAME) VT:$lhs, VT:$rhs)>; +def : MipsPat<(setle VT:$lhs, VT:$rhs), + (!cast<Instruction>("CMP_LE_"#NAME) VT:$lhs, VT:$rhs)>; +def : MipsPat<(setne VT:$lhs, VT:$rhs), + (NOROp (!cast<Instruction>("CMP_EQ_"#NAME) VT:$lhs, VT:$rhs), ZEROReg)>; +} + +defm S : Cmp_Pats<f32, NOR, ZERO>, ISA_MIPS32R6; +defm D : Cmp_Pats<f64, NOR, ZERO>, ISA_MIPS32R6; // i32 selects +multiclass SelectInt_Pats<ValueType RC, Instruction OROp, Instruction XORiOp, + Instruction SLTiOp, Instruction SLTiuOp, + Instruction SELEQZOp, Instruction SELNEZOp, + SDPatternOperator imm_type, ValueType Opg> { +// reg, immz +def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, RC:$cond), (SELNEZOp RC:$f, RC:$cond))>; +def : MipsPat<(select (Opg (setne RC:$cond, immz)), RC:$t, RC:$f), + (OROp (SELNEZOp RC:$t, RC:$cond), (SELEQZOp RC:$f, RC:$cond))>; + +// reg, immZExt16[_64] +def : MipsPat<(select (Opg (seteq RC:$cond, imm_type:$imm)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)), + (SELNEZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>; +def : MipsPat<(select (Opg (setne RC:$cond, imm_type:$imm)), RC:$t, RC:$f), + (OROp (SELNEZOp RC:$t, (XORiOp RC:$cond, imm_type:$imm)), + (SELEQZOp RC:$f, (XORiOp RC:$cond, imm_type:$imm)))>; + +// reg, immSExt16Plus1 +def : MipsPat<(select (Opg (setgt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, (SLTiOp RC:$cond, (Plus1 imm:$imm))), + (SELNEZOp RC:$f, (SLTiOp RC:$cond, (Plus1 imm:$imm))))>; +def : MipsPat<(select (Opg (setugt RC:$cond, immSExt16Plus1:$imm)), RC:$t, RC:$f), + (OROp (SELEQZOp RC:$t, (SLTiuOp RC:$cond, (Plus1 imm:$imm))), + (SELNEZOp RC:$f, (SLTiuOp RC:$cond, (Plus1 imm:$imm))))>; + +def : MipsPat<(select (Opg (seteq RC:$cond, immz)), RC:$t, immz), + (SELEQZOp RC:$t, RC:$cond)>; +def : MipsPat<(select (Opg (setne 
RC:$cond, immz)), RC:$t, immz), + (SELNEZOp RC:$t, RC:$cond)>; +def : MipsPat<(select (Opg (seteq RC:$cond, immz)), immz, RC:$f), + (SELNEZOp RC:$f, RC:$cond)>; +def : MipsPat<(select (Opg (setne RC:$cond, immz)), immz, RC:$f), + (SELEQZOp RC:$f, RC:$cond)>; +} + +defm : SelectInt_Pats<i32, OR, XORi, SLTi, SLTiu, SELEQZ, SELNEZ, + immZExt16, i32>, ISA_MIPS32R6; + def : MipsPat<(select i32:$cond, i32:$t, i32:$f), - (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, i32:$f), - (OR (SELEQZ i32:$t, i32:$cond), (SELNEZ i32:$f, i32:$cond))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, i32:$f), - (OR (SELNEZ i32:$t, i32:$cond), (SELEQZ i32:$f, i32:$cond))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immZExt16:$imm)), i32:$t, i32:$f), - (OR (SELEQZ i32:$t, (XORi i32:$cond, immZExt16:$imm)), - (SELNEZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>, + (OR (SELNEZ i32:$t, i32:$cond), + (SELEQZ i32:$f, i32:$cond))>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immZExt16:$imm)), i32:$t, i32:$f), - (OR (SELNEZ i32:$t, (XORi i32:$cond, immZExt16:$imm)), - (SELEQZ i32:$f, (XORi i32:$cond, immZExt16:$imm)))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (setgt i32:$cond, immSExt16Plus1:$imm)), i32:$t, - i32:$f), - (OR (SELEQZ i32:$t, (SLTi i32:$cond, (Plus1 imm:$imm))), - (SELNEZ i32:$f, (SLTi i32:$cond, (Plus1 imm:$imm))))>, - ISA_MIPS32R6; -def : MipsPat<(select (i32 (setugt i32:$cond, immSExt16Plus1:$imm)), - i32:$t, i32:$f), - (OR (SELEQZ i32:$t, (SLTiu i32:$cond, (Plus1 imm:$imm))), - (SELNEZ i32:$f, (SLTiu i32:$cond, (Plus1 imm:$imm))))>, - ISA_MIPS32R6; - def : MipsPat<(select i32:$cond, i32:$t, immz), - (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immz)), i32:$t, immz), - (SELNEZ i32:$t, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immz)), i32:$t, immz), - (SELEQZ i32:$t, i32:$cond)>, ISA_MIPS32R6; + (SELNEZ i32:$t, i32:$cond)>, + ISA_MIPS32R6; def : MipsPat<(select i32:$cond, immz, i32:$f), - (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (setne i32:$cond, immz)), immz, i32:$f), - (SELEQZ i32:$f, i32:$cond)>, ISA_MIPS32R6; -def : MipsPat<(select (i32 (seteq i32:$cond, immz)), immz, i32:$f), - (SELNEZ i32:$f, i32:$cond)>, ISA_MIPS32R6; + (SELEQZ i32:$f, i32:$cond)>, + ISA_MIPS32R6; diff --git a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td index f917eca..cbdcdd7 100644 --- a/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips64InstrInfo.td @@ -16,10 +16,6 @@ //===----------------------------------------------------------------------===// // Unsigned Operand -def uimm5_64 : Operand<i64> { - let PrintMethod = "printUnsignedImm"; -} - def uimm16_64 : Operand<i64> { let PrintMethod = "printUnsignedImm"; } @@ -276,12 +272,20 @@ def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>; let isCodeGenOnly = 1 in def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM; -def DEXT : ExtBase<"dext", GPR64Opnd, uimm6, MipsExt>, EXT_FM<3>; -def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm6>, EXT_FM<2>; -def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5>, EXT_FM<1>; +let AdditionalPredicates = [NotInMicroMips] in { + // TODO: Add 'pos + size' constraint check to dext* instructions + // DEXT: 0 < pos + size <= 63 + // DEXTM, DEXTU: 32 < pos + size <= 64 + def DEXT : 
ExtBase<"dext", GPR64Opnd, uimm5, uimm5_plus1, MipsExt>, + EXT_FM<3>; + def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5, uimm5_plus33, MipsExt>, + EXT_FM<1>; + def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm5_plus32, uimm5_plus1, + MipsExt>, EXT_FM<2>; +} def DINS : InsBase<"dins", GPR64Opnd, uimm6, MipsIns>, EXT_FM<7>; -def DINSU : InsBase<"dinsu", GPR64Opnd, uimm6>, EXT_FM<6>; +def DINSU : InsBase<"dinsu", GPR64Opnd, uimm5_plus32>, EXT_FM<6>; def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5>, EXT_FM<5>; let isCodeGenOnly = 1, rs = 0, shamt = 0 in { @@ -341,11 +345,11 @@ class SetCC64_I<string opstr, PatFrag cond_op>: } class CBranchBitNum<string opstr, DAGOperand opnd, PatFrag cond_op, - RegisterOperand RO, bits<64> shift = 1> : - InstSE<(outs), (ins RO:$rs, uimm5_64:$p, opnd:$offset), + RegisterOperand RO, Operand ImmOp, bits<64> shift = 1> : + InstSE<(outs), (ins RO:$rs, ImmOp:$p, opnd:$offset), !strconcat(opstr, "\t$rs, $p, $offset"), [(brcond (i32 (cond_op (and RO:$rs, (shl shift, immZExt5_64:$p)), 0)), - bb:$offset)], IIBranch, FrmI, opstr> { + bb:$offset)], II_BBIT, FrmI, opstr> { let isBranch = 1; let isTerminator = 1; let hasDelaySlot = 1; @@ -363,14 +367,17 @@ def BADDu : ArithLogicR<"baddu", GPR64Opnd, 1, II_BADDU>, ADD_FM<0x1c, 0x28>; // Branch on Bit Clear /+32 -def BBIT0 : CBranchBitNum<"bbit0", brtarget, seteq, GPR64Opnd>, BBIT_FM<0x32>; -def BBIT032: CBranchBitNum<"bbit032", brtarget, seteq, GPR64Opnd, 0x100000000>, +def BBIT0 : CBranchBitNum<"bbit0", brtarget, seteq, GPR64Opnd, + uimm5_64_report_uimm6>, BBIT_FM<0x32>; +def BBIT032: CBranchBitNum<"bbit032", brtarget, seteq, GPR64Opnd, uimm5_64, + 0x100000000>, BBIT_FM<0x36>; // Branch on Bit Set /+32 -def BBIT1 : CBranchBitNum<"bbit1", brtarget, setne, GPR64Opnd>, BBIT_FM<0x3a>; -def BBIT132: CBranchBitNum<"bbit132", brtarget, setne, GPR64Opnd, 0x100000000>, - BBIT_FM<0x3e>; +def BBIT1 : CBranchBitNum<"bbit1", brtarget, setne, GPR64Opnd, + uimm5_64_report_uimm6>, BBIT_FM<0x3a>; +def BBIT132: CBranchBitNum<"bbit132", brtarget, setne, GPR64Opnd, uimm5_64, + 0x100000000>, BBIT_FM<0x3e>; // Multiply Doubleword to GPR let Defs = [HI0, LO0, P0, P1, P2] in @@ -544,10 +551,25 @@ def : MipsPat<(brcond (i32 (setne (and i64:$lhs, PowerOf2HI:$mask), 0)), bb:$dst (BBIT132 i64:$lhs, (Log2HI PowerOf2HI:$mask), bb:$dst)>; } +// Atomic load patterns. +def : MipsPat<(atomic_load_8 addr:$a), (LB64 addr:$a)>; +def : MipsPat<(atomic_load_16 addr:$a), (LH64 addr:$a)>; +def : MipsPat<(atomic_load_32 addr:$a), (LW64 addr:$a)>; +def : MipsPat<(atomic_load_64 addr:$a), (LD addr:$a)>; + +// Atomic store patterns. +def : MipsPat<(atomic_store_8 addr:$a, GPR64:$v), (SB64 GPR64:$v, addr:$a)>; +def : MipsPat<(atomic_store_16 addr:$a, GPR64:$v), (SH64 GPR64:$v, addr:$a)>; +def : MipsPat<(atomic_store_32 addr:$a, GPR64:$v), (SW64 GPR64:$v, addr:$a)>; +def : MipsPat<(atomic_store_64 addr:$a, GPR64:$v), (SD GPR64:$v, addr:$a)>; + //===----------------------------------------------------------------------===// // Instruction aliases //===----------------------------------------------------------------------===// def : MipsInstAlias<"move $dst, $src", + (OR64 GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>, + GPR_64; +def : MipsInstAlias<"move $dst, $src", (DADDu GPR64Opnd:$dst, GPR64Opnd:$src, ZERO_64), 1>, GPR_64; def : MipsInstAlias<"daddu $rs, $rt, $imm", @@ -617,6 +639,38 @@ def : MipsInstAlias<"syncw", (SYNC 0x4), 0>; def : MipsInstAlias<"syncws", (SYNC 0x5), 0>; } +// cnMIPS Aliases. 
+ +// bbit* with $p 32-63 converted to bbit*32 with $p 0-31 +def : MipsInstAlias<"bbit0 $rs, $p, $offset", + (BBIT032 GPR64Opnd:$rs, uimm5_plus32_normalize_64:$p, + brtarget:$offset), 0>, + ASE_CNMIPS; +def : MipsInstAlias<"bbit1 $rs, $p, $offset", + (BBIT132 GPR64Opnd:$rs, uimm5_plus32_normalize_64:$p, + brtarget:$offset), 0>, + ASE_CNMIPS; + +// exts with $pos 32-63 in converted to exts32 with $pos 0-31 +def : MipsInstAlias<"exts $rt, $rs, $pos, $lenm1", + (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rs, + uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>, + ASE_CNMIPS; +def : MipsInstAlias<"exts $rt, $pos, $lenm1", + (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rt, + uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>, + ASE_CNMIPS; + +// cins with $pos 32-63 in converted to cins32 with $pos 0-31 +def : MipsInstAlias<"cins $rt, $rs, $pos, $lenm1", + (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rs, + uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>, + ASE_CNMIPS; +def : MipsInstAlias<"cins $rt, $pos, $lenm1", + (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rt, + uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>, + ASE_CNMIPS; + //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions //===----------------------------------------------------------------------===// @@ -625,3 +679,8 @@ class LoadImmediate64<string instr_asm, Operand Od, RegisterOperand RO> : MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm64), !strconcat(instr_asm, "\t$rt, $imm64")> ; def LoadImm64 : LoadImmediate64<"dli", imm64, GPR64Opnd>; + +def LoadAddrReg64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins mem:$addr), + "dla\t$rt, $addr">; +def LoadAddrImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins imm64:$imm64), + "dla\t$rt, $imm64">; diff --git a/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td index 6b546e8..6f34dbe 100644 --- a/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/Mips64r6InstrInfo.td @@ -62,7 +62,7 @@ class DCLO_R6_DESC : CLO_R6_DESC_BASE<"dclo", GPR64Opnd>; class DCLZ_R6_DESC : CLZ_R6_DESC_BASE<"dclz", GPR64Opnd>; class DDIV_DESC : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd, sdiv>; class DDIVU_DESC : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd, udiv>; -class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2>; +class DLSA_R6_DESC : LSA_R6_DESC_BASE<"dlsa", GPR64Opnd, uimm2_plus1>; class DMOD_DESC : DIVMOD_DESC_BASE<"dmod", GPR64Opnd, srem>; class DMODU_DESC : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd, urem>; class DMUH_DESC : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd, mulhs>; @@ -81,10 +81,12 @@ class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>; // //===----------------------------------------------------------------------===// -def DAHI : DAHI_ENC, DAHI_DESC, ISA_MIPS64R6; -def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6; -def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6; -def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6; +let AdditionalPredicates = [NotInMicroMips] in { + def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6; + def DAHI : DAHI_ENC, DAHI_DESC, ISA_MIPS64R6; + def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6; + def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6; +} def DBITSWAP : DBITSWAP_ENC, DBITSWAP_DESC, ISA_MIPS64R6; def DCLO_R6 : DCLO_R6_ENC, DCLO_R6_DESC, ISA_MIPS64R6; def DCLZ_R6 : DCLZ_R6_ENC, DCLZ_R6_DESC, ISA_MIPS64R6; diff --git a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index fdba064..9575293 100644 --- a/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ 
b/contrib/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -169,12 +169,12 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (MCPE.isMachineConstantPoolEntry()) EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal); else - EmitGlobalConstant(MCPE.Val.ConstVal); + EmitGlobalConstant(MF->getDataLayout(), MCPE.Val.ConstVal); return; } - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); do { @@ -202,7 +202,7 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { llvm_unreachable("Pseudo opcode found in EmitInstruction()"); MCInst TmpInst0; - MCInstLowering.Lower(I, TmpInst0); + MCInstLowering.Lower(&*I, TmpInst0); EmitToStreamer(*OutStreamer, TmpInst0); } while ((++I != E) && I->isInsideBundle()); // Delay slot check } @@ -405,7 +405,7 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* // If this is a landing pad, it isn't a fall through. If it has no preds, // then nothing falls through to it. - if (MBB->isLandingPad() || MBB->pred_empty()) + if (MBB->isEHPad() || MBB->pred_empty()) return false; // If there isn't exactly one predecessor, it can't be a fall through. @@ -559,7 +559,6 @@ bool MipsAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, raw_ostream &O) { - const DataLayout *DL = TM.getDataLayout(); const MachineOperand &MO = MI->getOperand(opNum); bool closeP = false; @@ -608,7 +607,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum, } case MachineOperand::MO_ConstantPoolIndex: - O << DL->getPrivateGlobalPrefix() << "CPI" + O << getDataLayout().getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_" << MO.getIndex(); if (MO.getOffset()) O << "+" << MO.getOffset(); @@ -1009,7 +1008,7 @@ void MipsAsmPrinter::EmitFPCallStub( // // Mov $18, $31 - EmitInstrRegRegReg(*STI, Mips::ADDu, Mips::S2, Mips::RA, Mips::ZERO); + EmitInstrRegRegReg(*STI, Mips::OR, Mips::S2, Mips::RA, Mips::ZERO); EmitSwapFPIntParams(*STI, Signature->ParamSig, LE, true); diff --git a/contrib/llvm/lib/Target/Mips/MipsCCState.cpp b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp index b808129..d82063e 100644 --- a/contrib/llvm/lib/Target/Mips/MipsCCState.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsCCState.cpp @@ -29,22 +29,16 @@ static bool isF128SoftLibCall(const char *CallSym) { "powl", "rintl", "sinl", "sqrtl", "truncl"}; - const char *const *End = LibCalls + array_lengthof(LibCalls); - // Check that LibCalls is sorted alphabetically. - MipsTargetLowering::LTStr Comp; - -#ifndef NDEBUG - for (const char *const *I = LibCalls; I < End - 1; ++I) - assert(Comp(*I, *(I + 1))); -#endif - - return std::binary_search(LibCalls, End, CallSym, Comp); + auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; }; + assert(std::is_sorted(std::begin(LibCalls), std::end(LibCalls), Comp)); + return std::binary_search(std::begin(LibCalls), std::end(LibCalls), + CallSym, Comp); } /// This function returns true if Ty is fp128, {f128} or i128 which was /// originally a fp128. 
-static bool originalTypeIsF128(const Type *Ty, const SDNode *CallNode) { +static bool originalTypeIsF128(Type *Ty, const SDNode *CallNode) { if (Ty->isFP128Ty()) return true; diff --git a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td index 93e1908..0b4b778 100644 --- a/contrib/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/contrib/llvm/lib/Target/Mips/MipsCallingConv.td @@ -427,3 +427,28 @@ def CSR_Mips16RetHelper : CalleeSavedRegs<(add V0, V1, FP, (sequence "A%u", 3, 0), (sequence "S%u", 7, 0), (sequence "D%u", 15, 10))>; + +def CSR_Interrupt_32R6 : CalleeSavedRegs<(add (sequence "A%u", 3, 0), + (sequence "S%u", 7, 0), + (sequence "V%u", 1, 0), + (sequence "T%u", 9, 0), + RA, FP, GP, AT)>; + +def CSR_Interrupt_32 : CalleeSavedRegs<(add (sequence "A%u", 3, 0), + (sequence "S%u", 7, 0), + (sequence "V%u", 1, 0), + (sequence "T%u", 9, 0), + RA, FP, GP, AT, LO0, HI0)>; + +def CSR_Interrupt_64R6 : CalleeSavedRegs<(add (sequence "A%u_64", 3, 0), + (sequence "V%u_64", 1, 0), + (sequence "S%u_64", 7, 0), + (sequence "T%u_64", 9, 0), + RA_64, FP_64, GP_64, AT_64)>; + +def CSR_Interrupt_64 : CalleeSavedRegs<(add (sequence "A%u_64", 3, 0), + (sequence "S%u_64", 7, 0), + (sequence "T%u_64", 9, 0), + (sequence "V%u_64", 1, 0), + RA_64, FP_64, GP_64, AT_64, + LO0_64, HI0_64)>; diff --git a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index 96553d2..ea8c587 100644 --- a/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -560,7 +560,7 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { // identity mapping of CPI's to CPE's. const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants(); - const DataLayout &TD = *MF->getTarget().getDataLayout(); + const DataLayout &TD = MF->getDataLayout(); for (unsigned i = 0, e = CPs.size(); i != e; ++i) { unsigned Size = TD.getTypeAllocSize(CPs[i].getType()); assert(Size >= 4 && "Too small constant pool entry"); @@ -598,12 +598,12 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) { /// into the block immediately after it. static bool BBHasFallthrough(MachineBasicBlock *MBB) { // Get the next machine basic block in the function. - MachineFunction::iterator MBBI = MBB; + MachineFunction::iterator MBBI = MBB->getIterator(); // Can't fall off end of function. if (std::next(MBBI) == MBB->getParent()->end()) return false; - MachineBasicBlock *NextBB = std::next(MBBI); + MachineBasicBlock *NextBB = &*std::next(MBBI); for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) if (*I == NextBB) @@ -656,11 +656,11 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) { // alignment assumptions, as we don't know for sure the size of any // instructions in the inline assembly. for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) - computeBlockSize(I); + computeBlockSize(&*I); // Compute block offsets. - adjustBBOffsetsAfter(MF->begin()); + adjustBBOffsetsAfter(&MF->front()); // Now go back through the instructions and build up our data structures. for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); @@ -879,7 +879,7 @@ MachineBasicBlock *MipsConstantIslands::splitBlockBeforeInstr // Create a new MBB for the code after the OrigBB. 
MachineBasicBlock *NewBB = MF->CreateMachineBasicBlock(OrigBB->getBasicBlock()); - MachineFunction::iterator MBBI = OrigBB; ++MBBI; + MachineFunction::iterator MBBI = ++OrigBB->getIterator(); MF->insert(MBBI, NewBB); // Splice the instructions starting with MI over to NewBB. @@ -967,8 +967,8 @@ bool MipsConstantIslands::isWaterInRange(unsigned UserOffset, unsigned CPELogAlign = getCPELogAlign(U.CPEMI); unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign); unsigned NextBlockOffset, NextBlockAlignment; - MachineFunction::const_iterator NextBlock = Water; - if (++NextBlock == MF->end()) { + MachineFunction::const_iterator NextBlock = ++Water->getIterator(); + if (NextBlock == MF->end()) { NextBlockOffset = BBInfo[Water->getNumber()].postOffset(); NextBlockAlignment = 0; } else { @@ -1261,7 +1261,7 @@ void MipsConstantIslands::createNewWater(unsigned CPUserIndex, if (isOffsetInRange(UserOffset, CPEOffset, U)) { DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber() << format(", expected CPE offset %#x\n", CPEOffset)); - NewMBB = std::next(MachineFunction::iterator(UserMBB)); + NewMBB = &*++UserMBB->getIterator(); // Add an unconditional branch from UserMBB to fallthrough block. Record // it for branch lengthening; this new branch will not get out of range, // but if the preceding conditional branch is out of range, the targets @@ -1371,8 +1371,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { NewWaterList.insert(NewIsland); // The new CPE goes before the following block (NewMBB). - NewMBB = std::next(MachineFunction::iterator(WaterBB)); - + NewMBB = &*++WaterBB->getIterator(); } else { // No water found. // we first see if a longer form of the instrucion could have reached @@ -1389,7 +1388,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { // next iteration for constant pools, but in this context, we don't want // it. Check for this so it will be removed from the WaterList. // Also remove any entry from NewWaterList. - MachineBasicBlock *WaterBB = std::prev(MachineFunction::iterator(NewMBB)); + MachineBasicBlock *WaterBB = &*--NewMBB->getIterator(); IP = std::find(WaterList.begin(), WaterList.end(), WaterBB); if (IP != WaterList.end()) NewWaterList.erase(WaterBB); @@ -1406,7 +1405,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { WaterList.erase(IP); // Okay, we know we can put an island before NewMBB now, do it! - MF->insert(NewMBB, NewIsland); + MF->insert(NewMBB->getIterator(), NewIsland); // Update internal data structures to account for the newly inserted MBB. updateForInsertedWaterBlock(NewIsland); @@ -1431,9 +1430,7 @@ bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) { // Increase the size of the island block to account for the new entry. BBInfo[NewIsland->getNumber()].Size += Size; - adjustBBOffsetsAfter(std::prev(MachineFunction::iterator(NewIsland))); - - + adjustBBOffsetsAfter(&*--NewIsland->getIterator()); // Finally, change the CPI in the instruction operand to be ID. for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i) @@ -1645,7 +1642,7 @@ MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) { MBB->back().eraseFromParent(); // BBInfo[SplitBB].Offset is wrong temporarily, fixed below } - MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB)); + MachineBasicBlock *NextBB = &*++MBB->getIterator(); DEBUG(dbgs() << " Insert B to BB#" << DestBB->getNumber() << " also invert condition and change dest. 
to BB#" diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td index b5d52ce..f959bd4 100644 --- a/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrFormats.td @@ -7,10 +7,30 @@ // //===----------------------------------------------------------------------===// +class DspMMRel; + +def Dsp2MicroMips : InstrMapping { + let FilterClass = "DspMMRel"; + // Instructions with the same BaseOpcode and isNVStore values form a row. + let RowFields = ["BaseOpcode"]; + // Instructions with the same predicate sense form a column. + let ColFields = ["Arch"]; + // The key column is the unpredicated instructions. + let KeyCol = ["dsp"]; + // Value columns are PredSense=true and PredSense=false + let ValueCols = [["dsp"], ["mmdsp"]]; +} + def HasDSP : Predicate<"Subtarget->hasDSP()">, AssemblerPredicate<"FeatureDSP">; def HasDSPR2 : Predicate<"Subtarget->hasDSPR2()">, AssemblerPredicate<"FeatureDSPR2">; +def HasDSPR3 : Predicate<"Subtarget->hasDSPR3()">, + AssemblerPredicate<"FeatureDSPR3">; + +class ISA_DSPR2 { + list<Predicate> InsnPredicates = [HasDSPR2]; +} // Fields. class Field6<bits<6> val> { @@ -20,14 +40,22 @@ class Field6<bits<6> val> { def SPECIAL3_OPCODE : Field6<0b011111>; def REGIMM_OPCODE : Field6<0b000001>; -class DSPInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> { - let Predicates = [HasDSP]; +class DSPInst<string opstr = ""> + : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, PredicateControl { + let InsnPredicates = [HasDSP]; + string BaseOpcode = opstr; + string Arch = "dsp"; } class PseudoDSP<dag outs, dag ins, list<dag> pattern, - InstrItinClass itin = IIPseudo>: - MipsPseudo<outs, ins, pattern, itin> { - let Predicates = [HasDSP]; + InstrItinClass itin = IIPseudo> + : MipsPseudo<outs, ins, pattern, itin>, PredicateControl { + let InsnPredicates = [HasDSP]; +} + +class DSPInstAlias<string Asm, dag Result, bit Emit = 0b1> + : InstAlias<Asm, Result, Emit>, PredicateControl { + let InsnPredicates = [HasDSP]; } // ADDU.QB sub-class format. 
diff --git a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td index d268384..da6f174 100644 --- a/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MipsDSPInstrInfo.td @@ -12,9 +12,11 @@ //===----------------------------------------------------------------------===// // ImmLeaf +def immZExt1 : ImmLeaf<i32, [{return isUInt<1>(Imm);}]>; def immZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}]>; def immZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}]>; def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>; +def immZExt7 : ImmLeaf<i32, [{return isUInt<7>(Imm);}]>; def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>; def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>; def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>; @@ -263,6 +265,7 @@ class ADDU_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -273,6 +276,7 @@ class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rs"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -293,6 +297,7 @@ class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -304,6 +309,7 @@ class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode, list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; + string BaseOpcode = instr_asm; } class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -314,6 +320,7 @@ class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rt"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -323,6 +330,7 @@ class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $imm"); list<dag> Pattern = [(set RO:$rd, (OpNode immPat:$imm))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -332,17 +340,19 @@ class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa"); list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs_sa))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, SDPatternOperator ImmPat, InstrItinClass itin, - RegisterOperand RO> { + RegisterOperand RO, Operand ImmOpnd> { dag OutOperandList = (outs RO:$rd); - dag InOperandList = (ins RO:$rt, uimm16:$rs_sa); + dag InOperandList = (ins RO:$rt, ImmOpnd:$rs_sa); string AsmString = 
!strconcat(instr_asm, "\t$rd, $rt, $rs_sa"); list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, ImmPat:$rs_sa))]; InstrItinClass Itinerary = itin; bit hasSideEffects = 1; + string BaseOpcode = instr_asm; } class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -353,6 +363,7 @@ class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode, list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, iPTR:$index))]; InstrItinClass Itinerary = itin; bit mayLoad = 1; + string BaseOpcode = instr_asm; } class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -363,17 +374,19 @@ class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt"); list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - SDPatternOperator ImmOp, InstrItinClass itin> { + Operand ImmOp, SDPatternOperator Imm, InstrItinClass itin> { dag OutOperandList = (outs GPR32Opnd:$rt); - dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$sa, GPR32Opnd:$src); + dag InOperandList = (ins GPR32Opnd:$rs, ImmOp:$sa, GPR32Opnd:$src); string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa"); list<dag> Pattern = [(set GPR32Opnd:$rt, - (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, ImmOp:$sa))]; + (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, Imm:$sa))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; + string BaseOpcode = instr_asm; } class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -382,6 +395,7 @@ class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode, dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$shift_rs); string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs"); InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -390,15 +404,17 @@ class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode, dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm16:$shift_rs); string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs"); InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class SHILO_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { dag OutOperandList = (outs ACC64DSPOpnd:$ac); - dag InOperandList = (ins simm16:$shift, ACC64DSPOpnd:$acin); + dag InOperandList = (ins simm6:$shift, ACC64DSPOpnd:$acin); string AsmString = !strconcat(instr_asm, "\t$ac, $shift"); list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode immSExt6:$shift, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { @@ -408,6 +424,7 @@ class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { @@ -417,6 +434,7 @@ class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -426,15 +444,17 @@ class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode, string AsmString 
= !strconcat(instr_asm, "\t$rd, $mask"); list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode, InstrItinClass itin> { dag OutOperandList = (outs); - dag InOperandList = (ins GPR32Opnd:$rs, uimm16:$mask); + dag InOperandList = (ins GPR32Opnd:$rs, uimm10:$mask); string AsmString = !strconcat(instr_asm, "\t$rs, $mask"); list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { @@ -444,6 +464,7 @@ class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> { list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))]; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -454,6 +475,7 @@ class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode, list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt))]; InstrItinClass Itinerary = itin; bit isCommutable = 1; + string BaseOpcode = instr_asm; } class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -465,6 +487,7 @@ class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))]; InstrItinClass Itinerary = itin; string Constraints = "$acin = $ac"; + string BaseOpcode = instr_asm; } class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode, @@ -474,6 +497,7 @@ class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode, string AsmString = !strconcat(instr_asm, "\t$rd, $ac"); list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))]; InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> { @@ -481,6 +505,7 @@ class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> dag InOperandList = (ins GPR32Opnd:$rs); string AsmString = !strconcat(instr_asm, "\t$rs, $ac"); InstrItinClass Itinerary = itin; + string BaseOpcode = instr_asm; } class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> : @@ -506,6 +531,7 @@ class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode, list<dag> Pattern = [(set GPR32Opnd:$rt, (OpNode GPR32Opnd:$src, GPR32Opnd:$rs))]; InstrItinClass Itinerary = itin; string Constraints = "$src = $rt"; + string BaseOpcode = instr_asm; } //===----------------------------------------------------------------------===// @@ -639,7 +665,7 @@ class PRECEU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbra", // Shift class SHLL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shll.qb", null_frag, immZExt3, - NoItinerary, DSPROpnd>, + NoItinerary, DSPROpnd, uimm3>, Defs<[DSPOutFlag22]>; class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb, @@ -647,13 +673,13 @@ class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb, Defs<[DSPOutFlag22]>; class SHRL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shrl.qb", null_frag, immZExt3, - NoItinerary, DSPROpnd>; + NoItinerary, DSPROpnd, uimm3>; class SHRLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.qb", int_mips_shrl_qb, NoItinerary, DSPROpnd>; class SHLL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll.ph", null_frag, immZExt4, - NoItinerary, DSPROpnd>, + NoItinerary, DSPROpnd, uimm4>, Defs<[DSPOutFlag22]>; class 
SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph, @@ -661,7 +687,8 @@ class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph, Defs<[DSPOutFlag22]>; class SHLL_S_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.ph", int_mips_shll_s_ph, - immZExt4, NoItinerary, DSPROpnd>, + immZExt4, NoItinerary, DSPROpnd, + uimm4>, Defs<[DSPOutFlag22]>; class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph, @@ -669,19 +696,21 @@ class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph, Defs<[DSPOutFlag22]>; class SHRA_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra.ph", null_frag, immZExt4, - NoItinerary, DSPROpnd>; + NoItinerary, DSPROpnd, uimm4>; class SHRAV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav.ph", int_mips_shra_ph, NoItinerary, DSPROpnd>; class SHRA_R_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.ph", int_mips_shra_r_ph, - immZExt4, NoItinerary, DSPROpnd>; + immZExt4, NoItinerary, DSPROpnd, + uimm4>; class SHRAV_R_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.ph", int_mips_shra_r_ph, NoItinerary, DSPROpnd>; class SHLL_S_W_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.w", int_mips_shll_s_w, - immZExt5, NoItinerary, GPR32Opnd>, + immZExt5, NoItinerary, GPR32Opnd, + uimm5>, Defs<[DSPOutFlag22]>; class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w, @@ -689,7 +718,8 @@ class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w, Defs<[DSPOutFlag22]>; class SHRA_R_W_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.w", int_mips_shra_r_w, - immZExt5, NoItinerary, GPR32Opnd>; + immZExt5, NoItinerary, GPR32Opnd, + uimm5>; class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w, NoItinerary, GPR32Opnd>; @@ -1039,32 +1069,33 @@ class PRECR_SRA_R_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra_r.ph.w", // Shift class SHRA_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra.qb", null_frag, immZExt3, - NoItinerary, DSPROpnd>; + NoItinerary, DSPROpnd, uimm3>; class SHRAV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav.qb", int_mips_shra_qb, NoItinerary, DSPROpnd>; class SHRA_R_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.qb", int_mips_shra_r_qb, - immZExt3, NoItinerary, DSPROpnd>; + immZExt3, NoItinerary, DSPROpnd, + uimm3>; class SHRAV_R_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.qb", int_mips_shra_r_qb, NoItinerary, DSPROpnd>; class SHRL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shrl.ph", null_frag, immZExt4, - NoItinerary, DSPROpnd>; + NoItinerary, DSPROpnd, uimm4>; class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph, NoItinerary, DSPROpnd>; // Misc -class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, immZExt5, +class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, uimm5, immZExt5, NoItinerary>; -class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, immZExt2, +class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, uimm2, immZExt2, NoItinerary>; -class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, immZExt5, - NoItinerary>; +class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, uimm5, + immZExt5, NoItinerary>; // Pseudos. def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32, @@ -1072,80 +1103,80 @@ def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32, // Instruction defs. 
// MIPS DSP Rev 1 -def ADDU_QB : ADDU_QB_ENC, ADDU_QB_DESC; -def ADDU_S_QB : ADDU_S_QB_ENC, ADDU_S_QB_DESC; -def SUBU_QB : SUBU_QB_ENC, SUBU_QB_DESC; -def SUBU_S_QB : SUBU_S_QB_ENC, SUBU_S_QB_DESC; -def ADDQ_PH : ADDQ_PH_ENC, ADDQ_PH_DESC; -def ADDQ_S_PH : ADDQ_S_PH_ENC, ADDQ_S_PH_DESC; -def SUBQ_PH : SUBQ_PH_ENC, SUBQ_PH_DESC; -def SUBQ_S_PH : SUBQ_S_PH_ENC, SUBQ_S_PH_DESC; -def ADDQ_S_W : ADDQ_S_W_ENC, ADDQ_S_W_DESC; -def SUBQ_S_W : SUBQ_S_W_ENC, SUBQ_S_W_DESC; -def ADDSC : ADDSC_ENC, ADDSC_DESC; -def ADDWC : ADDWC_ENC, ADDWC_DESC; +def ADDU_QB : DspMMRel, ADDU_QB_ENC, ADDU_QB_DESC; +def ADDU_S_QB : DspMMRel, ADDU_S_QB_ENC, ADDU_S_QB_DESC; +def SUBU_QB : DspMMRel, SUBU_QB_ENC, SUBU_QB_DESC; +def SUBU_S_QB : DspMMRel, SUBU_S_QB_ENC, SUBU_S_QB_DESC; +def ADDQ_PH : DspMMRel, ADDQ_PH_ENC, ADDQ_PH_DESC; +def ADDQ_S_PH : DspMMRel, ADDQ_S_PH_ENC, ADDQ_S_PH_DESC; +def SUBQ_PH : DspMMRel, SUBQ_PH_ENC, SUBQ_PH_DESC; +def SUBQ_S_PH : DspMMRel, SUBQ_S_PH_ENC, SUBQ_S_PH_DESC; +def ADDQ_S_W : DspMMRel, ADDQ_S_W_ENC, ADDQ_S_W_DESC; +def SUBQ_S_W : DspMMRel, SUBQ_S_W_ENC, SUBQ_S_W_DESC; +def ADDSC : DspMMRel, ADDSC_ENC, ADDSC_DESC; +def ADDWC : DspMMRel, ADDWC_ENC, ADDWC_DESC; def MODSUB : MODSUB_ENC, MODSUB_DESC; -def RADDU_W_QB : RADDU_W_QB_ENC, RADDU_W_QB_DESC; -def ABSQ_S_PH : ABSQ_S_PH_ENC, ABSQ_S_PH_DESC; -def ABSQ_S_W : ABSQ_S_W_ENC, ABSQ_S_W_DESC; -def PRECRQ_QB_PH : PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC; -def PRECRQ_PH_W : PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC; -def PRECRQ_RS_PH_W : PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC; -def PRECRQU_S_QB_PH : PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC; -def PRECEQ_W_PHL : PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC; -def PRECEQ_W_PHR : PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC; -def PRECEQU_PH_QBL : PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC; -def PRECEQU_PH_QBR : PRECEQU_PH_QBR_ENC, PRECEQU_PH_QBR_DESC; -def PRECEQU_PH_QBLA : PRECEQU_PH_QBLA_ENC, PRECEQU_PH_QBLA_DESC; -def PRECEQU_PH_QBRA : PRECEQU_PH_QBRA_ENC, PRECEQU_PH_QBRA_DESC; -def PRECEU_PH_QBL : PRECEU_PH_QBL_ENC, PRECEU_PH_QBL_DESC; -def PRECEU_PH_QBR : PRECEU_PH_QBR_ENC, PRECEU_PH_QBR_DESC; -def PRECEU_PH_QBLA : PRECEU_PH_QBLA_ENC, PRECEU_PH_QBLA_DESC; -def PRECEU_PH_QBRA : PRECEU_PH_QBRA_ENC, PRECEU_PH_QBRA_DESC; -def SHLL_QB : SHLL_QB_ENC, SHLL_QB_DESC; -def SHLLV_QB : SHLLV_QB_ENC, SHLLV_QB_DESC; -def SHRL_QB : SHRL_QB_ENC, SHRL_QB_DESC; -def SHRLV_QB : SHRLV_QB_ENC, SHRLV_QB_DESC; -def SHLL_PH : SHLL_PH_ENC, SHLL_PH_DESC; -def SHLLV_PH : SHLLV_PH_ENC, SHLLV_PH_DESC; -def SHLL_S_PH : SHLL_S_PH_ENC, SHLL_S_PH_DESC; -def SHLLV_S_PH : SHLLV_S_PH_ENC, SHLLV_S_PH_DESC; -def SHRA_PH : SHRA_PH_ENC, SHRA_PH_DESC; -def SHRAV_PH : SHRAV_PH_ENC, SHRAV_PH_DESC; -def SHRA_R_PH : SHRA_R_PH_ENC, SHRA_R_PH_DESC; -def SHRAV_R_PH : SHRAV_R_PH_ENC, SHRAV_R_PH_DESC; -def SHLL_S_W : SHLL_S_W_ENC, SHLL_S_W_DESC; -def SHLLV_S_W : SHLLV_S_W_ENC, SHLLV_S_W_DESC; -def SHRA_R_W : SHRA_R_W_ENC, SHRA_R_W_DESC; -def SHRAV_R_W : SHRAV_R_W_ENC, SHRAV_R_W_DESC; -def MULEU_S_PH_QBL : MULEU_S_PH_QBL_ENC, MULEU_S_PH_QBL_DESC; -def MULEU_S_PH_QBR : MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC; -def MULEQ_S_W_PHL : MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC; -def MULEQ_S_W_PHR : MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC; -def MULQ_RS_PH : MULQ_RS_PH_ENC, MULQ_RS_PH_DESC; +def RADDU_W_QB : DspMMRel, RADDU_W_QB_ENC, RADDU_W_QB_DESC; +def ABSQ_S_PH : DspMMRel, ABSQ_S_PH_ENC, ABSQ_S_PH_DESC; +def ABSQ_S_W : DspMMRel, ABSQ_S_W_ENC, ABSQ_S_W_DESC; +def PRECRQ_QB_PH : DspMMRel, PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC; +def PRECRQ_PH_W : DspMMRel, PRECRQ_PH_W_ENC, 
PRECRQ_PH_W_DESC; +def PRECRQ_RS_PH_W : DspMMRel, PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC; +def PRECRQU_S_QB_PH : DspMMRel, PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC; +def PRECEQ_W_PHL : DspMMRel, PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC; +def PRECEQ_W_PHR : DspMMRel, PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC; +def PRECEQU_PH_QBL : DspMMRel, PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC; +def PRECEQU_PH_QBR : DspMMRel, PRECEQU_PH_QBR_ENC, PRECEQU_PH_QBR_DESC; +def PRECEQU_PH_QBLA : DspMMRel, PRECEQU_PH_QBLA_ENC, PRECEQU_PH_QBLA_DESC; +def PRECEQU_PH_QBRA : DspMMRel, PRECEQU_PH_QBRA_ENC, PRECEQU_PH_QBRA_DESC; +def PRECEU_PH_QBL : DspMMRel, PRECEU_PH_QBL_ENC, PRECEU_PH_QBL_DESC; +def PRECEU_PH_QBR : DspMMRel, PRECEU_PH_QBR_ENC, PRECEU_PH_QBR_DESC; +def PRECEU_PH_QBLA : DspMMRel, PRECEU_PH_QBLA_ENC, PRECEU_PH_QBLA_DESC; +def PRECEU_PH_QBRA : DspMMRel, PRECEU_PH_QBRA_ENC, PRECEU_PH_QBRA_DESC; +def SHLL_QB : DspMMRel, SHLL_QB_ENC, SHLL_QB_DESC; +def SHLLV_QB : DspMMRel, SHLLV_QB_ENC, SHLLV_QB_DESC; +def SHRL_QB : DspMMRel, SHRL_QB_ENC, SHRL_QB_DESC; +def SHRLV_QB : DspMMRel, SHRLV_QB_ENC, SHRLV_QB_DESC; +def SHLL_PH : DspMMRel, SHLL_PH_ENC, SHLL_PH_DESC; +def SHLLV_PH : DspMMRel, SHLLV_PH_ENC, SHLLV_PH_DESC; +def SHLL_S_PH : DspMMRel, SHLL_S_PH_ENC, SHLL_S_PH_DESC; +def SHLLV_S_PH : DspMMRel, SHLLV_S_PH_ENC, SHLLV_S_PH_DESC; +def SHRA_PH : DspMMRel, SHRA_PH_ENC, SHRA_PH_DESC; +def SHRAV_PH : DspMMRel, SHRAV_PH_ENC, SHRAV_PH_DESC; +def SHRA_R_PH : DspMMRel, SHRA_R_PH_ENC, SHRA_R_PH_DESC; +def SHRAV_R_PH : DspMMRel, SHRAV_R_PH_ENC, SHRAV_R_PH_DESC; +def SHLL_S_W : DspMMRel, SHLL_S_W_ENC, SHLL_S_W_DESC; +def SHLLV_S_W : DspMMRel, SHLLV_S_W_ENC, SHLLV_S_W_DESC; +def SHRA_R_W : DspMMRel, SHRA_R_W_ENC, SHRA_R_W_DESC; +def SHRAV_R_W : DspMMRel, SHRAV_R_W_ENC, SHRAV_R_W_DESC; +def MULEU_S_PH_QBL : DspMMRel, MULEU_S_PH_QBL_ENC, MULEU_S_PH_QBL_DESC; +def MULEU_S_PH_QBR : DspMMRel, MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC; +def MULEQ_S_W_PHL : DspMMRel, MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC; +def MULEQ_S_W_PHR : DspMMRel, MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC; +def MULQ_RS_PH : DspMMRel, MULQ_RS_PH_ENC, MULQ_RS_PH_DESC; def MULSAQ_S_W_PH : MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC; -def MAQ_S_W_PHL : MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC; -def MAQ_S_W_PHR : MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC; -def MAQ_SA_W_PHL : MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC; -def MAQ_SA_W_PHR : MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC; -def MFHI_DSP : MFHI_ENC, MFHI_DESC; -def MFLO_DSP : MFLO_ENC, MFLO_DESC; -def MTHI_DSP : MTHI_ENC, MTHI_DESC; -def MTLO_DSP : MTLO_ENC, MTLO_DESC; -def DPAU_H_QBL : DPAU_H_QBL_ENC, DPAU_H_QBL_DESC; -def DPAU_H_QBR : DPAU_H_QBR_ENC, DPAU_H_QBR_DESC; -def DPSU_H_QBL : DPSU_H_QBL_ENC, DPSU_H_QBL_DESC; -def DPSU_H_QBR : DPSU_H_QBR_ENC, DPSU_H_QBR_DESC; -def DPAQ_S_W_PH : DPAQ_S_W_PH_ENC, DPAQ_S_W_PH_DESC; -def DPSQ_S_W_PH : DPSQ_S_W_PH_ENC, DPSQ_S_W_PH_DESC; -def DPAQ_SA_L_W : DPAQ_SA_L_W_ENC, DPAQ_SA_L_W_DESC; -def DPSQ_SA_L_W : DPSQ_SA_L_W_ENC, DPSQ_SA_L_W_DESC; -def MULT_DSP : MULT_DSP_ENC, MULT_DSP_DESC; -def MULTU_DSP : MULTU_DSP_ENC, MULTU_DSP_DESC; -def MADD_DSP : MADD_DSP_ENC, MADD_DSP_DESC; -def MADDU_DSP : MADDU_DSP_ENC, MADDU_DSP_DESC; -def MSUB_DSP : MSUB_DSP_ENC, MSUB_DSP_DESC; -def MSUBU_DSP : MSUBU_DSP_ENC, MSUBU_DSP_DESC; +def MAQ_S_W_PHL : DspMMRel, MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC; +def MAQ_S_W_PHR : DspMMRel, MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC; +def MAQ_SA_W_PHL : DspMMRel, MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC; +def MAQ_SA_W_PHR : DspMMRel, MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC; +def MFHI_DSP : DspMMRel, MFHI_ENC, 
MFHI_DESC; +def MFLO_DSP : DspMMRel, MFLO_ENC, MFLO_DESC; +def MTHI_DSP : DspMMRel, MTHI_ENC, MTHI_DESC; +def MTLO_DSP : DspMMRel, MTLO_ENC, MTLO_DESC; +def DPAU_H_QBL : DspMMRel, DPAU_H_QBL_ENC, DPAU_H_QBL_DESC; +def DPAU_H_QBR : DspMMRel, DPAU_H_QBR_ENC, DPAU_H_QBR_DESC; +def DPSU_H_QBL : DspMMRel, DPSU_H_QBL_ENC, DPSU_H_QBL_DESC; +def DPSU_H_QBR : DspMMRel, DPSU_H_QBR_ENC, DPSU_H_QBR_DESC; +def DPAQ_S_W_PH : DspMMRel, DPAQ_S_W_PH_ENC, DPAQ_S_W_PH_DESC; +def DPSQ_S_W_PH : DspMMRel, DPSQ_S_W_PH_ENC, DPSQ_S_W_PH_DESC; +def DPAQ_SA_L_W : DspMMRel, DPAQ_SA_L_W_ENC, DPAQ_SA_L_W_DESC; +def DPSQ_SA_L_W : DspMMRel, DPSQ_SA_L_W_ENC, DPSQ_SA_L_W_DESC; +def MULT_DSP : DspMMRel, MULT_DSP_ENC, MULT_DSP_DESC; +def MULTU_DSP : DspMMRel, MULTU_DSP_ENC, MULTU_DSP_DESC; +def MADD_DSP : DspMMRel, MADD_DSP_ENC, MADD_DSP_DESC; +def MADDU_DSP : DspMMRel, MADDU_DSP_ENC, MADDU_DSP_DESC; +def MSUB_DSP : DspMMRel, MSUB_DSP_ENC, MSUB_DSP_DESC; +def MSUBU_DSP : DspMMRel, MSUBU_DSP_ENC, MSUBU_DSP_DESC; def CMPU_EQ_QB : CMPU_EQ_QB_ENC, CMPU_EQ_QB_DESC; def CMPU_LT_QB : CMPU_LT_QB_ENC, CMPU_LT_QB_DESC; def CMPU_LE_QB : CMPU_LE_QB_ENC, CMPU_LE_QB_DESC; @@ -1156,87 +1187,85 @@ def CMP_EQ_PH : CMP_EQ_PH_ENC, CMP_EQ_PH_DESC; def CMP_LT_PH : CMP_LT_PH_ENC, CMP_LT_PH_DESC; def CMP_LE_PH : CMP_LE_PH_ENC, CMP_LE_PH_DESC; def BITREV : BITREV_ENC, BITREV_DESC; -def PACKRL_PH : PACKRL_PH_ENC, PACKRL_PH_DESC; -def REPL_QB : REPL_QB_ENC, REPL_QB_DESC; -def REPL_PH : REPL_PH_ENC, REPL_PH_DESC; -def REPLV_QB : REPLV_QB_ENC, REPLV_QB_DESC; -def REPLV_PH : REPLV_PH_ENC, REPLV_PH_DESC; -def PICK_QB : PICK_QB_ENC, PICK_QB_DESC; -def PICK_PH : PICK_PH_ENC, PICK_PH_DESC; -def LWX : LWX_ENC, LWX_DESC; -def LHX : LHX_ENC, LHX_DESC; -def LBUX : LBUX_ENC, LBUX_DESC; +def PACKRL_PH : DspMMRel, PACKRL_PH_ENC, PACKRL_PH_DESC; +def REPL_QB : DspMMRel, REPL_QB_ENC, REPL_QB_DESC; +def REPL_PH : DspMMRel, REPL_PH_ENC, REPL_PH_DESC; +def REPLV_QB : DspMMRel, REPLV_QB_ENC, REPLV_QB_DESC; +def REPLV_PH : DspMMRel, REPLV_PH_ENC, REPLV_PH_DESC; +def PICK_QB : DspMMRel, PICK_QB_ENC, PICK_QB_DESC; +def PICK_PH : DspMMRel, PICK_PH_ENC, PICK_PH_DESC; +def LWX : DspMMRel, LWX_ENC, LWX_DESC; +def LHX : DspMMRel, LHX_ENC, LHX_DESC; +def LBUX : DspMMRel, LBUX_ENC, LBUX_DESC; def BPOSGE32 : BPOSGE32_ENC, BPOSGE32_DESC; -def INSV : INSV_ENC, INSV_DESC; -def EXTP : EXTP_ENC, EXTP_DESC; -def EXTPV : EXTPV_ENC, EXTPV_DESC; -def EXTPDP : EXTPDP_ENC, EXTPDP_DESC; -def EXTPDPV : EXTPDPV_ENC, EXTPDPV_DESC; -def EXTR_W : EXTR_W_ENC, EXTR_W_DESC; -def EXTRV_W : EXTRV_W_ENC, EXTRV_W_DESC; -def EXTR_R_W : EXTR_R_W_ENC, EXTR_R_W_DESC; -def EXTRV_R_W : EXTRV_R_W_ENC, EXTRV_R_W_DESC; -def EXTR_RS_W : EXTR_RS_W_ENC, EXTR_RS_W_DESC; -def EXTRV_RS_W : EXTRV_RS_W_ENC, EXTRV_RS_W_DESC; -def EXTR_S_H : EXTR_S_H_ENC, EXTR_S_H_DESC; -def EXTRV_S_H : EXTRV_S_H_ENC, EXTRV_S_H_DESC; -def SHILO : SHILO_ENC, SHILO_DESC; -def SHILOV : SHILOV_ENC, SHILOV_DESC; -def MTHLIP : MTHLIP_ENC, MTHLIP_DESC; -def RDDSP : RDDSP_ENC, RDDSP_DESC; -def WRDSP : WRDSP_ENC, WRDSP_DESC; +def INSV : DspMMRel, INSV_ENC, INSV_DESC; +def EXTP : DspMMRel, EXTP_ENC, EXTP_DESC; +def EXTPV : DspMMRel, EXTPV_ENC, EXTPV_DESC; +def EXTPDP : DspMMRel, EXTPDP_ENC, EXTPDP_DESC; +def EXTPDPV : DspMMRel, EXTPDPV_ENC, EXTPDPV_DESC; +def EXTR_W : DspMMRel, EXTR_W_ENC, EXTR_W_DESC; +def EXTRV_W : DspMMRel, EXTRV_W_ENC, EXTRV_W_DESC; +def EXTR_R_W : DspMMRel, EXTR_R_W_ENC, EXTR_R_W_DESC; +def EXTRV_R_W : DspMMRel, EXTRV_R_W_ENC, EXTRV_R_W_DESC; +def EXTR_RS_W : DspMMRel, EXTR_RS_W_ENC, EXTR_RS_W_DESC; +def EXTRV_RS_W : DspMMRel, 
EXTRV_RS_W_ENC, EXTRV_RS_W_DESC; +def EXTR_S_H : DspMMRel, EXTR_S_H_ENC, EXTR_S_H_DESC; +def EXTRV_S_H : DspMMRel, EXTRV_S_H_ENC, EXTRV_S_H_DESC; +def SHILO : DspMMRel, SHILO_ENC, SHILO_DESC; +def SHILOV : DspMMRel, SHILOV_ENC, SHILOV_DESC; +def MTHLIP : DspMMRel, MTHLIP_ENC, MTHLIP_DESC; +def RDDSP : DspMMRel, RDDSP_ENC, RDDSP_DESC; +let AdditionalPredicates = [NotInMicroMips] in { + def WRDSP : WRDSP_ENC, WRDSP_DESC; +} // MIPS DSP Rev 2 -let Predicates = [HasDSPR2] in { - -def ADDU_PH : ADDU_PH_ENC, ADDU_PH_DESC; -def ADDU_S_PH : ADDU_S_PH_ENC, ADDU_S_PH_DESC; -def SUBU_PH : SUBU_PH_ENC, SUBU_PH_DESC; -def SUBU_S_PH : SUBU_S_PH_ENC, SUBU_S_PH_DESC; -def CMPGDU_EQ_QB : CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC; -def CMPGDU_LT_QB : CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC; -def CMPGDU_LE_QB : CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC; -def ABSQ_S_QB : ABSQ_S_QB_ENC, ABSQ_S_QB_DESC; -def ADDUH_QB : ADDUH_QB_ENC, ADDUH_QB_DESC; -def ADDUH_R_QB : ADDUH_R_QB_ENC, ADDUH_R_QB_DESC; -def SUBUH_QB : SUBUH_QB_ENC, SUBUH_QB_DESC; -def SUBUH_R_QB : SUBUH_R_QB_ENC, SUBUH_R_QB_DESC; -def ADDQH_PH : ADDQH_PH_ENC, ADDQH_PH_DESC; -def ADDQH_R_PH : ADDQH_R_PH_ENC, ADDQH_R_PH_DESC; -def SUBQH_PH : SUBQH_PH_ENC, SUBQH_PH_DESC; -def SUBQH_R_PH : SUBQH_R_PH_ENC, SUBQH_R_PH_DESC; -def ADDQH_W : ADDQH_W_ENC, ADDQH_W_DESC; -def ADDQH_R_W : ADDQH_R_W_ENC, ADDQH_R_W_DESC; -def SUBQH_W : SUBQH_W_ENC, SUBQH_W_DESC; -def SUBQH_R_W : SUBQH_R_W_ENC, SUBQH_R_W_DESC; -def MUL_PH : MUL_PH_ENC, MUL_PH_DESC; -def MUL_S_PH : MUL_S_PH_ENC, MUL_S_PH_DESC; -def MULQ_S_W : MULQ_S_W_ENC, MULQ_S_W_DESC; -def MULQ_RS_W : MULQ_RS_W_ENC, MULQ_RS_W_DESC; -def MULQ_S_PH : MULQ_S_PH_ENC, MULQ_S_PH_DESC; -def DPA_W_PH : DPA_W_PH_ENC, DPA_W_PH_DESC; -def DPS_W_PH : DPS_W_PH_ENC, DPS_W_PH_DESC; -def DPAQX_S_W_PH : DPAQX_S_W_PH_ENC, DPAQX_S_W_PH_DESC; -def DPAQX_SA_W_PH : DPAQX_SA_W_PH_ENC, DPAQX_SA_W_PH_DESC; -def DPAX_W_PH : DPAX_W_PH_ENC, DPAX_W_PH_DESC; -def DPSX_W_PH : DPSX_W_PH_ENC, DPSX_W_PH_DESC; -def DPSQX_S_W_PH : DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC; -def DPSQX_SA_W_PH : DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC; -def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC; -def PRECR_QB_PH : PRECR_QB_PH_ENC, PRECR_QB_PH_DESC; -def PRECR_SRA_PH_W : PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC; -def PRECR_SRA_R_PH_W : PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC; -def SHRA_QB : SHRA_QB_ENC, SHRA_QB_DESC; -def SHRAV_QB : SHRAV_QB_ENC, SHRAV_QB_DESC; -def SHRA_R_QB : SHRA_R_QB_ENC, SHRA_R_QB_DESC; -def SHRAV_R_QB : SHRAV_R_QB_ENC, SHRAV_R_QB_DESC; -def SHRL_PH : SHRL_PH_ENC, SHRL_PH_DESC; -def SHRLV_PH : SHRLV_PH_ENC, SHRLV_PH_DESC; -def APPEND : APPEND_ENC, APPEND_DESC; -def BALIGN : BALIGN_ENC, BALIGN_DESC; -def PREPEND : PREPEND_ENC, PREPEND_DESC; - -} +def ADDU_PH : DspMMRel, ADDU_PH_ENC, ADDU_PH_DESC, ISA_DSPR2; +def ADDU_S_PH : DspMMRel, ADDU_S_PH_ENC, ADDU_S_PH_DESC, ISA_DSPR2; +def SUBU_PH : DspMMRel, SUBU_PH_ENC, SUBU_PH_DESC, ISA_DSPR2; +def SUBU_S_PH : DspMMRel, SUBU_S_PH_ENC, SUBU_S_PH_DESC, ISA_DSPR2; +def CMPGDU_EQ_QB : CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC, ISA_DSPR2; +def CMPGDU_LT_QB : CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC, ISA_DSPR2; +def CMPGDU_LE_QB : CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC, ISA_DSPR2; +def ABSQ_S_QB : DspMMRel, ABSQ_S_QB_ENC, ABSQ_S_QB_DESC, ISA_DSPR2; +def ADDUH_QB : DspMMRel, ADDUH_QB_ENC, ADDUH_QB_DESC, ISA_DSPR2; +def ADDUH_R_QB : DspMMRel, ADDUH_R_QB_ENC, ADDUH_R_QB_DESC, ISA_DSPR2; +def SUBUH_QB : DspMMRel, SUBUH_QB_ENC, SUBUH_QB_DESC, ISA_DSPR2; +def SUBUH_R_QB : DspMMRel, SUBUH_R_QB_ENC, SUBUH_R_QB_DESC, ISA_DSPR2; +def ADDQH_PH : 
DspMMRel, ADDQH_PH_ENC, ADDQH_PH_DESC, ISA_DSPR2; +def ADDQH_R_PH : DspMMRel, ADDQH_R_PH_ENC, ADDQH_R_PH_DESC, ISA_DSPR2; +def SUBQH_PH : DspMMRel, SUBQH_PH_ENC, SUBQH_PH_DESC, ISA_DSPR2; +def SUBQH_R_PH : DspMMRel, SUBQH_R_PH_ENC, SUBQH_R_PH_DESC, ISA_DSPR2; +def ADDQH_W : DspMMRel, ADDQH_W_ENC, ADDQH_W_DESC, ISA_DSPR2; +def ADDQH_R_W : DspMMRel, ADDQH_R_W_ENC, ADDQH_R_W_DESC, ISA_DSPR2; +def SUBQH_W : DspMMRel, SUBQH_W_ENC, SUBQH_W_DESC, ISA_DSPR2; +def SUBQH_R_W : DspMMRel, SUBQH_R_W_ENC, SUBQH_R_W_DESC, ISA_DSPR2; +def MUL_PH : DspMMRel, MUL_PH_ENC, MUL_PH_DESC, ISA_DSPR2; +def MUL_S_PH : DspMMRel, MUL_S_PH_ENC, MUL_S_PH_DESC, ISA_DSPR2; +def MULQ_S_W : DspMMRel, MULQ_S_W_ENC, MULQ_S_W_DESC, ISA_DSPR2; +def MULQ_RS_W : DspMMRel, MULQ_RS_W_ENC, MULQ_RS_W_DESC, ISA_DSPR2; +def MULQ_S_PH : DspMMRel, MULQ_S_PH_ENC, MULQ_S_PH_DESC, ISA_DSPR2; +def DPA_W_PH : DspMMRel, DPA_W_PH_ENC, DPA_W_PH_DESC, ISA_DSPR2; +def DPS_W_PH : DspMMRel, DPS_W_PH_ENC, DPS_W_PH_DESC, ISA_DSPR2; +def DPAQX_S_W_PH : DspMMRel, DPAQX_S_W_PH_ENC, DPAQX_S_W_PH_DESC, ISA_DSPR2; +def DPAQX_SA_W_PH : DspMMRel, DPAQX_SA_W_PH_ENC, DPAQX_SA_W_PH_DESC, ISA_DSPR2; +def DPAX_W_PH : DspMMRel, DPAX_W_PH_ENC, DPAX_W_PH_DESC, ISA_DSPR2; +def DPSX_W_PH : DspMMRel, DPSX_W_PH_ENC, DPSX_W_PH_DESC, ISA_DSPR2; +def DPSQX_S_W_PH : DspMMRel, DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC, ISA_DSPR2; +def DPSQX_SA_W_PH : DspMMRel, DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC, ISA_DSPR2; +def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC, ISA_DSPR2; +def PRECR_QB_PH : DspMMRel, PRECR_QB_PH_ENC, PRECR_QB_PH_DESC, ISA_DSPR2; +def PRECR_SRA_PH_W : DspMMRel, PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC, ISA_DSPR2; +def PRECR_SRA_R_PH_W : DspMMRel, PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC, ISA_DSPR2; +def SHRA_QB : DspMMRel, SHRA_QB_ENC, SHRA_QB_DESC, ISA_DSPR2; +def SHRAV_QB : DspMMRel, SHRAV_QB_ENC, SHRAV_QB_DESC, ISA_DSPR2; +def SHRA_R_QB : DspMMRel, SHRA_R_QB_ENC, SHRA_R_QB_DESC, ISA_DSPR2; +def SHRAV_R_QB : DspMMRel, SHRAV_R_QB_ENC, SHRAV_R_QB_DESC, ISA_DSPR2; +def SHRL_PH : DspMMRel, SHRL_PH_ENC, SHRL_PH_DESC, ISA_DSPR2; +def SHRLV_PH : DspMMRel, SHRLV_PH_ENC, SHRLV_PH_DESC, ISA_DSPR2; +def APPEND : APPEND_ENC, APPEND_DESC, ISA_DSPR2; +def BALIGN : BALIGN_ENC, BALIGN_DESC, ISA_DSPR2; +def PREPEND : DspMMRel, PREPEND_ENC, PREPEND_DESC, ISA_DSPR2; // Pseudos. let isPseudo = 1, isCodeGenOnly = 1 in { @@ -1415,3 +1444,8 @@ let AddedComplexity = 20 in { def : IndexedLoadPat<sextloadi16, LHX>; def : IndexedLoadPat<load, LWX>; } + +// Instruction alias. 
+let AdditionalPredicates = [NotInMicroMips] in { + def : DSPInstAlias<"wrdsp $rt", (WRDSP GPR32Opnd:$rt, 0x1F), 1>; +} diff --git a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp index 4faeb33..8313d90 100644 --- a/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -355,9 +355,8 @@ void RegDefsUses::addLiveOut(const MachineBasicBlock &MBB, for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) if (*SI != &SuccBB) - for (MachineBasicBlock::livein_iterator LI = (*SI)->livein_begin(), - LE = (*SI)->livein_end(); LI != LE; ++LI) - Uses.set(*LI); + for (const auto &LI : (*SI)->liveins()) + Uses.set(LI.PhysReg); } bool RegDefsUses::update(const MachineInstr &MI, unsigned Begin, unsigned End) { @@ -431,7 +430,7 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) { (*MI.memoperands_begin())->getPseudoValue()) { if (isa<FixedStackPseudoSourceValue>(PSV)) return false; - return !PSV->isConstant(nullptr) && PSV != PseudoSourceValue::getStack(); + return !PSV->isConstant(nullptr) && !PSV->isStack(); } return true; @@ -598,7 +597,7 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { // Get instruction with delay slot. MachineBasicBlock::instr_iterator DSI(I); - if (InMicroMipsMode && TII->GetInstSizeInBytes(std::next(DSI)) == 2 && + if (InMicroMipsMode && TII->GetInstSizeInBytes(&*std::next(DSI)) == 2 && DSI->isCall()) { // If instruction in delay slot is 16b change opcode to // corresponding instruction with short delay slot. @@ -713,8 +712,9 @@ bool Filler::searchBackward(MachineBasicBlock &MBB, Iter Slot) const { if (DisableBackwardSearch) return false; - RegDefsUses RegDU(*MBB.getParent()->getSubtarget().getRegisterInfo()); - MemDefsUses MemDU(*TM.getDataLayout(), MBB.getParent()->getFrameInfo()); + auto *Fn = MBB.getParent(); + RegDefsUses RegDU(*Fn->getSubtarget().getRegisterInfo()); + MemDefsUses MemDU(Fn->getDataLayout(), Fn->getFrameInfo()); ReverseIter Filler; RegDU.init(*Slot); @@ -763,6 +763,7 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const { BB2BrMap BrMap; std::unique_ptr<InspectMemInstr> IM; Iter Filler; + auto *Fn = MBB.getParent(); // Iterate over SuccBB's predecessor list. for (MachineBasicBlock::pred_iterator PI = SuccBB->pred_begin(), @@ -772,15 +773,15 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const { // Do not allow moving instructions which have unallocatable register operands // across basic block boundaries. - RegDU.setUnallocatableRegs(*MBB.getParent()); + RegDU.setUnallocatableRegs(*Fn); // Only allow moving loads from stack or constants if any of the SuccBB's // predecessors have multiple successors. if (HasMultipleSuccs) { IM.reset(new LoadFromStackOrConst()); } else { - const MachineFrameInfo *MFI = MBB.getParent()->getFrameInfo(); - IM.reset(new MemDefsUses(*TM.getDataLayout(), MFI)); + const MachineFrameInfo *MFI = Fn->getFrameInfo(); + IM.reset(new MemDefsUses(Fn->getDataLayout(), MFI)); } if (!searchRange(MBB, SuccBB->begin(), SuccBB->end(), RegDU, *IM, Slot, @@ -800,12 +801,13 @@ MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const { // Select the successor with the larget edge weight. 
auto &Prob = getAnalysis<MachineBranchProbabilityInfo>(); - MachineBasicBlock *S = *std::max_element(B.succ_begin(), B.succ_end(), - [&](const MachineBasicBlock *Dst0, - const MachineBasicBlock *Dst1) { - return Prob.getEdgeWeight(&B, Dst0) < Prob.getEdgeWeight(&B, Dst1); - }); - return S->isLandingPad() ? nullptr : S; + MachineBasicBlock *S = *std::max_element( + B.succ_begin(), B.succ_end(), + [&](const MachineBasicBlock *Dst0, const MachineBasicBlock *Dst1) { + return Prob.getEdgeProbability(&B, Dst0) < + Prob.getEdgeProbability(&B, Dst1); + }); + return S->isEHPad() ? nullptr : S; } std::pair<MipsInstrInfo::BranchType, MachineInstr *> diff --git a/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td new file mode 100644 index 0000000..11e191a --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MipsEVAInstrFormats.td @@ -0,0 +1,84 @@ +//===- MipsEVAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes Mips32r6 instruction formats. +// +//===----------------------------------------------------------------------===// + +class MipsEVAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, + PredicateControl, StdArch { + let DecoderNamespace = "Mips"; + let EncodingPredicates = [HasStdEnc]; +} + +//===----------------------------------------------------------------------===// +// +// Field Values +// +//===----------------------------------------------------------------------===// + +// Memory Load/Store EVA +def OPCODE6_LBE : OPCODE6<0b101100>; +def OPCODE6_LBuE : OPCODE6<0b101000>; +def OPCODE6_LHE : OPCODE6<0b101101>; +def OPCODE6_LHuE : OPCODE6<0b101001>; +def OPCODE6_LWE : OPCODE6<0b101111>; + +def OPCODE6_SBE : OPCODE6<0b011100>; +def OPCODE6_SHE : OPCODE6<0b011101>; +def OPCODE6_SWE : OPCODE6<0b011111>; + +// load/store left/right EVA +def OPCODE6_LWLE : OPCODE6<0b011001>; +def OPCODE6_LWRE : OPCODE6<0b011010>; +def OPCODE6_SWLE : OPCODE6<0b100001>; +def OPCODE6_SWRE : OPCODE6<0b100010>; + +// Load-linked EVA, Store-conditional EVA +def OPCODE6_LLE : OPCODE6<0b101110>; +def OPCODE6_SCE : OPCODE6<0b011110>; + +def OPCODE6_TLBINV : OPCODE6<0b000011>; +def OPCODE6_TLBINVF : OPCODE6<0b000100>; + +def OPCODE6_CACHEE : OPCODE6<0b011011>; +def OPCODE6_PREFE : OPCODE6<0b100011>; + +def OPGROUP_COP0 : OPGROUP<0b010000>; + +//===----------------------------------------------------------------------===// +// +// Encoding Formats +// +//===----------------------------------------------------------------------===// + +class SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6 Operation> : MipsEVAInst { + bits<21> addr; + bits<5> hint; + bits<5> base = addr{20-16}; + bits<9> offset = addr{8-0}; + + bits<32> Inst; + + let Inst{31-26} = OPGROUP_SPECIAL3.Value; + let Inst{25-21} = base; + let Inst{20-16} = hint; + let Inst{15-7} = offset; + let Inst{6} = 0; + let Inst{5-0} = Operation.Value; +} + +class TLB_FM<OPCODE6 Operation> : MipsEVAInst { + bits<32> Inst; + + let Inst{31-26} = OPGROUP_COP0.Value; + let Inst{25} = 1; // CO + let Inst{24-6} = 0; + let Inst{5-0} = Operation.Value; +} diff --git a/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td new file mode 100644 index 0000000..36c9694 --- /dev/null +++ 
b/contrib/llvm/lib/Target/Mips/MipsEVAInstrInfo.td @@ -0,0 +1,192 @@ +//===- MipsEVAInstrInfo.td - EVA ASE instructions -*- tablegen ------------*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes Mips EVA ASE instructions. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// +// Instruction encodings +// +//===----------------------------------------------------------------------===// + +// Memory Load/Store EVA encodings +class LBE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LBE>; +class LBuE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LBuE>; +class LHE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LHE>; +class LHuE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LHuE>; +class LWE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWE>; + +class SBE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SBE>; +class SHE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SHE>; +class SWE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWE>; + +// load/store left/right EVA encodings +class LWLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWLE>; +class LWRE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LWRE>; +class SWLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWLE>; +class SWRE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SWRE>; + +// Load-linked EVA, Store-conditional EVA encodings +class LLE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_LLE>; +class SCE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_SCE>; + +class TLBINV_ENC : TLB_FM<OPCODE6_TLBINV>; +class TLBINVF_ENC : TLB_FM<OPCODE6_TLBINVF>; + +class CACHEE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_CACHEE>; +class PREFE_ENC : SPECIAL3_EVA_LOAD_STORE_FM<OPCODE6_PREFE>; + +//===----------------------------------------------------------------------===// +// +// Instruction descriptions +// +//===----------------------------------------------------------------------===// + +// Memory Load/Store EVA descriptions +class LOAD_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeMemEVA"; + bit canFoldAsLoad = 1; + bit mayLoad = 1; +} + +class LBE_DESC : LOAD_EVA_DESC_BASE<"lbe", GPR32Opnd>; +class LBuE_DESC : LOAD_EVA_DESC_BASE<"lbue", GPR32Opnd>; +class LHE_DESC : LOAD_EVA_DESC_BASE<"lhe", GPR32Opnd>; +class LHuE_DESC : LOAD_EVA_DESC_BASE<"lhue", GPR32Opnd>; +class LWE_DESC : LOAD_EVA_DESC_BASE<"lwe", GPR32Opnd>; + +class STORE_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd, + SDPatternOperator OpNode = null_frag> { + dag OutOperandList = (outs); + dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeMemEVA"; + bit mayStore = 1; +} + +class SBE_DESC : STORE_EVA_DESC_BASE<"sbe", GPR32Opnd>; +class SHE_DESC : STORE_EVA_DESC_BASE<"she", GPR32Opnd>; +class SWE_DESC : STORE_EVA_DESC_BASE<"swe", GPR32Opnd>; + +// Load/Store Left/Right EVA descriptions +class LOAD_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins mem_simm9:$addr, GPROpnd:$src); + string 
AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeMemEVA"; + string Constraints = "$src = $rt"; + bit canFoldAsLoad = 1; +} + +class LWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwle", GPR32Opnd>; +class LWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"lwre", GPR32Opnd>; + +class STORE_LEFT_RIGHT_EVA_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs); + dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeMemEVA"; +} + +class SWLE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swle", GPR32Opnd>; +class SWRE_DESC : LOAD_LEFT_RIGHT_EVA_DESC_BASE<"swre", GPR32Opnd>; + +// Load-linked EVA, Store-conditional EVA descriptions +class LLE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$rt); + dag InOperandList = (ins mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + bit mayLoad = 1; + string DecoderMethod = "DecodeMemEVA"; +} + +class LLE_DESC : LLE_DESC_BASE<"lle", GPR32Opnd>; + +class SCE_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> { + dag OutOperandList = (outs GPROpnd:$dst); + dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr); + string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); + list<dag> Pattern = []; + bit mayStore = 1; + string Constraints = "$rt = $dst"; + string DecoderMethod = "DecodeMemEVA"; +} + +class SCE_DESC : SCE_DESC_BASE<"sce", GPR32Opnd>; + +class TLB_DESC_BASE<string instr_asm> { + dag OutOperandList = (outs); + dag InOperandList = (ins); + string AsmString = instr_asm; + list<dag> Pattern = []; +} + +class TLBINV_DESC : TLB_DESC_BASE<"tlbinv">; +class TLBINVF_DESC : TLB_DESC_BASE<"tlbinvf">; + +class CACHEE_DESC_BASE<string instr_asm, Operand MemOpnd> { + dag OutOperandList = (outs); + dag InOperandList = (ins MemOpnd:$addr, uimm5:$hint); + string AsmString = !strconcat(instr_asm, "\t$hint, $addr"); + list<dag> Pattern = []; + string DecoderMethod = "DecodeCacheeOp_CacheOpR6"; +} + +class CACHEE_DESC : CACHEE_DESC_BASE<"cachee", mem>; +class PREFE_DESC : CACHEE_DESC_BASE<"prefe", mem>; + +//===----------------------------------------------------------------------===// +// +// Instruction definitions +// +//===----------------------------------------------------------------------===// + +/// Load and Store EVA Instructions +def LBE : LBE_ENC, LBE_DESC, INSN_EVA; +def LBuE : LBuE_ENC, LBuE_DESC, INSN_EVA; +def LHE : LHE_ENC, LHE_DESC, INSN_EVA; +def LHuE : LHuE_ENC, LHuE_DESC, INSN_EVA; +let AdditionalPredicates = [NotInMicroMips] in { +def LWE : LWE_ENC, LWE_DESC, INSN_EVA; +} +def SBE : SBE_ENC, SBE_DESC, INSN_EVA; +def SHE : SHE_ENC, SHE_DESC, INSN_EVA; +let AdditionalPredicates = [NotInMicroMips] in { +def SWE : SWE_ENC, SWE_DESC, INSN_EVA; +} + +/// load/store left/right EVA +let AdditionalPredicates = [NotInMicroMips] in { +def LWLE : LWLE_ENC, LWLE_DESC, INSN_EVA_NOT_32R6_64R6; +def LWRE : LWRE_ENC, LWRE_DESC, INSN_EVA_NOT_32R6_64R6; +def SWLE : SWLE_ENC, SWLE_DESC, INSN_EVA_NOT_32R6_64R6; +def SWRE : SWRE_ENC, SWRE_DESC, INSN_EVA_NOT_32R6_64R6; +} + +/// Load-linked EVA, Store-conditional EVA +let AdditionalPredicates = [NotInMicroMips] in { +def LLE : LLE_ENC, LLE_DESC, INSN_EVA; +def SCE : SCE_ENC, SCE_DESC, INSN_EVA; +} + +def TLBINV : TLBINV_ENC, TLBINV_DESC, INSN_EVA; +def TLBINVF : TLBINVF_ENC, TLBINVF_DESC, INSN_EVA; + +def CACHEE : CACHEE_ENC, 
CACHEE_DESC, INSN_EVA; +def PREFE : PREFE_ENC, PREFE_DESC, INSN_EVA; diff --git a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp index 5152a07..e9eaf81 100644 --- a/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -192,10 +192,10 @@ public: TII(*Subtarget->getInstrInfo()), TLI(*Subtarget->getTargetLowering()) { MFI = funcInfo.MF->getInfo<MipsFunctionInfo>(); Context = &funcInfo.Fn->getContext(); + bool ISASupported = !Subtarget->hasMips32r6() && Subtarget->hasMips32(); TargetSupported = - ((TM.getRelocationModel() == Reloc::PIC_) && - ((Subtarget->hasMips32r2() || Subtarget->hasMips32()) && - (static_cast<const MipsTargetMachine &>(TM).getABI().IsO32()))); + ISASupported && (TM.getRelocationModel() == Reloc::PIC_) && + (static_cast<const MipsTargetMachine &>(TM).getABI().IsO32()); UnsupportedFPMode = Subtarget->isFP64bit(); } @@ -236,32 +236,36 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, std::swap(LHS, RHS); unsigned Opc; - if (ISDOpc == ISD::AND) { + switch (ISDOpc) { + case ISD::AND: Opc = Mips::AND; - } else if (ISDOpc == ISD::OR) { + break; + case ISD::OR: Opc = Mips::OR; - } else if (ISDOpc == ISD::XOR) { + break; + case ISD::XOR: Opc = Mips::XOR; - } else + break; + default: llvm_unreachable("unexpected opcode"); + } unsigned LHSReg = getRegForValue(LHS); - unsigned ResultReg = createResultReg(&Mips::GPR32RegClass); - if (!ResultReg) - return 0; - - unsigned RHSReg; if (!LHSReg) return 0; + unsigned RHSReg; if (const auto *C = dyn_cast<ConstantInt>(RHS)) RHSReg = materializeInt(C, MVT::i32); else RHSReg = getRegForValue(RHS); - if (!RHSReg) return 0; + unsigned ResultReg = createResultReg(&Mips::GPR32RegClass); + if (!ResultReg) + return 0; + emitInst(Opc, ResultReg).addReg(LHSReg).addReg(RHSReg); return ResultReg; } @@ -747,7 +751,7 @@ bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr, unsigned Offset = Addr.getOffset(); MachineFrameInfo &MFI = *MF->getFrameInfo(); MachineMemOperand *MMO = MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad, + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addFrameIndex(FI) @@ -798,7 +802,7 @@ bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr, unsigned Offset = Addr.getOffset(); MachineFrameInfo &MFI = *MF->getFrameInfo(); MachineMemOperand *MMO = MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad, + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)) .addReg(SrcReg) @@ -912,8 +916,7 @@ bool MipsFastISel::selectBranch(const Instruction *I) { BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::BGTZ)) .addReg(CondReg) .addMBB(TBB); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } return false; @@ -1057,22 +1060,16 @@ bool MipsFastISel::selectFPToInt(const Instruction *I, bool IsSigned) { // entirely within FPRs. unsigned DestReg = createResultReg(&Mips::GPR32RegClass); unsigned TempReg = createResultReg(&Mips::FGR32RegClass); - unsigned Opc; - - if (SrcVT == MVT::f32) - Opc = Mips::TRUNC_W_S; - else - Opc = Mips::TRUNC_W_D32; + unsigned Opc = (SrcVT == MVT::f32) ? 
Mips::TRUNC_W_S : Mips::TRUNC_W_D32; // Generate the convert. emitInst(Opc, TempReg).addReg(SrcReg); - emitInst(Mips::MFC1, DestReg).addReg(TempReg); updateValueMap(I, DestReg); return true; } -// + bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, SmallVectorImpl<MVT> &OutVTs, unsigned &NumBytes) { @@ -1196,7 +1193,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getStack(Addr.getOffset()), + MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()), MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); (void)(MMO); // if (!emitStore(ArgVT, ArgReg, Addr, MMO)) @@ -1607,19 +1604,23 @@ bool MipsFastISel::emitIntSExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool MipsFastISel::emitIntZExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, unsigned DestReg) { + int64_t Imm; + switch (SrcVT.SimpleTy) { default: return false; case MVT::i1: - emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(1); + Imm = 1; break; case MVT::i8: - emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(0xff); + Imm = 0xff; break; case MVT::i16: - emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(0xffff); + Imm = 0xffff; break; } + + emitInst(Mips::ANDi, DestReg).addReg(SrcReg).addImm(Imm); return true; } diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp index fab2fdf..6756c17 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -117,6 +117,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { case MipsISD::GPRel: return "MipsISD::GPRel"; case MipsISD::ThreadPointer: return "MipsISD::ThreadPointer"; case MipsISD::Ret: return "MipsISD::Ret"; + case MipsISD::ERet: return "MipsISD::ERet"; case MipsISD::EH_RETURN: return "MipsISD::EH_RETURN"; case MipsISD::FPBrcond: return "MipsISD::FPBrcond"; case MipsISD::FPCmp: return "MipsISD::FPCmp"; @@ -390,10 +391,10 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); - setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand); - setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); + if (!Subtarget.isGP64bit()) { + setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand); + setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand); + } setInsertFencesForAtomic(true); @@ -437,9 +438,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setStackPointerRegisterToSaveRestore(ABI.IsN64() ? Mips::SP_64 : Mips::SP); - setExceptionPointerRegister(ABI.IsN64() ? Mips::A0_64 : Mips::A0); - setExceptionSelectorRegister(ABI.IsN64() ? 
Mips::A1_64 : Mips::A1); - MaxStoresPerMemcpy = 16; isMicroMips = Subtarget.inMicroMipsMode(); @@ -836,6 +834,14 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return SDValue(); } +bool MipsTargetLowering::isCheapToSpeculateCttz() const { + return Subtarget.hasMips32(); +} + +bool MipsTargetLowering::isCheapToSpeculateCtlz() const { + return Subtarget.hasMips32(); +} + void MipsTargetLowering::LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results, @@ -1092,8 +1098,7 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MF->insert(It, loopMBB); MF->insert(It, exitMBB); @@ -1204,8 +1209,7 @@ MachineBasicBlock *MipsTargetLowering::emitAtomicBinaryPartword( MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MF->insert(It, loopMBB); MF->insert(It, sinkMBB); MF->insert(It, exitMBB); @@ -1330,15 +1334,20 @@ MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI, DebugLoc DL = MI->getDebugLoc(); unsigned LL, SC, ZERO, BNE, BEQ; - if (Size == 4) { - LL = isMicroMips ? Mips::LL_MM : Mips::LL; - SC = isMicroMips ? Mips::SC_MM : Mips::SC; + if (Size == 4) { + if (isMicroMips) { + LL = Mips::LL_MM; + SC = Mips::SC_MM; + } else { + LL = Subtarget.hasMips32r6() ? Mips::LL_R6 : Mips::LL; + SC = Subtarget.hasMips32r6() ? Mips::SC_R6 : Mips::SC; + } ZERO = Mips::ZERO; BNE = Mips::BNE; BEQ = Mips::BEQ; } else { - LL = Mips::LLD; - SC = Mips::SCD; + LL = Subtarget.hasMips64r6() ? Mips::LLD_R6 : Mips::LLD; + SC = Subtarget.hasMips64r6() ? 
Mips::SCD_R6 : Mips::SCD; ZERO = Mips::ZERO_64; BNE = Mips::BNE64; BEQ = Mips::BEQ64; @@ -1356,8 +1365,7 @@ MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MF->insert(It, loop1MBB); MF->insert(It, loop2MBB); MF->insert(It, exitMBB); @@ -1440,8 +1448,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI, MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MF->insert(It, loop1MBB); MF->insert(It, loop2MBB); MF->insert(It, sinkMBB); @@ -1586,9 +1593,10 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Addr = DAG.getNode(ISD::ADD, DL, PTy, Index, Table); EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8); - Addr = DAG.getExtLoad(ISD::SEXTLOAD, DL, PTy, Chain, Addr, - MachinePointerInfo::getJumpTable(), MemVT, false, false, - false, 0); + Addr = + DAG.getExtLoad(ISD::SEXTLOAD, DL, PTy, Chain, Addr, + MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), + MemVT, false, false, false, 0); Chain = Addr.getValue(1); if ((getTargetMachine().getRelocationModel() == Reloc::PIC_) || ABI.IsN64()) { @@ -1690,14 +1698,15 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op, return getAddrLocal(N, SDLoc(N), Ty, DAG, ABI.IsN32() || ABI.IsN64()); if (LargeGOT) - return getAddrGlobalLargeGOT(N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16, - MipsII::MO_GOT_LO16, DAG.getEntryNode(), - MachinePointerInfo::getGOT()); + return getAddrGlobalLargeGOT( + N, SDLoc(N), Ty, DAG, MipsII::MO_GOT_HI16, MipsII::MO_GOT_LO16, + DAG.getEntryNode(), + MachinePointerInfo::getGOT(DAG.getMachineFunction())); - return getAddrGlobal(N, SDLoc(N), Ty, DAG, - (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP - : MipsII::MO_GOT16, - DAG.getEntryNode(), MachinePointerInfo::getGOT()); + return getAddrGlobal( + N, SDLoc(N), Ty, DAG, + (ABI.IsN32() || ABI.IsN64()) ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16, + DAG.getEntryNode(), MachinePointerInfo::getGOT(DAG.getMachineFunction())); } SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op, @@ -1719,6 +1728,9 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const // Local Exec TLS Model. 
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + SDLoc DL(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -1813,7 +1825,8 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const static_cast<const MipsTargetObjectFile *>( getTargetMachine().getObjFileLowering()); - if (TLOF->IsConstantInSmallSection(N->getConstVal(), getTargetMachine())) + if (TLOF->IsConstantInSmallSection(DAG.getDataLayout(), N->getConstVal(), + getTargetMachine())) // %gp_rel relocation return getAddrGPRel(N, SDLoc(N), Ty, DAG); @@ -2946,8 +2959,12 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1); - Function::const_arg_iterator FuncArg = - DAG.getMachineFunction().getFunction()->arg_begin(); + const Function *Func = DAG.getMachineFunction().getFunction(); + Function::const_arg_iterator FuncArg = Func->arg_begin(); + + if (Func->hasFnAttribute("interrupt") && !Func->arg_empty()) + report_fatal_error( + "Functions with the interrupt attribute cannot have arguments!"); CCInfo.AnalyzeFormalArguments(Ins, CC_Mips_FixedArg); MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(), @@ -3019,7 +3036,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // We ought to be able to use LocVT directly but O32 sets it to i32 // when allocating floating point values to integer registers. // This shouldn't influence how we load the value into registers unless - // we are targetting softfloat. + // we are targeting softfloat. if (VA.getValVT().isFloatingPoint() && !Subtarget.useSoftFloat()) LocVT = VA.getValVT(); } @@ -3033,9 +3050,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain, // Create load nodes to retrieve arguments from the stack SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + SDValue ArgValue = DAG.getLoad( + LocVT, DL, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, false, 0); OutChains.push_back(ArgValue.getValue(1)); ArgValue = UnpackFromArgumentSlot(ArgValue, VA, Ins[i].ArgVT, DL, DAG); @@ -3098,8 +3116,20 @@ MipsTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const } SDValue -MipsTargetLowering::LowerReturn(SDValue Chain, - CallingConv::ID CallConv, bool IsVarArg, +MipsTargetLowering::LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, + SDLoc DL, SelectionDAG &DAG) const { + + MachineFunction &MF = DAG.getMachineFunction(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); + + MipsFI->setISR(); + + return DAG.getNode(MipsISD::ERet, DL, MVT::Other, RetOps); +} + +SDValue +MipsTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool IsVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, SDLoc DL, SelectionDAG &DAG) const { @@ -3192,7 +3222,11 @@ MipsTargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - // Return on Mips is always a "jr $ra" + // ISRs must use "eret". 
+ if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt")) + return LowerInterruptReturn(RetOps, DL, DAG); + + // Standard return on Mips is a "jr $ra" return DAG.getNode(MipsISD::Ret, DL, MVT::Other, RetOps); } @@ -3300,7 +3334,7 @@ static std::pair<bool, bool> parsePhysicalReg(StringRef C, StringRef &Prefix, // Search for the first numeric character. StringRef::const_iterator I, B = C.begin() + 1, E = C.end() - 1; - I = std::find_if(B, E, std::ptr_fun(isdigit)); + I = std::find_if(B, E, isdigit); Prefix = StringRef(B, I - B); @@ -3669,7 +3703,7 @@ void MipsTargetLowering::passByValArg( unsigned NumRegs = LastReg - FirstReg; if (NumRegs) { - const ArrayRef<MCPhysReg> ArgRegs = ABI.GetByValArgRegs(); + ArrayRef<MCPhysReg> ArgRegs = ABI.GetByValArgRegs(); bool LeftoverBytes = (NumRegs * RegSizeInBytes > ByValSizeInBytes); unsigned I = 0; @@ -3755,7 +3789,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains, SDValue Chain, SDLoc DL, SelectionDAG &DAG, CCState &State) const { - const ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs(); + ArrayRef<MCPhysReg> ArgRegs = ABI.GetVarArgRegs(); unsigned Idx = State.getFirstUnallocated(ArgRegs); unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes(); MVT RegTy = MVT::getIntegerVT(RegSizeInBytes * 8); @@ -3812,7 +3846,7 @@ void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size, if (State->getCallingConv() != CallingConv::Fast) { unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes(); - const ArrayRef<MCPhysReg> IntArgRegs = ABI.GetByValArgRegs(); + ArrayRef<MCPhysReg> IntArgRegs = ABI.GetByValArgRegs(); // FIXME: The O32 case actually describes no shadow registers. const MCPhysReg *ShadowRegs = ABI.IsO32() ? IntArgRegs.data() : Mips64DPRegs; @@ -3860,8 +3894,7 @@ MipsTargetLowering::emitPseudoSELECT(MachineInstr *MI, MachineBasicBlock *BB, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... diff --git a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h index b3d861d..b33e125 100644 --- a/contrib/llvm/lib/Target/Mips/MipsISelLowering.h +++ b/contrib/llvm/lib/Target/Mips/MipsISelLowering.h @@ -67,6 +67,10 @@ namespace llvm { // Return Ret, + // Interrupt, exception, error trap Return + ERet, + + // Software Exception Return. EH_RETURN, // Node used to extract integer from accumulator. @@ -231,6 +235,9 @@ namespace llvm { return MVT::i32; } + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const override; @@ -258,17 +265,25 @@ namespace llvm { EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const override; - struct LTStr { - bool operator()(const char *S1, const char *S2) const { - return strcmp(S1, S2) < 0; - } - }; - void HandleByVal(CCState *, unsigned &, unsigned) const override; unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return ABI.IsN64() ? 
Mips::A0_64 : Mips::A0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return ABI.IsN64() ? Mips::A1_64 : Mips::A1; + } + /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { // Mips doesn't have any special address spaces so we just reserve @@ -290,9 +305,10 @@ namespace llvm { unsigned GOTFlag = IsN32OrN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT; SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty), getTargetNode(N, Ty, DAG, GOTFlag)); - SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT, - MachinePointerInfo::getGOT(), false, false, - false, 0); + SDValue Load = + DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); unsigned LoFlag = IsN32OrN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty, getTargetNode(N, Ty, DAG, LoFlag)); @@ -487,6 +503,9 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, SelectionDAG &DAG) const override; + SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps, SDLoc DL, + SelectionDAG &DAG) const; + bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const override; // Inline asm support diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td index cb91225..377260f 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td +++ b/contrib/llvm/lib/Target/Mips/MipsInstrFPU.td @@ -136,7 +136,7 @@ multiclass ABSS_M<string opstr, InstrItinClass Itin, multiclass ROUND_M<string opstr, InstrItinClass Itin> { def _D32 : MMRel, ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>, FGR_32; - def _D64 : ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>, FGR_64 { + def _D64 : StdMMR6Rel, ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>, FGR_64 { let DecoderNamespace = "Mips64"; } } @@ -267,24 +267,25 @@ defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>, ISA_MIPS1_NOT_32R6_64R6, //===----------------------------------------------------------------------===// // Floating Point Instructions //===----------------------------------------------------------------------===// -def ROUND_W_S : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, +def ROUND_W_S : MMRel, StdMMR6Rel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>, ABSS_FM<0xc, 16>, ISA_MIPS2; -def TRUNC_W_S : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>, +defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2; +def TRUNC_W_S : MMRel, StdMMR6Rel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>, ABSS_FM<0xd, 16>, ISA_MIPS2; -def CEIL_W_S : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, +def CEIL_W_S : MMRel, StdMMR6Rel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>, ABSS_FM<0xe, 16>, ISA_MIPS2; -def FLOOR_W_S : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>, +def FLOOR_W_S : MMRel, StdMMR6Rel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>, ABSS_FM<0xf, 16>, ISA_MIPS2; def CVT_W_S : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x24, 16>; -defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2; defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2; defm CEIL_W : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2; 
defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2; defm CVT_W : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>; let DecoderNamespace = "Mips64" in { + let AdditionalPredicates = [NotInMicroMips] in { def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>, ABSS_FM<0x8, 16>, FGR_64; def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>, @@ -301,14 +302,17 @@ let DecoderNamespace = "Mips64" in { ABSS_FM<0xb, 16>, FGR_64; def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, II_FLOOR>, ABSS_FM<0xb, 17>, FGR_64; + } } def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x20, 20>; -def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>, - ABSS_FM<0x25, 16>, INSN_MIPS3_32R2; -def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>, - ABSS_FM<0x25, 17>, INSN_MIPS3_32R2; +let AdditionalPredicates = [NotInMicroMips] in{ + def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>, + ABSS_FM<0x25, 16>, INSN_MIPS3_32R2; + def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>, + ABSS_FM<0x25, 17>, INSN_MIPS3_32R2; +} def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>, ABSS_FM<0x20, 17>, FGR_32; @@ -320,8 +324,10 @@ def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>, let DecoderNamespace = "Mips64" in { def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>, ABSS_FM<0x20, 17>, FGR_64; - def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>, - ABSS_FM<0x20, 21>, FGR_64; + let AdditionalPredicates = [NotInMicroMips] in{ + def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>, + ABSS_FM<0x20, 21>, FGR_64; + } def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>, ABSS_FM<0x21, 20>, FGR_64; def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>, @@ -345,8 +351,8 @@ def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>, defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>; defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>; -def FSQRT_S : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, fsqrt>, - ABSS_FM<0x4, 16>, ISA_MIPS2; +def FSQRT_S : MMRel, StdMMR6Rel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, + II_SQRT_S, fsqrt>, ABSS_FM<0x4, 16>, ISA_MIPS2; defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2; // The odd-numbered registers are only referenced when doing loads, @@ -503,13 +509,13 @@ let AdditionalPredicates = [NoNaNsFPMath], def MIPS_BRANCH_F : PatLeaf<(i32 0)>; def MIPS_BRANCH_T : PatLeaf<(i32 1)>; -def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, IIBranch, MIPS_BRANCH_F>, +def BC1F : MMRel, BC1F_FT<"bc1f", brtarget, II_BC1F, MIPS_BRANCH_F>, BC1F_FM<0, 0>, ISA_MIPS1_NOT_32R6_64R6; -def BC1FL : MMRel, BC1F_FT<"bc1fl", brtarget, IIBranch, MIPS_BRANCH_F, 0>, +def BC1FL : MMRel, BC1F_FT<"bc1fl", brtarget, II_BC1FL, MIPS_BRANCH_F, 0>, BC1F_FM<1, 0>, ISA_MIPS2_NOT_32R6_64R6; -def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, IIBranch, MIPS_BRANCH_T>, +def BC1T : MMRel, BC1F_FT<"bc1t", brtarget, II_BC1T, MIPS_BRANCH_T>, BC1F_FM<0, 1>, ISA_MIPS1_NOT_32R6_64R6; -def BC1TL : MMRel, BC1F_FT<"bc1tl", brtarget, IIBranch, MIPS_BRANCH_T, 0>, +def BC1TL : MMRel, BC1F_FT<"bc1tl", brtarget, II_BC1TL, MIPS_BRANCH_T, 0>, BC1F_FM<1, 1>, ISA_MIPS2_NOT_32R6_64R6; /// Floating Point Compare diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td index 5f4fcc3..45baf27 100644 --- 
a/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MipsInstrFormats.td @@ -132,7 +132,7 @@ class PseudoSE<dag outs, dag ins, list<dag> pattern, // These are aliases that require C++ handling to convert to the target // instruction, while InstAliases can be handled directly by tblgen. class MipsAsmPseudoInst<dag outs, dag ins, string asmstr>: - MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo> { + MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo>, PredicateControl { let isPseudo = 1; let Pattern = []; } @@ -644,16 +644,16 @@ class BRK_FM<bits<6> funct> : StdArch // Exception return format <Cop0|1|0|funct> //===----------------------------------------------------------------------===// -class ER_FM<bits<6> funct> : StdArch +class ER_FM<bits<6> funct, bit LLBit> : StdArch { bits<32> Inst; let Inst{31-26} = 0x10; let Inst{25} = 1; - let Inst{24-6} = 0; + let Inst{24-7} = 0; + let Inst{6} = LLBit; let Inst{5-0} = funct; } - //===----------------------------------------------------------------------===// // Enable/disable interrupt instruction format <Cop0|MFMC0|rt|12|0|sc|0|0> //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp index bb23cc0..b1d6950 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.cpp @@ -60,8 +60,8 @@ MachineMemOperand *MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI, MachineFrameInfo &MFI = *MF.getFrameInfo(); unsigned Align = MFI.getObjectAlignment(FI); - return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), Flag, - MFI.getObjectSize(FI), Align); + return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI), + Flag, MFI.getObjectSize(FI), Align); } //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td index ab98c90..d9fb8c8 100644 --- a/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -77,6 +77,9 @@ def MipsThreadPointer: SDNode<"MipsISD::ThreadPointer", SDT_MipsThreadPointer>; def MipsRet : SDNode<"MipsISD::Ret", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def MipsERet : SDNode<"MipsISD::ERet", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPSideEffect]>; + // These are target-independent nodes, but have target-specific formats. 
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart, [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; @@ -157,7 +160,7 @@ def HasMips3 : Predicate<"Subtarget->hasMips3()">, def HasMips4_32 : Predicate<"Subtarget->hasMips4_32()">, AssemblerPredicate<"FeatureMips4_32">; def NotMips4_32 : Predicate<"!Subtarget->hasMips4_32()">, - AssemblerPredicate<"FeatureMips4_32">; + AssemblerPredicate<"!FeatureMips4_32">; def HasMips4_32r2 : Predicate<"Subtarget->hasMips4_32r2()">, AssemblerPredicate<"FeatureMips4_32r2">; def HasMips5_32r2 : Predicate<"Subtarget->hasMips5_32r2()">, @@ -166,6 +169,8 @@ def HasMips32 : Predicate<"Subtarget->hasMips32()">, AssemblerPredicate<"FeatureMips32">; def HasMips32r2 : Predicate<"Subtarget->hasMips32r2()">, AssemblerPredicate<"FeatureMips32r2">; +def HasMips32r5 : Predicate<"Subtarget->hasMips32r5()">, + AssemblerPredicate<"FeatureMips32r5">; def HasMips32r6 : Predicate<"Subtarget->hasMips32r6()">, AssemblerPredicate<"FeatureMips32r6">; def NotMips32r6 : Predicate<"!Subtarget->hasMips32r6()">, @@ -176,6 +181,8 @@ def IsGP32bit : Predicate<"!Subtarget->isGP64bit()">, AssemblerPredicate<"!FeatureGP64Bit">; def HasMips64 : Predicate<"Subtarget->hasMips64()">, AssemblerPredicate<"FeatureMips64">; +def NotMips64 : Predicate<"!Subtarget->hasMips64()">, + AssemblerPredicate<"!FeatureMips64">; def HasMips64r2 : Predicate<"Subtarget->hasMips64r2()">, AssemblerPredicate<"FeatureMips64r2">; def HasMips64r6 : Predicate<"Subtarget->hasMips64r6()">, @@ -184,6 +191,8 @@ def NotMips64r6 : Predicate<"!Subtarget->hasMips64r6()">, AssemblerPredicate<"!FeatureMips64r6">; def HasMicroMips32r6 : Predicate<"Subtarget->inMicroMips32r6Mode()">, AssemblerPredicate<"FeatureMicroMips,FeatureMips32r6">; +def HasMicroMips64r6 : Predicate<"Subtarget->inMicroMips64r6Mode()">, + AssemblerPredicate<"FeatureMicroMips,FeatureMips64r6">; def InMips16Mode : Predicate<"Subtarget->inMips16Mode()">, AssemblerPredicate<"FeatureMips16">; def HasCnMips : Predicate<"Subtarget->hasCnMips()">, @@ -201,6 +210,12 @@ def NotInMicroMips : Predicate<"!Subtarget->inMicroMipsMode()">, def IsLE : Predicate<"Subtarget->isLittle()">; def IsBE : Predicate<"!Subtarget->isLittle()">; def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; +def UseTCCInDIV : AssemblerPredicate<"FeatureUseTCCInDIV">; +def HasEVA : Predicate<"Subtarget->hasEVA()">, + AssemblerPredicate<"FeatureEVA,FeatureMips32r2">; +def HasMSA : Predicate<"Subtarget->hasMSA()">, + AssemblerPredicate<"FeatureMSA">; + //===----------------------------------------------------------------------===// // Mips GPR size adjectives. 
@@ -242,6 +257,7 @@ class ISA_MIPS32R2 { list<Predicate> InsnPredicates = [HasMips32r2]; } class ISA_MIPS32R2_NOT_32R6_64R6 { list<Predicate> InsnPredicates = [HasMips32r2, NotMips32r6, NotMips64r6]; } +class ISA_MIPS32R5 { list<Predicate> InsnPredicates = [HasMips32r5]; } class ISA_MIPS64 { list<Predicate> InsnPredicates = [HasMips64]; } class ISA_MIPS64_NOT_64R6 { list<Predicate> InsnPredicates = [HasMips64, NotMips64r6]; @@ -249,9 +265,21 @@ class ISA_MIPS64_NOT_64R6 { class ISA_MIPS64R2 { list<Predicate> InsnPredicates = [HasMips64r2]; } class ISA_MIPS32R6 { list<Predicate> InsnPredicates = [HasMips32r6]; } class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; } +class ISA_MICROMIPS { list<Predicate> InsnPredicates = [InMicroMips]; } class ISA_MICROMIPS32R6 { list<Predicate> InsnPredicates = [HasMicroMips32r6]; } +class ISA_MICROMIPS64R6 { + list<Predicate> InsnPredicates = [HasMicroMips64r6]; +} +class ISA_MICROMIPS32_NOT_MIPS32R6 { + list<Predicate> InsnPredicates = [InMicroMips, NotMips32r6]; +} + +class INSN_EVA { list<Predicate> InsnPredicates = [HasEVA]; } +class INSN_EVA_NOT_32R6_64R6 { + list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6, HasEVA]; +} // The portions of MIPS-III that were also added to MIPS32 class INSN_MIPS3_32 { list<Predicate> InsnPredicates = [HasMips3_32]; } @@ -283,6 +311,28 @@ class INSN_MIPS5_32R2_NOT_32R6_64R6 { list<Predicate> InsnPredicates = [HasMips5_32r2, NotMips32r6, NotMips64r6]; } +class ASE_CNMIPS { + list<Predicate> InsnPredicates = [HasCnMips]; +} + +class ASE_MSA { + list<Predicate> InsnPredicates = [HasMSA]; +} + +class ASE_MSA_NOT_MSA64 { + list<Predicate> InsnPredicates = [HasMSA, NotMips64]; +} + +class ASE_MSA64 { + list<Predicate> InsnPredicates = [HasMSA, HasMips64]; +} + +// Class used for separating microMIPSr6 and microMIPS (r3) instructions. +// It can be used only on instructions that don't inherit PredicateControl. +class ISA_MICROMIPS_NOT_32R6_64R6 : PredicateControl { + let InsnPredicates = [InMicroMips, NotMips32r6, NotMips64r6]; +} + +//===----------------------------------------------------------------------===// class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl { @@ -335,6 +385,81 @@ include "MipsInstrFormats.td" // Mips Operand, Complex Patterns and Transformations Definitions. 
//===----------------------------------------------------------------------===// +class ConstantSImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = []> + : AsmOperandClass { + let Name = "ConstantSImm" # Bits; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isConstantSImm<" # Bits # ">"; + let SuperClasses = Supers; + let DiagnosticType = "SImm" # Bits; +} + +class ConstantUImmAsmOperandClass<int Bits, list<AsmOperandClass> Supers = [], + int Offset = 0> : AsmOperandClass { + let Name = "ConstantUImm" # Bits # "_" # Offset; + let RenderMethod = "addConstantUImmOperands<" # Bits # ", " # Offset # ">"; + let PredicateMethod = "isConstantUImm<" # Bits # ", " # Offset # ">"; + let SuperClasses = Supers; + let DiagnosticType = "UImm" # Bits # "_" # Offset; +} + +def ConstantUImm10AsmOperandClass + : ConstantUImmAsmOperandClass<10, []>; +def ConstantUImm8AsmOperandClass + : ConstantUImmAsmOperandClass<8, [ConstantUImm10AsmOperandClass]>; +def ConstantUImm7AsmOperandClass + : ConstantUImmAsmOperandClass<7, [ConstantUImm8AsmOperandClass]>; +def ConstantUImm6AsmOperandClass + : ConstantUImmAsmOperandClass<6, [ConstantUImm7AsmOperandClass]>; +def ConstantSImm6AsmOperandClass + : ConstantSImmAsmOperandClass<6, [ConstantUImm7AsmOperandClass]>; +def ConstantUImm5Plus1AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 1>; +def ConstantUImm5Plus32AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32>; +def ConstantUImm5Plus33AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 33>; +def ConstantUImm5Plus32NormalizeAsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass], 32> { + let Name = "ConstantUImm5_32_Norm"; + // We must also subtract 32 when we render the operand. 
+ let RenderMethod = "addConstantUImmOperands<5, 32, -32>"; +} +def ConstantUImm5Lsl2AsmOperandClass : AsmOperandClass { + let Name = "UImm5Lsl2"; + let RenderMethod = "addImmOperands"; + let PredicateMethod = "isScaledUImm<5, 2>"; + let SuperClasses = [ConstantUImm6AsmOperandClass]; + let DiagnosticType = "UImm5_Lsl2"; +} +def ConstantUImm5ReportUImm6AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]> { + let Name = "ConstantUImm5_0_Report_UImm6"; + let DiagnosticType = "UImm5_0_Report_UImm6"; +} +def ConstantUImm5AsmOperandClass + : ConstantUImmAsmOperandClass<5, [ConstantUImm6AsmOperandClass]>; +def ConstantUImm4AsmOperandClass + : ConstantUImmAsmOperandClass< + 4, [ConstantUImm5AsmOperandClass, + ConstantUImm5Plus32AsmOperandClass, + ConstantUImm5Plus32NormalizeAsmOperandClass]>; +def ConstantUImm3AsmOperandClass + : ConstantUImmAsmOperandClass<3, [ConstantUImm4AsmOperandClass]>; +def ConstantUImm2Plus1AsmOperandClass + : ConstantUImmAsmOperandClass<2, [ConstantUImm3AsmOperandClass], 1>; +def ConstantUImm2AsmOperandClass + : ConstantUImmAsmOperandClass<2, [ConstantUImm3AsmOperandClass]>; +def ConstantUImm1AsmOperandClass + : ConstantUImmAsmOperandClass<1, [ConstantUImm2AsmOperandClass]>; +def ConstantImmzAsmOperandClass : AsmOperandClass { + let Name = "ConstantImmz"; + let RenderMethod = "addConstantUImmOperands<1>"; + let PredicateMethod = "isConstantImmz"; + let SuperClasses = [ConstantUImm1AsmOperandClass]; + let DiagnosticType = "Immz"; +} + def MipsJumpTargetAsmOperand : AsmOperandClass { let Name = "JumpTarget"; let ParserMethod = "parseJumpTarget"; @@ -360,6 +485,10 @@ def calltarget : Operand<iPTR> { def imm64: Operand<i64>; +def simm6 : Operand<i32> { + let ParserMatchClass = ConstantSImm6AsmOperandClass; + let OperandType = "OPERAND_IMMEDIATE"; +} def simm9 : Operand<i32>; def simm10 : Operand<i32>; def simm11 : Operand<i32>; @@ -380,23 +509,12 @@ def simm18_lsl3 : Operand<i32> { let ParserMatchClass = MipsJumpTargetAsmOperand; } -def simm20 : Operand<i32> { -} +def simm20 : Operand<i32>; +def simm32 : Operand<i32>; def uimm20 : Operand<i32> { } -def MipsUImm10AsmOperand : AsmOperandClass { - let Name = "UImm10"; - let RenderMethod = "addImmOperands"; - let ParserMethod = "parseImm"; - let PredicateMethod = "isUImm<10>"; -} - -def uimm10 : Operand<i32> { - let ParserMatchClass = MipsUImm10AsmOperand; -} - def simm16_64 : Operand<i64> { let DecoderMethod = "DecodeSimm16"; } @@ -404,23 +522,71 @@ def simm16_64 : Operand<i64> { // Zero def uimmz : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantImmzAsmOperandClass; +} + +// Unsigned Operands +foreach I = {1, 2, 3, 4, 5, 6, 7, 8, 10} in + def uimm # I : Operand<i32> { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = + !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass"); + } + +def uimm2_plus1 : Operand<i32> { + let PrintMethod = "printUnsignedImm"; + let EncoderMethod = "getUImmWithOffsetEncoding<2, 1>"; + let DecoderMethod = "DecodeUImmWithOffset<2, 1>"; + let ParserMatchClass = ConstantUImm2Plus1AsmOperandClass; } -// Unsigned Operand -def uimm2 : Operand<i32> { +def uimm5_plus1 : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>"; + let DecoderMethod = "DecodeUImmWithOffset<5, 1>"; + let ParserMatchClass = ConstantUImm5Plus1AsmOperandClass; } -def uimm3 : Operand<i32> { +def uimm5_plus32 : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = 
ConstantUImm5Plus32AsmOperandClass; } -def uimm5 : Operand<i32> { +def uimm5_plus33 : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let EncoderMethod = "getUImmWithOffsetEncoding<5, 1>"; + let DecoderMethod = "DecodeUImmWithOffset<5, 1>"; + let ParserMatchClass = ConstantUImm5Plus33AsmOperandClass; } -def uimm6 : Operand<i32> { +def uimm5_plus32_normalize : Operand<i32> { let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass; +} + +def uimm5_lsl2 : Operand<OtherVT> { + let EncoderMethod = "getUImm5Lsl2Encoding"; + let DecoderMethod = "DecodeUImm5lsl2"; + let ParserMatchClass = ConstantUImm5Lsl2AsmOperandClass; +} + +def uimm5_plus32_normalize_64 : Operand<i64> { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantUImm5Plus32NormalizeAsmOperandClass; +} + +foreach I = {5} in + def uimm # I # _64 : Operand<i64> { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = + !cast<AsmOperandClass>("ConstantUImm" # I # "AsmOperandClass"); + } + +// Like uimm5_64 but reports a less confusing error for 32-63 when +// an instruction alias permits that. +def uimm5_64_report_uimm6 : Operand<i64> { + let PrintMethod = "printUnsignedImm"; + let ParserMatchClass = ConstantUImm5ReportUImm6AsmOperandClass; } def uimm16 : Operand<i32> { @@ -435,6 +601,22 @@ def MipsMemAsmOperand : AsmOperandClass { let ParserMethod = "parseMemOperand"; } +def MipsMemSimm9AsmOperand : AsmOperandClass { + let Name = "MemOffsetSimm9"; + let SuperClasses = [MipsMemAsmOperand]; + let RenderMethod = "addMemOperands"; + let ParserMethod = "parseMemOperand"; + let PredicateMethod = "isMemWithSimmOffset<9>"; +} + +def MipsMemSimm9GPRAsmOperand : AsmOperandClass { + let Name = "MemOffsetSimm9GPR"; + let SuperClasses = [MipsMemAsmOperand]; + let RenderMethod = "addMemOperands"; + let ParserMethod = "parseMemOperand"; + let PredicateMethod = "isMemWithSimmOffsetGPR<9>"; +} + def MipsMemSimm11AsmOperand : AsmOperandClass { let Name = "MemOffsetSimm11"; let SuperClasses = [MipsMemAsmOperand]; @@ -485,6 +667,13 @@ def mem_msa : mem_generic { def mem_simm9 : mem_generic { let MIOperandInfo = (ops ptr_rc, simm9); let EncoderMethod = "getMemEncoding"; + let ParserMatchClass = MipsMemSimm9AsmOperand; +} + +def mem_simm9gpr : mem_generic { + let MIOperandInfo = (ops ptr_rc, simm9); + let EncoderMethod = "getMemEncoding"; + let ParserMatchClass = MipsMemSimm9GPRAsmOperand; } def mem_simm11 : mem_generic { @@ -512,12 +701,6 @@ def PtrRC : Operand<iPTR> { let ParserMatchClass = GPR32AsmOperand; } -// size operand of ext instruction -def size_ext : Operand<i32> { - let EncoderMethod = "getSizeExtEncoding"; - let DecoderMethod = "DecodeExtSize"; -} - // size operand of ins instruction def size_ins : Operand<i32> { let EncoderMethod = "getSizeInsEncoding"; @@ -657,7 +840,7 @@ class shift_rotate_reg<string opstr, RegisterOperand RO, InstrItinClass itin, [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))], itin, FrmR, opstr>; -// Load Upper Imediate +// Load Upper Immediate class LoadUpper<string opstr, RegisterOperand RO, Operand Imm>: InstSE<(outs RO:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"), [], II_LUI, FrmI, opstr>, IsAsCheapAsAMove { @@ -675,14 +858,19 @@ class Load<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag, let mayLoad = 1; } -class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag, +class StoreMemory<string opstr, DAGOperand RO, DAGOperand MO, + SDPatternOperator OpNode = null_frag, InstrItinClass 
Itin = NoItinerary, ComplexPattern Addr = addr> : - InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"), + InstSE<(outs), (ins RO:$rt, MO:$addr), !strconcat(opstr, "\t$rt, $addr"), [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> { let DecoderMethod = "DecodeMem"; let mayStore = 1; } +class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag, + InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> : + StoreMemory<opstr, RO, mem, OpNode, Itin, Addr>; + // Load/Store Left/Right let canFoldAsLoad = 1 in class LoadLeftRight<string opstr, SDNode OpNode, RegisterOperand RO, @@ -740,7 +928,7 @@ class CBranch<string opstr, DAGOperand opnd, PatFrag cond_op, RegisterOperand RO, bit DelaySlot = 1> : InstSE<(outs), (ins RO:$rs, RO:$rt, opnd:$offset), !strconcat(opstr, "\t$rs, $rt, $offset"), - [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], IIBranch, + [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], II_BCC, FrmI, opstr> { let isBranch = 1; let isTerminator = 1; @@ -752,7 +940,7 @@ class CBranchZero<string opstr, DAGOperand opnd, PatFrag cond_op, RegisterOperand RO, bit DelaySlot = 1> : InstSE<(outs), (ins RO:$rs, opnd:$offset), !strconcat(opstr, "\t$rs, $offset"), - [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], IIBranch, + [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], II_BCCZ, FrmI, opstr> { let isBranch = 1; let isTerminator = 1; @@ -778,7 +966,7 @@ class SetCC_I<string opstr, PatFrag cond_op, Operand Od, PatLeaf imm_type, class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator, SDPatternOperator targetoperator, string bopstr> : InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"), - [(operator targetoperator:$target)], IIBranch, FrmJ, bopstr> { + [(operator targetoperator:$target)], II_J, FrmJ, bopstr> { let isTerminator=1; let isBarrier=1; let hasDelaySlot = 1; @@ -788,7 +976,7 @@ class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator, // Unconditional branch class UncondBranch<Instruction BEQInst> : - PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], IIBranch>, + PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], II_B>, PseudoInstExpansion<(BEQInst ZERO, ZERO, brtarget:$offset)> { let isBranch = 1; let isTerminator = 1; @@ -802,7 +990,7 @@ class UncondBranch<Instruction BEQInst> : let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in class JumpFR<string opstr, RegisterOperand RO, SDPatternOperator operator = null_frag>: - InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], IIBranch, + InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], II_JR, FrmR, opstr>; // Indirect branch @@ -815,23 +1003,23 @@ class IndirectBranch<string opstr, RegisterOperand RO> : JumpFR<opstr, RO> { let isCall=1, hasDelaySlot=1, Defs = [RA] in { class JumpLink<string opstr, DAGOperand opnd> : InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"), - [(MipsJmpLink imm:$target)], IIBranch, FrmJ, opstr> { + [(MipsJmpLink imm:$target)], II_JAL, FrmJ, opstr> { let DecoderMethod = "DecodeJumpTarget"; } class JumpLinkRegPseudo<RegisterOperand RO, Instruction JALRInst, Register RetReg, RegisterOperand ResRO = RO>: - PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], IIBranch>, + PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], II_JALR>, PseudoInstExpansion<(JALRInst RetReg, ResRO:$rs)>; class JumpLinkReg<string opstr, RegisterOperand RO>: InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"), - [], IIBranch, FrmR>; + [], II_JALR, FrmR, opstr>; 
class BGEZAL_FT<string opstr, DAGOperand opnd, RegisterOperand RO, bit DelaySlot = 1> : InstSE<(outs), (ins RO:$rs, opnd:$offset), - !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr> { + !strconcat(opstr, "\t$rs, $offset"), [], II_BCCZAL, FrmI, opstr> { let hasDelaySlot = DelaySlot; } @@ -840,17 +1028,17 @@ let isCall=1, hasDelaySlot=1, Defs = [RA] in { let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1, hasExtraSrcRegAllocReq = 1, Defs = [AT] in { class TailCall<Instruction JumpInst> : - PseudoSE<(outs), (ins calltarget:$target), [], IIBranch>, + PseudoSE<(outs), (ins calltarget:$target), [], II_J>, PseudoInstExpansion<(JumpInst jmptarget:$target)>; class TailCallReg<RegisterOperand RO, Instruction JRInst, RegisterOperand ResRO = RO> : - PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], IIBranch>, + PseudoSE<(outs), (ins RO:$rs), [(MipsTailCall RO:$rs)], II_JR>, PseudoInstExpansion<(JRInst ResRO:$rs)>; } class BAL_BR_Pseudo<Instruction RealInst> : - PseudoSE<(outs), (ins brtarget:$offset), [], IIBranch>, + PseudoSE<(outs), (ins brtarget:$offset), [], II_BCCZAL>, PseudoInstExpansion<(RealInst ZERO, brtarget:$offset)> { let isBranch = 1; let isTerminator = 1; @@ -997,9 +1185,10 @@ class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO, [(set RO:$rd, (sext_inreg RO:$rt, vt))], itin, FrmR, opstr>; // Subword Swap -class SubwordSwap<string opstr, RegisterOperand RO>: - InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [], - NoItinerary, FrmR, opstr> { +class SubwordSwap<string opstr, RegisterOperand RO, + InstrItinClass itin = NoItinerary>: + InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [], itin, + FrmR, opstr> { let hasSideEffects = 0; } @@ -1010,8 +1199,8 @@ class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> : // Ext and Ins class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd, - SDPatternOperator Op = null_frag>: - InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ext:$size), + Operand SizeOpnd, SDPatternOperator Op = null_frag> : + InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, SizeOpnd:$size), !strconcat(opstr, " $rt, $rs, $pos, $size"), [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))], II_EXT, FrmR, opstr>, ISA_MIPS32R2; @@ -1074,6 +1263,9 @@ class TrapBase<Instruction RealInst> let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1 in def RetRA : PseudoSE<(outs), (ins), [(MipsRet)]>; +let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, hasSideEffects=1 in +def ERet : PseudoSE<(outs), (ins), [(MipsERet)]>; + let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt), [(callseq_start timm:$amt)]>; @@ -1215,10 +1407,11 @@ def LH : Load<"lh", GPR32Opnd, sextloadi16, II_LH, addrDefault>, MMRel, LW_FM<0x21>; def LHu : Load<"lhu", GPR32Opnd, zextloadi16, II_LHU>, MMRel, LW_FM<0x25>; let AdditionalPredicates = [NotInMicroMips] in { -def LW : Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel, +def LW : StdMMR6Rel, Load<"lw", GPR32Opnd, load, II_LW, addrDefault>, MMRel, LW_FM<0x23>; } -def SB : Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel, LW_FM<0x28>; +def SB : StdMMR6Rel, Store<"sb", GPR32Opnd, truncstorei8, II_SB>, MMRel, + LW_FM<0x28>; def SH : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>; let AdditionalPredicates = [NotInMicroMips] in { def SW : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>; @@ -1259,15 +1452,17 @@ let 
DecoderNamespace = "COP3_" in { } } -def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32; -def SYNCI : MMRel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2; +def SYNC : MMRel, StdMMR6Rel, SYNC_FT<"sync">, SYNC_FM, ISA_MIPS32; +def SYNCI : MMRel, StdMMR6Rel, SYNCI_FT<"synci">, SYNCI_FM, ISA_MIPS32R2; -def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>, ISA_MIPS2; -def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>, ISA_MIPS2; -def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>, ISA_MIPS2; -def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>, ISA_MIPS2; -def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>, ISA_MIPS2; -def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>, ISA_MIPS2; +let AdditionalPredicates = [NotInMicroMips] in { + def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>, ISA_MIPS2; + def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>, ISA_MIPS2; + def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>, ISA_MIPS2; + def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>, ISA_MIPS2; + def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>, ISA_MIPS2; + def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>, ISA_MIPS2; +} def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>, ISA_MIPS2_NOT_32R6_64R6; @@ -1290,14 +1485,15 @@ def TRAP : TrapBase<BREAK>; def SDBBP : MMRel, SYS_FT<"sdbbp">, SDBBP_FM, ISA_MIPS32_NOT_32R6_64R6; let AdditionalPredicates = [NotInMicroMips] in { -def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>, INSN_MIPS3_32; + def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18, 0x0>, INSN_MIPS3_32; + def ERETNC : MMRel, ER_FT<"eretnc">, ER_FM<0x18, 0x1>, ISA_MIPS32R5; + def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f, 0x0>, ISA_MIPS32; } -def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>, ISA_MIPS32; let AdditionalPredicates = [NotInMicroMips] in { -def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>, ISA_MIPS32R2; + def EI : MMRel, StdMMR6Rel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>, ISA_MIPS32R2; + def DI : MMRel, StdMMR6Rel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>, ISA_MIPS32R2; } -def DI : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>, ISA_MIPS32R2; let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug AdditionalPredicates = [NotInMicroMips] in { @@ -1359,7 +1555,8 @@ def TAILCALL_R : TailCallReg<GPR32Opnd, JR>; // Indirect branches are matched as PseudoIndirectBranch/PseudoIndirectBranch64 // then are expanded to JR, JR64, JALR, or JALR64 depending on the ISA. class PseudoIndirectBranchBase<RegisterOperand RO> : - MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)], IIBranch> { + MipsPseudo<(outs), (ins RO:$rs), [(brind RO:$rs)], + II_IndirectBranchPseudo> { let isTerminator=1; let isBarrier=1; let hasDelaySlot = 1; @@ -1369,12 +1566,12 @@ class PseudoIndirectBranchBase<RegisterOperand RO> : def PseudoIndirectBranch : PseudoIndirectBranchBase<GPR32Opnd>; -// Return instructions are matched as a RetRA instruction, then ar expanded +// Return instructions are matched as a RetRA instruction, then are expanded // into PseudoReturn/PseudoReturn64 after register allocation. Finally, // MipsAsmPrinter expands this into JR, JR64, JALR, or JALR64 depending on the // ISA. 
class PseudoReturnBase<RegisterOperand RO> : MipsPseudo<(outs), (ins RO:$rs), - [], IIBranch> { + [], II_ReturnPseudo> { let isTerminator = 1; let isBarrier = 1; let hasDelaySlot = 1; @@ -1441,8 +1638,11 @@ def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>, def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>, ISA_MIPS32_NOT_32R6_64R6; -/// Word Swap Bytes Within Halfwords -def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>, ISA_MIPS32R2; +let AdditionalPredicates = [NotInMicroMips] in { + /// Word Swap Bytes Within Halfwords + def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd, II_WSBH>, SEB_FM<2, 0x20>, + ISA_MIPS32R2; +} /// No operation. def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>; @@ -1485,10 +1685,12 @@ def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, II_DIV, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6; def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, II_DIVU, 0, 1, 1>, ISA_MIPS1_NOT_32R6_64R6; - +let AdditionalPredicates = [NotInMicroMips] in { def RDHWR : MMRel, ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM; - -def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, EXT_FM<0>; +} +// TODO: Add '0 < pos+size <= 32' constraint check to ext instruction +def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, uimm5_plus1, MipsExt>, + EXT_FM<0>; def INS : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM<4>; /// Move Control Registers From/To CPU Registers @@ -1499,9 +1701,9 @@ def MTC2 : MTC3OP<"mtc2", COP2Opnd, GPR32Opnd>, MFC3OP_FM<0x12, 4>; class Barrier<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary, FrmOther, asmstr>; -def SSNOP : MMRel, Barrier<"ssnop">, BARRIER_FM<1>; +def SSNOP : MMRel, StdMMR6Rel, Barrier<"ssnop">, BARRIER_FM<1>; def EHB : MMRel, Barrier<"ehb">, BARRIER_FM<3>; -def PAUSE : MMRel, Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2; +def PAUSE : MMRel, StdMMR6Rel, Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2; // JR_HB and JALR_HB are defined here using the new style naming // scheme because some of this code is shared with Mips32r6InstrInfo.td @@ -1562,11 +1764,60 @@ def CACHE : MMRel, CacheOp<"cache", mem>, CACHEOP_FM<0b101111>, def PREF : MMRel, CacheOp<"pref", mem>, CACHEOP_FM<0b110011>, INSN_MIPS3_32_NOT_32R6_64R6; +def ROL : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd), + "rol\t$rs, $rt, $rd">; +def ROLImm : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "rol\t$rs, $rt, $imm">; +def : MipsInstAlias<"rol $rd, $rs", + (ROL GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>; +def : MipsInstAlias<"rol $rd, $imm", + (ROLImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>; + +def ROR : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd), + "ror\t$rs, $rt, $rd">; +def RORImm : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "ror\t$rs, $rt, $imm">; +def : MipsInstAlias<"ror $rd, $rs", + (ROR GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>; +def : MipsInstAlias<"ror $rd, $imm", + (RORImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>; + +def DROL : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd), + "drol\t$rs, $rt, $rd">, ISA_MIPS64; +def DROLImm : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "drol\t$rs, $rt, $imm">, ISA_MIPS64; +def : MipsInstAlias<"drol $rd, $rs", + (DROL GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, ISA_MIPS64; +def : MipsInstAlias<"drol $rd, $imm", 
+ (DROLImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>, ISA_MIPS64; + +def DROR : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rd), + "dror\t$rs, $rt, $rd">, ISA_MIPS64; +def DRORImm : MipsAsmPseudoInst<(outs), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "dror\t$rs, $rt, $imm">, ISA_MIPS64; +def : MipsInstAlias<"dror $rd, $rs", + (DROR GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, ISA_MIPS64; +def : MipsInstAlias<"dror $rd, $imm", + (DRORImm GPR32Opnd:$rd, GPR32Opnd:$rd, simm16:$imm), 0>, ISA_MIPS64; + //===----------------------------------------------------------------------===// // Instruction aliases //===----------------------------------------------------------------------===// def : MipsInstAlias<"move $dst, $src", - (ADDu GPR32Opnd:$dst, GPR32Opnd:$src,ZERO), 1>, + (OR GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>, + GPR_32 { + let AdditionalPredicates = [NotInMicroMips]; +} +def : MipsInstAlias<"move $dst, $src", + (ADDu GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>, GPR_32 { let AdditionalPredicates = [NotInMicroMips]; } @@ -1630,27 +1881,27 @@ def : MipsInstAlias<"beqz $rs,$offset", def : MipsInstAlias<"beqzl $rs,$offset", (BEQL GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>; def : MipsInstAlias<"syscall", (SYSCALL 0), 1>; - + def : MipsInstAlias<"break", (BREAK 0, 0), 1>; def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>; let AdditionalPredicates = [NotInMicroMips] in { -def : MipsInstAlias<"ei", (EI ZERO), 1>, ISA_MIPS32R2; -} -def : MipsInstAlias<"di", (DI ZERO), 1>, ISA_MIPS32R2; - -def : MipsInstAlias<"teq $rs, $rt", - (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tge $rs, $rt", - (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tgeu $rs, $rt", - (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tlt $rs, $rt", - (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tltu $rs, $rt", - (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; -def : MipsInstAlias<"tne $rs, $rt", - (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; - + def : MipsInstAlias<"ei", (EI ZERO), 1>, ISA_MIPS32R2; + def : MipsInstAlias<"di", (DI ZERO), 1>, ISA_MIPS32R2; +} +let AdditionalPredicates = [NotInMicroMips] in { + def : MipsInstAlias<"teq $rs, $rt", + (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tge $rs, $rt", + (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tgeu $rs, $rt", + (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tlt $rs, $rt", + (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tltu $rs, $rt", + (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; + def : MipsInstAlias<"tne $rs, $rt", + (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>, ISA_MIPS2; +} def : MipsInstAlias<"sll $rd, $rt, $rs", (SLLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>; def : MipsInstAlias<"sub, $rd, $rs, $imm", @@ -1678,7 +1929,7 @@ def : MipsInstAlias<"sync", class LoadImmediate32<string instr_asm, Operand Od, RegisterOperand RO> : MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32), !strconcat(instr_asm, "\t$rt, $imm32")> ; -def LoadImm32 : LoadImmediate32<"li", uimm5, GPR32Opnd>; +def LoadImm32 : LoadImmediate32<"li", simm32, GPR32Opnd>; class LoadAddressFromReg32<string instr_asm, Operand MemOpnd, RegisterOperand RO> : @@ -1689,13 +1940,16 @@ def LoadAddrReg32 : LoadAddressFromReg32<"la", mem, GPR32Opnd>; class LoadAddressFromImm32<string 
instr_asm, Operand Od, RegisterOperand RO> : MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32), !strconcat(instr_asm, "\t$rt, $imm32")> ; -def LoadAddrImm32 : LoadAddressFromImm32<"la", uimm5, GPR32Opnd>; +def LoadAddrImm32 : LoadAddressFromImm32<"la", simm32, GPR32Opnd>; def JalTwoReg : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs), "jal\t$rd, $rs"> ; def JalOneReg : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs), "jal\t$rs"> ; +def NORImm : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), + "nor\t$rs, $rt, $imm"> ; + let hasDelaySlot = 1 in { def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins imm64:$imm64, brtarget:$offset), @@ -1718,12 +1972,62 @@ def BLTU : CondBranchPseudo<"bltu">; def BLEU : CondBranchPseudo<"bleu">; def BGEU : CondBranchPseudo<"bgeu">; def BGTU : CondBranchPseudo<"bgtu">; +def BLTL : CondBranchPseudo<"bltl">, ISA_MIPS2_NOT_32R6_64R6; +def BLEL : CondBranchPseudo<"blel">, ISA_MIPS2_NOT_32R6_64R6; +def BGEL : CondBranchPseudo<"bgel">, ISA_MIPS2_NOT_32R6_64R6; +def BGTL : CondBranchPseudo<"bgtl">, ISA_MIPS2_NOT_32R6_64R6; +def BLTUL: CondBranchPseudo<"bltul">, ISA_MIPS2_NOT_32R6_64R6; +def BLEUL: CondBranchPseudo<"bleul">, ISA_MIPS2_NOT_32R6_64R6; +def BGEUL: CondBranchPseudo<"bgeul">, ISA_MIPS2_NOT_32R6_64R6; +def BGTUL: CondBranchPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6; + +class CondBranchImmPseudo<string instr_asm> : + MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset), + !strconcat(instr_asm, "\t$rs, $imm, $offset")>; + +def BLTImmMacro : CondBranchImmPseudo<"blt">; +def BLEImmMacro : CondBranchImmPseudo<"ble">; +def BGEImmMacro : CondBranchImmPseudo<"bge">; +def BGTImmMacro : CondBranchImmPseudo<"bgt">; +def BLTUImmMacro : CondBranchImmPseudo<"bltu">; +def BLEUImmMacro : CondBranchImmPseudo<"bleu">; +def BGEUImmMacro : CondBranchImmPseudo<"bgeu">; +def BGTUImmMacro : CondBranchImmPseudo<"bgtu">; +def BLTLImmMacro : CondBranchImmPseudo<"bltl">, ISA_MIPS2_NOT_32R6_64R6; +def BLELImmMacro : CondBranchImmPseudo<"blel">, ISA_MIPS2_NOT_32R6_64R6; +def BGELImmMacro : CondBranchImmPseudo<"bgel">, ISA_MIPS2_NOT_32R6_64R6; +def BGTLImmMacro : CondBranchImmPseudo<"bgtl">, ISA_MIPS2_NOT_32R6_64R6; +def BLTULImmMacro : CondBranchImmPseudo<"bltul">, ISA_MIPS2_NOT_32R6_64R6; +def BLEULImmMacro : CondBranchImmPseudo<"bleul">, ISA_MIPS2_NOT_32R6_64R6; +def BGEULImmMacro : CondBranchImmPseudo<"bgeul">, ISA_MIPS2_NOT_32R6_64R6; +def BGTULImmMacro : CondBranchImmPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6; + +// FIXME: Predicates are removed because instructions are matched regardless of +// predicates, because PredicateControl was not in the hierarchy. This was +// done to emit more precise error message from expansion function. +// Once the tablegen-erated errors are made better, this needs to be fixed and +// predicates needs to be restored. 
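Note on the conditional-branch macros added above (blt, ble, bgt, bge and their unsigned/likely/immediate variants): they are MipsAsmPseudoInst definitions with no encoding of their own, so the assembler has to expand each one into a compare-into-$at followed by a real branch. The sketch below is a minimal, self-contained C++ model of that classic two-instruction expansion for the signed register forms only; the helper name and string-based output are invented for illustration, and the real expansion (which also covers the sltu-based unsigned, immediate, and branch-likely cases) lives in the target's assembly parser, not here.

#include <iostream>
#include <string>
#include <vector>

// Illustrative only: expand a "blt rs, rt, target"-style macro into the
// canonical MIPS pair "slt $at, rs, rt" + conditional branch on $at.
static std::vector<std::string>
expandSignedBranchMacro(const std::string &Mnemonic, const std::string &Rs,
                        const std::string &Rt, const std::string &Target) {
  // blt rs, rt, L  ==>  slt $at, rs, rt ; bne $at, $zero, L   (rs <  rt)
  // bge rs, rt, L  ==>  slt $at, rs, rt ; beq $at, $zero, L   (rs >= rt)
  // bgt rs, rt, L  ==>  slt $at, rt, rs ; bne $at, $zero, L   (rs >  rt)
  // ble rs, rt, L  ==>  slt $at, rt, rs ; beq $at, $zero, L   (rs <= rt)
  bool SwapOperands = (Mnemonic == "bgt" || Mnemonic == "ble");
  bool BranchOnSet = (Mnemonic == "blt" || Mnemonic == "bgt");
  std::string Lhs = SwapOperands ? Rt : Rs;
  std::string Rhs = SwapOperands ? Rs : Rt;
  return {"slt $at, " + Lhs + ", " + Rhs,
          std::string(BranchOnSet ? "bne" : "beq") + " $at, $zero, " + Target};
}

int main() {
  for (const std::string &S : expandSignedBranchMacro("blt", "$a0", "$a1", "loop"))
    std::cout << S << "\n";
  return 0;
}

The unsigned forms (bltu, bgeu, ...) follow the same pattern with sltu in place of slt.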
+ +def SDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "div\t$rs, $rt">; //, ISA_MIPS1_NOT_32R6_64R6; + +def UDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "divu\t$rs, $rt">; //, ISA_MIPS1_NOT_32R6_64R6; + +def DSDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "ddiv\t$rs, $rt">; //, ISA_MIPS64_NOT_64R6; + +def DUDivMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "ddivu\t$rs, $rt">; //, ISA_MIPS64_NOT_64R6; + +def Ulh : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr), + "ulh\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6; def Ulhu : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr), - "ulhu\t$rt, $addr">, ISA_MIPS1_NOT_32R6_64R6; + "ulhu\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6; def Ulw : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr), - "ulw\t$rt, $addr">, ISA_MIPS1_NOT_32R6_64R6; + "ulw\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6; //===----------------------------------------------------------------------===// // Arbitrary patterns that map to one or more instructions @@ -1939,6 +2243,16 @@ let AddedComplexity = 40 in { } } +// Atomic load patterns. +def : MipsPat<(atomic_load_8 addr:$a), (LB addr:$a)>; +def : MipsPat<(atomic_load_16 addr:$a), (LH addr:$a)>; +def : MipsPat<(atomic_load_32 addr:$a), (LW addr:$a)>; + +// Atomic store patterns. +def : MipsPat<(atomic_store_8 addr:$a, GPR32:$v), (SB GPR32:$v, addr:$a)>; +def : MipsPat<(atomic_store_16 addr:$a, GPR32:$v), (SH GPR32:$v, addr:$a)>; +def : MipsPat<(atomic_store_32 addr:$a, GPR32:$v), (SW GPR32:$v, addr:$a)>; + //===----------------------------------------------------------------------===// // Floating Point Support //===----------------------------------------------------------------------===// @@ -1964,6 +2278,10 @@ include "MipsDSPInstrInfo.td" include "MipsMSAInstrFormats.td" include "MipsMSAInstrInfo.td" +// EVA +include "MipsEVAInstrFormats.td" +include "MipsEVAInstrInfo.td" + // Micromips include "MicroMipsInstrFormats.td" include "MicroMipsInstrInfo.td" @@ -1972,3 +2290,11 @@ include "MicroMipsInstrFPU.td" // Micromips r6 include "MicroMips32r6InstrFormats.td" include "MicroMips32r6InstrInfo.td" + +// Micromips64 r6 +include "MicroMips64r6InstrFormats.td" +include "MicroMips64r6InstrInfo.td" + +// Micromips DSP +include "MicroMipsDSPInstrFormats.td" +include "MicroMipsDSPInstrInfo.td" diff --git a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp index 90f8cc0..49fb99a 100644 --- a/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsLongBranch.cpp @@ -148,7 +148,7 @@ void MipsLongBranch::splitMBB(MachineBasicBlock *MBB) { // Insert NewMBB and fix control flow. MachineBasicBlock *Tgt = getTargetMBB(*FirstBr); NewMBB->transferSuccessors(MBB); - NewMBB->removeSuccessor(Tgt); + NewMBB->removeSuccessor(Tgt, true); MBB->addSuccessor(NewMBB); MBB->addSuccessor(Tgt); MF->insert(std::next(MachineFunction::iterator(MBB)), NewMBB); @@ -161,7 +161,7 @@ void MipsLongBranch::initMBBInfo() { // Split the MBBs if they have two branches. Each basic block should have at // most one branch after this loop is executed. 
for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E;) - splitMBB(I++); + splitMBB(&*I++); MF->RenumberBlocks(); MBBInfos.clear(); @@ -262,8 +262,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { static_cast<const MipsInstrInfo *>(Subtarget.getInstrInfo()); MF->insert(FallThroughMBB, LongBrMBB); - MBB->removeSuccessor(TgtMBB); - MBB->addSuccessor(LongBrMBB); + MBB->replaceSuccessor(TgtMBB, LongBrMBB); if (IsPIC) { MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB); @@ -434,7 +433,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) { I.Br->addOperand(MachineOperand::CreateMBB(LongBrMBB)); } else // Change branch destination and reverse condition. - replaceBranch(*MBB, I.Br, DL, FallThroughMBB); + replaceBranch(*MBB, I.Br, DL, &*FallThroughMBB); } static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) { diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td index bff2d0f..7d25ea5 100644 --- a/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td +++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrFormats.td @@ -7,18 +7,12 @@ // //===----------------------------------------------------------------------===// -def HasMSA : Predicate<"Subtarget->hasMSA()">, - AssemblerPredicate<"FeatureMSA">; - -class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> { - let Predicates = [HasMSA]; +class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, + PredicateControl, ASE_MSA { + let EncodingPredicates = [HasStdEnc]; let Inst{31-26} = 0b011110; } -class MSA64Inst : MSAInst { - let Predicates = [HasMSA, HasMips64]; -} - class MSACBranch : MSAInst { let Inst{31-26} = 0b010001; } @@ -27,10 +21,6 @@ class MSASpecial : MSAInst { let Inst{31-26} = 0b000000; } -class MSA64Special : MSA64Inst { - let Inst{31-26} = 0b000000; -} - class MSAPseudo<dag outs, dag ins, list<dag> pattern, InstrItinClass itin = IIPseudo>: MipsPseudo<outs, ins, pattern, itin> { @@ -100,7 +90,7 @@ class MSA_2R_FILL_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst { let Inst{5-0} = minor; } -class MSA_2R_FILL_D_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSA64Inst { +class MSA_2R_FILL_D_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst { bits<5> rs; bits<5> wd; @@ -293,7 +283,7 @@ class MSA_ELM_COPY_W_FMT<bits<4> major, bits<6> minor>: MSAInst { let Inst{5-0} = minor; } -class MSA_ELM_COPY_D_FMT<bits<4> major, bits<6> minor>: MSA64Inst { +class MSA_ELM_COPY_D_FMT<bits<4> major, bits<6> minor>: MSAInst { bits<4> n; bits<5> ws; bits<5> rd; @@ -345,7 +335,7 @@ class MSA_ELM_INSERT_W_FMT<bits<4> major, bits<6> minor>: MSAInst { let Inst{5-0} = minor; } -class MSA_ELM_INSERT_D_FMT<bits<4> major, bits<6> minor>: MSA64Inst { +class MSA_ELM_INSERT_D_FMT<bits<4> major, bits<6> minor>: MSAInst { bits<6> n; bits<5> rs; bits<5> wd; @@ -450,7 +440,7 @@ class SPECIAL_LSA_FMT<bits<6> minor>: MSASpecial { let Inst{5-0} = minor; } -class SPECIAL_DLSA_FMT<bits<6> minor>: MSA64Special { +class SPECIAL_DLSA_FMT<bits<6> minor>: MSASpecial { bits<5> rs; bits<5> rt; bits<5> rd; diff --git a/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td index 970e98e..eacfcec 100644 --- a/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/contrib/llvm/lib/Target/Mips/MipsMSAInstrInfo.td @@ -63,30 +63,13 @@ def MipsVExtractSExt : SDNode<"MipsISD::VEXTRACT_SEXT_ELT", def MipsVExtractZExt : SDNode<"MipsISD::VEXTRACT_ZEXT_ELT", SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>; 
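Aside on the MipsLongBranch.cpp changes above (splitMBB(&*I++), &*FallThroughMBB): these are mechanical updates that make the iterator-to-pointer conversion explicit, so the callee receives the MachineBasicBlock* it expects. The standalone C++ sketch below demonstrates the same &*it++ idiom on a std::list; it models only the iterator behaviour, not the LLVM classes themselves.

#include <iostream>
#include <list>

struct Block { int Id; };

// Takes a Block by pointer, mirroring APIs that expect a raw block pointer.
static void process(Block *B) { std::cout << "processing block " << B->Id << "\n"; }

int main() {
  std::list<Block> Blocks{{0}, {1}, {2}};

  // &*I yields a pointer to the element the iterator currently designates,
  // and the post-increment advances I before process() runs, so whatever
  // process() does around the current element cannot disturb the traversal.
  for (auto I = Blocks.begin(), E = Blocks.end(); I != E;)
    process(&*I++);
  return 0;
}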
+def immZExt1Ptr : ImmLeaf<iPTR, [{return isUInt<1>(Imm);}]>; +def immZExt2Ptr : ImmLeaf<iPTR, [{return isUInt<2>(Imm);}]>; def immZExt4Ptr : ImmLeaf<iPTR, [{return isUInt<4>(Imm);}]>; def immZExt6Ptr : ImmLeaf<iPTR, [{return isUInt<6>(Imm);}]>; // Operands -// The immediate of an LSA instruction needs special handling -// as the encoded value should be subtracted by one. -def uimm2LSAAsmOperand : AsmOperandClass { - let Name = "LSAImm"; - let ParserMethod = "parseLSAImm"; - let RenderMethod = "addImmOperands"; -} - -def LSAImm : Operand<i32> { - let PrintMethod = "printUnsignedImm"; - let EncoderMethod = "getLSAImmEncoding"; - let DecoderMethod = "DecodeLSAImm"; - let ParserMatchClass = uimm2LSAAsmOperand; -} - -def uimm4 : Operand<i32> { - let PrintMethod = "printUnsignedImm8"; -} - def uimm4_ptr : Operand<iPTR> { let PrintMethod = "printUnsignedImm8"; } @@ -95,10 +78,6 @@ def uimm6_ptr : Operand<iPTR> { let PrintMethod = "printUnsignedImm8"; } -def uimm8 : Operand<i32> { - let PrintMethod = "printUnsignedImm8"; -} - def simm5 : Operand<i32>; def vsplat_uimm1 : Operand<vAny> { @@ -639,7 +618,6 @@ class COPY_S_D_ENC : MSA_ELM_COPY_D_FMT<0b0010, 0b011001>; class COPY_U_B_ENC : MSA_ELM_COPY_B_FMT<0b0011, 0b011001>; class COPY_U_H_ENC : MSA_ELM_COPY_H_FMT<0b0011, 0b011001>; class COPY_U_W_ENC : MSA_ELM_COPY_W_FMT<0b0011, 0b011001>; -class COPY_U_D_ENC : MSA_ELM_COPY_D_FMT<0b0011, 0b011001>; class CTCMSA_ENC : MSA_ELM_CTCMSA_FMT<0b0000111110, 0b011001>; @@ -1195,47 +1173,14 @@ class MSA_BIT_D_DESC_BASE<string instr_asm, SDPatternOperator OpNode, InstrItinClass Itinerary = itin; } -// This class is deprecated and will be removed soon. -class MSA_BIT_B_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, - InstrItinClass itin = NoItinerary> { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm3:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt3:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. -class MSA_BIT_H_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, - InstrItinClass itin = NoItinerary> { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm4:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. -class MSA_BIT_W_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, - InstrItinClass itin = NoItinerary> { - dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm5:$m); - string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt5:$m))]; - InstrItinClass Itinerary = itin; -} - -// This class is deprecated and will be removed soon. 
-class MSA_BIT_D_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, - InstrItinClass itin = NoItinerary> { +class MSA_BIT_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode, + Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD, + RegisterOperand ROWS = ROWD, + InstrItinClass itin = NoItinerary> { dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWS:$ws, uimm6:$m); + dag InOperandList = (ins ROWS:$ws, ImmOp:$m); string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m"); - list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt6:$m))]; + list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))]; InstrItinClass Itinerary = itin; } @@ -1291,13 +1236,14 @@ class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode, } class MSA_ELM_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, + RegisterOperand ROWD, RegisterOperand ROWS, + Operand ImmOp, ImmLeaf Imm, InstrItinClass itin = NoItinerary> { dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, uimm4:$n); + dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ImmOp:$n); string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]"); list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws, - immZExt4:$n))]; + Imm:$n))]; string Constraints = "$wd = $wd_in"; InstrItinClass Itinerary = itin; } @@ -1479,7 +1425,7 @@ class MSA_CBRANCH_DESC_BASE<string instr_asm, RegisterOperand ROWD> { dag InOperandList = (ins ROWD:$wt, brtarget:$offset); string AsmString = !strconcat(instr_asm, "\t$wt, $offset"); list<dag> Pattern = []; - InstrItinClass Itinerary = IIBranch; + InstrItinClass Itinerary = NoItinerary; bit isBranch = 1; bit isTerminator = 1; bit hasDelaySlot = 1; @@ -1519,13 +1465,14 @@ class MSA_INSERT_VIDX_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty, } class MSA_INSVE_DESC_BASE<string instr_asm, SDPatternOperator OpNode, - RegisterOperand ROWD, RegisterOperand ROWS = ROWD, + Operand ImmOp, ImmLeaf Imm, RegisterOperand ROWD, + RegisterOperand ROWS = ROWD, InstrItinClass itin = NoItinerary> { dag OutOperandList = (outs ROWD:$wd); - dag InOperandList = (ins ROWD:$wd_in, uimm6:$n, ROWS:$ws, uimmz:$n2); + dag InOperandList = (ins ROWD:$wd_in, ImmOp:$n, ROWS:$ws, uimmz:$n2); string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[$n2]"); list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, - immZExt6:$n, + Imm:$n, ROWS:$ws, immz:$n2))]; InstrItinClass Itinerary = itin; @@ -1934,8 +1881,6 @@ class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16, GPR32Opnd, MSA128HOpnd>; class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32, GPR32Opnd, MSA128WOpnd>; -class COPY_U_D_DESC : MSA_COPY_DESC_BASE<"copy_u.d", vextract_zext_i64, v2i64, - GPR64Opnd, MSA128DOpnd>; class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32, FGR32, MSA128W>; @@ -2346,13 +2291,13 @@ class INSERT_FW_VIDX64_PSEUDO_DESC : class INSERT_FD_VIDX64_PSEUDO_DESC : MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2f64, MSA128DOpnd, FGR64Opnd, GPR64Opnd>; -class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, +class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8, uimm4, immZExt4, MSA128BOpnd>; -class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, +class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16, uimm3, immZExt3, MSA128HOpnd>; -class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, 
+class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", insve_v4i32, uimm2, immZExt2, MSA128WOpnd>; -class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, +class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", insve_v2i64, uimm1, immZExt1, MSA128DOpnd>; class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode, @@ -2381,7 +2326,7 @@ class LSA_DESC_BASE<string instr_asm, RegisterOperand RORD, RegisterOperand RORS = RORD, RegisterOperand RORT = RORD, InstrItinClass itin = NoItinerary > { dag OutOperandList = (outs RORD:$rd); - dag InOperandList = (ins RORS:$rs, RORT:$rt, LSAImm:$sa); + dag InOperandList = (ins RORS:$rs, RORT:$rt, uimm2_plus1:$sa); string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $sa"); list<dag> Pattern = [(set RORD:$rd, (add RORT:$rt, (shl RORS:$rs, @@ -2561,23 +2506,23 @@ class PCNT_H_DESC : MSA_2R_DESC_BASE<"pcnt.h", ctpop, MSA128HOpnd>; class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>; class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>; -class SAT_S_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, - MSA128BOpnd>; -class SAT_S_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, - MSA128HOpnd>; -class SAT_S_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, - MSA128WOpnd>; -class SAT_S_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, - MSA128DOpnd>; - -class SAT_U_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, - MSA128BOpnd>; -class SAT_U_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, - MSA128HOpnd>; -class SAT_U_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, - MSA128WOpnd>; -class SAT_U_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, - MSA128DOpnd>; +class SAT_S_B_DESC : MSA_BIT_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b, uimm3, + immZExt3, MSA128BOpnd>; +class SAT_S_H_DESC : MSA_BIT_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h, uimm4, + immZExt4, MSA128HOpnd>; +class SAT_S_W_DESC : MSA_BIT_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w, uimm5, + immZExt5, MSA128WOpnd>; +class SAT_S_D_DESC : MSA_BIT_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d, uimm6, + immZExt6, MSA128DOpnd>; + +class SAT_U_B_DESC : MSA_BIT_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b, uimm3, + immZExt3, MSA128BOpnd>; +class SAT_U_H_DESC : MSA_BIT_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h, uimm4, + immZExt4, MSA128HOpnd>; +class SAT_U_W_DESC : MSA_BIT_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w, uimm5, + immZExt5, MSA128WOpnd>; +class SAT_U_D_DESC : MSA_BIT_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d, uimm6, + immZExt6, MSA128DOpnd>; class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>; class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>; @@ -2589,13 +2534,17 @@ class SLD_W_DESC : MSA_3R_SLD_DESC_BASE<"sld.w", int_mips_sld_w, MSA128WOpnd>; class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>; class SLDI_B_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.b", int_mips_sldi_b, - MSA128BOpnd>; + MSA128BOpnd, MSA128BOpnd, uimm4, + immZExt4>; class SLDI_H_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.h", int_mips_sldi_h, - MSA128HOpnd>; + MSA128HOpnd, MSA128HOpnd, uimm3, + immZExt3>; class SLDI_W_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.w", int_mips_sldi_w, - MSA128WOpnd>; + MSA128WOpnd, MSA128WOpnd, uimm2, + immZExt2>; class SLDI_D_DESC : MSA_ELM_SLD_DESC_BASE<"sldi.d", int_mips_sldi_d, - MSA128DOpnd>; + MSA128DOpnd, MSA128DOpnd, uimm1, + immZExt1>; class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>; class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>; @@ -2648,14 
+2597,14 @@ class SRAR_H_DESC : MSA_3R_DESC_BASE<"srar.h", int_mips_srar_h, MSA128HOpnd>; class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>; class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>; -class SRARI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srari.b", int_mips_srari_b, - MSA128BOpnd>; -class SRARI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srari.h", int_mips_srari_h, - MSA128HOpnd>; -class SRARI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srari.w", int_mips_srari_w, - MSA128WOpnd>; -class SRARI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srari.d", int_mips_srari_d, - MSA128DOpnd>; +class SRARI_B_DESC : MSA_BIT_X_DESC_BASE<"srari.b", int_mips_srari_b, uimm3, + immZExt3, MSA128BOpnd>; +class SRARI_H_DESC : MSA_BIT_X_DESC_BASE<"srari.h", int_mips_srari_h, uimm4, + immZExt4, MSA128HOpnd>; +class SRARI_W_DESC : MSA_BIT_X_DESC_BASE<"srari.w", int_mips_srari_w, uimm5, + immZExt5, MSA128WOpnd>; +class SRARI_D_DESC : MSA_BIT_X_DESC_BASE<"srari.d", int_mips_srari_d, uimm6, + immZExt6, MSA128DOpnd>; class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>; class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>; @@ -2676,14 +2625,14 @@ class SRLR_H_DESC : MSA_3R_DESC_BASE<"srlr.h", int_mips_srlr_h, MSA128HOpnd>; class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>; class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>; -class SRLRI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srlri.b", int_mips_srlri_b, - MSA128BOpnd>; -class SRLRI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srlri.h", int_mips_srlri_h, - MSA128HOpnd>; -class SRLRI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srlri.w", int_mips_srlri_w, - MSA128WOpnd>; -class SRLRI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srlri.d", int_mips_srlri_d, - MSA128DOpnd>; +class SRLRI_B_DESC : MSA_BIT_X_DESC_BASE<"srlri.b", int_mips_srlri_b, uimm3, + immZExt3, MSA128BOpnd>; +class SRLRI_H_DESC : MSA_BIT_X_DESC_BASE<"srlri.h", int_mips_srlri_h, uimm4, + immZExt4, MSA128HOpnd>; +class SRLRI_W_DESC : MSA_BIT_X_DESC_BASE<"srlri.w", int_mips_srlri_w, uimm5, + immZExt5, MSA128WOpnd>; +class SRLRI_D_DESC : MSA_BIT_X_DESC_BASE<"srlri.d", int_mips_srlri_d, uimm6, + immZExt6, MSA128DOpnd>; class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode, ValueType TyNode, RegisterOperand ROWD, @@ -2991,12 +2940,11 @@ def CLTI_U_D : CLTI_U_D_ENC, CLTI_U_D_DESC; def COPY_S_B : COPY_S_B_ENC, COPY_S_B_DESC; def COPY_S_H : COPY_S_H_ENC, COPY_S_H_DESC; def COPY_S_W : COPY_S_W_ENC, COPY_S_W_DESC; -def COPY_S_D : COPY_S_D_ENC, COPY_S_D_DESC; +def COPY_S_D : COPY_S_D_ENC, COPY_S_D_DESC, ASE_MSA64; def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC; def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC; -def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC; -def COPY_U_D : COPY_U_D_ENC, COPY_U_D_DESC; +def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC, ASE_MSA64; def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC; def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC; @@ -3108,7 +3056,7 @@ def FFQR_D : FFQR_D_ENC, FFQR_D_DESC; def FILL_B : FILL_B_ENC, FILL_B_DESC; def FILL_H : FILL_H_ENC, FILL_H_DESC; def FILL_W : FILL_W_ENC, FILL_W_DESC; -def FILL_D : FILL_D_ENC, FILL_D_DESC; +def FILL_D : FILL_D_ENC, FILL_D_DESC, ASE_MSA64; def FILL_FW_PSEUDO : FILL_FW_PSEUDO_DESC; def FILL_FD_PSEUDO : FILL_FD_PSEUDO_DESC; @@ -3238,7 +3186,7 @@ def ILVR_D : ILVR_D_ENC, ILVR_D_DESC; def INSERT_B : INSERT_B_ENC, INSERT_B_DESC; def INSERT_H : INSERT_H_ENC, INSERT_H_DESC; def INSERT_W : INSERT_W_ENC, INSERT_W_DESC; -def INSERT_D : INSERT_D_ENC, INSERT_D_DESC; +def INSERT_D : INSERT_D_ENC, INSERT_D_DESC, ASE_MSA64; // 
INSERT_FW_PSEUDO defined after INSVE_W // INSERT_FD_PSEUDO defined after INSVE_D @@ -3280,7 +3228,7 @@ def LDI_W : LDI_W_ENC, LDI_W_DESC; def LDI_D : LDI_D_ENC, LDI_D_DESC; def LSA : LSA_ENC, LSA_DESC; -def DLSA : DLSA_ENC, DLSA_DESC; +def DLSA : DLSA_ENC, DLSA_DESC, ASE_MSA64; def MADD_Q_H : MADD_Q_H_ENC, MADD_Q_H_DESC; def MADD_Q_W : MADD_Q_W_ENC, MADD_Q_W_DESC; @@ -3787,6 +3735,28 @@ def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v2i64, def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8, MSA128B, NoItinerary>; +// Vector extraction with fixed index. +// +// Extracting 32-bit values on MSA32 should always use COPY_S_W rather than +// COPY_U_W, even for the zero-extended case. This is because our forward +// compatibility strategy is to consider registers to be infinitely +// sign-extended so that a MIPS64 can execute MIPS32 code without getting +// different register values. +def : MSAPat<(vextract_zext_i32 (v4i32 MSA128W:$ws), immZExt2Ptr:$idx), + (COPY_S_W MSA128W:$ws, immZExt2:$idx)>, ASE_MSA_NOT_MSA64; +def : MSAPat<(vextract_zext_i32 (v4f32 MSA128W:$ws), immZExt2Ptr:$idx), + (COPY_S_W MSA128W:$ws, immZExt2:$idx)>, ASE_MSA_NOT_MSA64; + +// Extracting 64-bit values on MSA64 should always use COPY_S_D rather than +// COPY_U_D, even for the zero-extended case. This is because our forward +// compatibility strategy is to consider registers to be infinitely +// sign-extended so that a hypothetical MIPS128 would be able to execute MIPS64 +// code without getting different register values. +def : MSAPat<(vextract_zext_i64 (v2i64 MSA128D:$ws), immZExt1Ptr:$idx), + (COPY_S_D MSA128D:$ws, immZExt1:$idx)>, ASE_MSA64; +def : MSAPat<(vextract_zext_i64 (v2f64 MSA128D:$ws), immZExt1Ptr:$idx), + (COPY_S_D MSA128D:$ws, immZExt1:$idx)>, ASE_MSA64; + // Vector extraction with variable index def : MSAPat<(i32 (vextract_sext_i8 v16i8:$ws, i32:$idx)), (SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws, diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp index 0d1ee04..c7d2738 100644 --- a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.cpp @@ -24,42 +24,6 @@ static cl::opt<bool> FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true), cl::desc("Always use $gp as the global base register.")); -// class MipsCallEntry. -MipsCallEntry::MipsCallEntry(StringRef N) { -#ifndef NDEBUG - Name = N; - Val = nullptr; -#endif -} - -MipsCallEntry::MipsCallEntry(const GlobalValue *V) { -#ifndef NDEBUG - Val = V; -#endif -} - -bool MipsCallEntry::isConstant(const MachineFrameInfo *) const { - return false; -} - -bool MipsCallEntry::isAliased(const MachineFrameInfo *) const { - return false; -} - -bool MipsCallEntry::mayAlias(const MachineFrameInfo *) const { - return false; -} - -void MipsCallEntry::printCustom(raw_ostream &O) const { - O << "MipsCallEntry: "; -#ifndef NDEBUG - if (Val) - O << Val->getName(); - else - O << Name; -#endif -} - MipsFunctionInfo::~MipsFunctionInfo() {} bool MipsFunctionInfo::globalBaseRegSet() const { @@ -111,27 +75,32 @@ void MipsFunctionInfo::createEhDataRegsFI() { } } +void MipsFunctionInfo::createISRRegFI() { + // ISRs require spill slots for Status & ErrorPC Coprocessor 0 registers. + // The current implementation only supports Mips32r2+ not Mips64rX. Status + // is always 32 bits, ErrorPC is 32 or 64 bits dependant on architecture, + // however Mips32r2+ is the supported architecture. 
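In other words, createISRRegFI reserves two word-sized spill slots for the CP0 EPC (register 14) and Status (register 12) values that the interrupt prologue parks there through $k1 and the epilogue later restores. A rough, self-contained model of that bookkeeping is sketched below; the ToyFrameInfo class and the printed slot numbering are invented for illustration and are not LLVM's MachineFrameInfo.

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Toy stand-in for a frame-info object: hands out stack-object indices with a
// given size/alignment. Only meant to illustrate the two ISR spill slots.
class ToyFrameInfo {
  std::vector<std::pair<uint64_t, unsigned>> Objects; // (size, align)
public:
  int createStackObject(uint64_t Size, unsigned Align) {
    Objects.push_back({Size, Align});
    return static_cast<int>(Objects.size()) - 1;
  }
};

int main() {
  ToyFrameInfo MFI;
  // The implementation in this patch is O32/MIPS32r2+ only, so both values
  // are spilled through 32-bit GPR-sized slots (4 bytes, 4-byte aligned).
  int ISRDataRegFI[2];
  for (int I = 0; I < 2; ++I)
    ISRDataRegFI[I] = MFI.createStackObject(/*Size=*/4, /*Align=*/4);
  std::cout << "EPC slot FI = " << ISRDataRegFI[0]
            << ", Status slot FI = " << ISRDataRegFI[1] << "\n";
  return 0;
}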
+ const TargetRegisterClass *RC = &Mips::GPR32RegClass; + + for (int I = 0; I < 2; ++I) + ISRDataRegFI[I] = MF.getFrameInfo()->CreateStackObject( + RC->getSize(), RC->getAlignment(), false); +} + bool MipsFunctionInfo::isEhDataRegFI(int FI) const { return CallsEhReturn && (FI == EhDataRegFI[0] || FI == EhDataRegFI[1] || FI == EhDataRegFI[2] || FI == EhDataRegFI[3]); } -MachinePointerInfo MipsFunctionInfo::callPtrInfo(StringRef Name) { - std::unique_ptr<const MipsCallEntry> &E = ExternalCallEntries[Name]; - - if (!E) - E = llvm::make_unique<MipsCallEntry>(Name); - - return MachinePointerInfo(E.get()); +bool MipsFunctionInfo::isISRRegFI(int FI) const { + return IsISR && (FI == ISRDataRegFI[0] || FI == ISRDataRegFI[1]); +} +MachinePointerInfo MipsFunctionInfo::callPtrInfo(const char *ES) { + return MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)); } -MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *Val) { - std::unique_ptr<const MipsCallEntry> &E = GlobalCallEntries[Val]; - - if (!E) - E = llvm::make_unique<MipsCallEntry>(Val); - - return MachinePointerInfo(E.get()); +MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *GV) { + return MachinePointerInfo(MF.getPSVManager().getGlobalValueCallEntry(GV)); } int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) { diff --git a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h index 32436ef..a2f6ee0 100644 --- a/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h +++ b/contrib/llvm/lib/Target/Mips/MipsMachineFunction.h @@ -15,12 +15,10 @@ #define LLVM_LIB_TARGET_MIPS_MIPSMACHINEFUNCTION_H #include "Mips16HardFloatInfo.h" -#include "llvm/ADT/StringMap.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/PseudoSourceValue.h" -#include "llvm/IR/GlobalValue.h" #include "llvm/IR/ValueMap.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" @@ -30,31 +28,13 @@ namespace llvm { -/// \brief A class derived from PseudoSourceValue that represents a GOT entry -/// resolved by lazy-binding. -class MipsCallEntry : public PseudoSourceValue { -public: - explicit MipsCallEntry(StringRef N); - explicit MipsCallEntry(const GlobalValue *V); - bool isConstant(const MachineFrameInfo *) const override; - bool isAliased(const MachineFrameInfo *) const override; - bool mayAlias(const MachineFrameInfo *) const override; - -private: - void printCustom(raw_ostream &O) const override; -#ifndef NDEBUG - std::string Name; - const GlobalValue *Val; -#endif -}; - /// MipsFunctionInfo - This class is derived from MachineFunction private /// Mips target-specific information for each MachineFunction. class MipsFunctionInfo : public MachineFunctionInfo { public: MipsFunctionInfo(MachineFunction &MF) : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0), - VarArgsFrameIndex(0), CallsEhReturn(false), SaveS2(false), + VarArgsFrameIndex(0), CallsEhReturn(false), IsISR(false), SaveS2(false), MoveF64ViaSpillFI(-1) {} ~MipsFunctionInfo(); @@ -86,13 +66,21 @@ public: int getEhDataRegFI(unsigned Reg) const { return EhDataRegFI[Reg]; } bool isEhDataRegFI(int FI) const; - /// \brief Create a MachinePointerInfo that has a MipsCallEntr object - /// representing a GOT entry for an external function. 
- MachinePointerInfo callPtrInfo(StringRef Name); + /// Create a MachinePointerInfo that has an ExternalSymbolPseudoSourceValue + /// object representing a GOT entry for an external function. + MachinePointerInfo callPtrInfo(const char *ES); + + // Functions with the "interrupt" attribute require special prologues, + // epilogues and additional spill slots. + bool isISR() const { return IsISR; } + void setISR() { IsISR = true; } + void createISRRegFI(); + int getISRRegFI(unsigned Reg) const { return ISRDataRegFI[Reg]; } + bool isISRRegFI(int FI) const; - /// \brief Create a MachinePointerInfo that has a MipsCallEntr object + /// Create a MachinePointerInfo that has a GlobalValuePseudoSourceValue object /// representing a GOT entry for a global function. - MachinePointerInfo callPtrInfo(const GlobalValue *Val); + MachinePointerInfo callPtrInfo(const GlobalValue *GV); void setSaveS2() { SaveS2 = true; } bool hasSaveS2() const { return SaveS2; } @@ -136,17 +124,18 @@ private: /// Frame objects for spilling eh data registers. int EhDataRegFI[4]; + /// ISR - Whether the function is an Interrupt Service Routine. + bool IsISR; + + /// Frame objects for spilling C0_STATUS, C0_EPC + int ISRDataRegFI[2]; + // saveS2 bool SaveS2; /// FrameIndex for expanding BuildPairF64 nodes to spill and reload when the /// O32 FPXX ABI is enabled. -1 is used to denote invalid index. int MoveF64ViaSpillFI; - - /// MipsCallEntry maps. - StringMap<std::unique_ptr<const MipsCallEntry>> ExternalCallEntries; - ValueMap<const GlobalValue *, std::unique_ptr<const MipsCallEntry>> - GlobalCallEntries; }; } // end of namespace llvm diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp index f6647e6..28e5a42 100644 --- a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.cpp @@ -84,6 +84,16 @@ MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const MCPhysReg * MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const MipsSubtarget &Subtarget = MF->getSubtarget<MipsSubtarget>(); + const Function *F = MF->getFunction(); + if (F->hasFnAttribute("interrupt")) { + if (Subtarget.hasMips64()) + return Subtarget.hasMips64r6() ? CSR_Interrupt_64R6_SaveList + : CSR_Interrupt_64_SaveList; + else + return Subtarget.hasMips32r6() ? CSR_Interrupt_32R6_SaveList + : CSR_Interrupt_32_SaveList; + } + if (Subtarget.isSingleFloat()) return CSR_SingleFloatOnly_SaveList; @@ -284,6 +294,16 @@ getFrameRegister(const MachineFunction &MF) const { } bool MipsRegisterInfo::canRealignStack(const MachineFunction &MF) const { + // Avoid realigning functions that explicitly do not want to be realigned. + // Normally, we should report an error when a function should be dynamically + // realigned but also has the attribute no-realign-stack. Unfortunately, + // with this attribute, MachineFrameInfo clamps each new object's alignment + // to that of the stack's alignment as specified by the ABI. As a result, + // the information of whether we have objects with larger alignment + // requirement than the stack's alignment is already lost at this point. + if (!TargetRegisterInfo::canRealignStack(MF)) + return false; + const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>(); unsigned FP = Subtarget.isGP32bit() ? Mips::FP : Mips::FP_64; unsigned BP = Subtarget.isGP32bit() ? Mips::S7 : Mips::S7_64; @@ -306,42 +326,3 @@ bool MipsRegisterInfo::canRealignStack(const MachineFunction &MF) const { // sized objects. 
return MF.getRegInfo().canReserveReg(BP); } - -bool MipsRegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>(); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - - bool CanRealign = canRealignStack(MF); - - // Avoid realigning functions that explicitly do not want to be realigned. - // Normally, we should report an error when a function should be dynamically - // realigned but also has the attribute no-realign-stack. Unfortunately, - // with this attribute, MachineFrameInfo clamps each new object's alignment - // to that of the stack's alignment as specified by the ABI. As a result, - // the information of whether we have objects with larger alignment - // requirement than the stack's alignment is already lost at this point. - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) - return false; - - const Function *F = MF.getFunction(); - if (F->hasFnAttribute(Attribute::StackAlignment)) { -#ifdef DEBUG - if (!CanRealign) - DEBUG(dbgs() << "It's not possible to realign the stack of the function: " - << F->getName() << "\n"); -#endif - return CanRealign; - } - - unsigned StackAlignment = Subtarget.getFrameLowering()->getStackAlignment(); - if (MFI->getMaxAlignment() > StackAlignment) { -#ifdef DEBUG - if (!CanRealign) - DEBUG(dbgs() << "It's not possible to realign the stack of the function: " - << F->getName() << "\n"); -#endif - return CanRealign; - } - - return false; -} diff --git a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h index ee1f6bc..5de68a2 100644 --- a/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h +++ b/contrib/llvm/lib/Target/Mips/MipsRegisterInfo.h @@ -61,9 +61,7 @@ public: RegScavenger *RS = nullptr) const; // Stack realignment queries. - bool canRealignStack(const MachineFunction &MF) const; - - bool needsStackRealignment(const MachineFunction &MF) const override; + bool canRealignStack(const MachineFunction &MF) const override; /// Debug information queries. 
unsigned getFrameRegister(const MachineFunction &MF) const override; diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp index 096b3be..a4abd62 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp @@ -17,6 +17,7 @@ #include "MipsMachineFunction.h" #include "MipsSEInstrInfo.h" #include "MipsSubtarget.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -319,6 +320,15 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB, bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, bool FP64) const { + const MachineOperand &Op1 = I->getOperand(1); + const MachineOperand &Op2 = I->getOperand(2); + + if ((Op1.isReg() && Op1.isUndef()) || (Op2.isReg() && Op2.isUndef())) { + unsigned DstReg = I->getOperand(0).getReg(); + BuildMI(MBB, I, I->getDebugLoc(), TII.get(Mips::IMPLICIT_DEF), DstReg); + return true; + } + // For fpxx and when mfhc1 is not available, use: // spill + reload via ldc1 // @@ -335,8 +345,8 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, if ((Subtarget.isABI_FPXX() && !Subtarget.hasMTHC1()) || (FP64 && !Subtarget.useOddSPReg())) { unsigned DstReg = I->getOperand(0).getReg(); - unsigned SrcReg = I->getOperand(1).getReg(); - unsigned N = I->getOperand(2).getImm(); + unsigned SrcReg = Op1.getReg(); + unsigned N = Op2.getImm(); int64_t Offset = 4 * (Subtarget.isLittle() ? N : (1 - N)); // It should be impossible to have FGR64 on MIPS-II or MIPS32r1 (which are @@ -352,8 +362,7 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB, // We re-use the same spill slot each time so that the stack frame doesn't // grow too much in functions with a large number of moves. int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC); - TII.storeRegToStack(MBB, I, SrcReg, I->getOperand(1).isKill(), FI, RC, - &RegInfo, 0); + TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, &RegInfo, 0); TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, Offset); return true; } @@ -376,12 +385,12 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + DebugLoc dl; MipsABIInfo ABI = STI.getABI(); unsigned SP = ABI.GetStackPtr(); unsigned FP = ABI.GetFramePtr(); unsigned ZERO = ABI.GetNullPtr(); - unsigned ADDu = ABI.GetPtrAdduOp(); + unsigned MOVE = ABI.GetGPRMoveOp(); unsigned ADDiu = ABI.GetPtrAddiuOp(); unsigned AND = ABI.IsN64() ? Mips::AND64 : Mips::AND; @@ -407,6 +416,9 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); + if (MF.getFunction()->hasFnAttribute("interrupt")) + emitInterruptPrologueStub(MF, MBB); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); if (CSI.size()) { @@ -491,7 +503,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, // if framepointer enabled, set it to point to the stack pointer. if (hasFP(MF)) { // Insert instruction "move $fp, $sp" at this location. 
- BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO) + BuildMI(MBB, MBBI, dl, TII.get(MOVE), FP).addReg(SP).addReg(ZERO) .setMIFlag(MachineInstr::FrameSetup); // emit ".cfi_def_cfa_register $fp" @@ -514,7 +526,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, if (hasBP(MF)) { // move $s7, $sp unsigned BP = STI.isABI_N64() ? Mips::S7_64 : Mips::S7; - BuildMI(MBB, MBBI, dl, TII.get(ADDu), BP) + BuildMI(MBB, MBBI, dl, TII.get(MOVE), BP) .addReg(SP) .addReg(ZERO); } @@ -522,6 +534,135 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF, } } +void MipsSEFrameLowering::emitInterruptPrologueStub( + MachineFunction &MF, MachineBasicBlock &MBB) const { + + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); + MachineBasicBlock::iterator MBBI = MBB.begin(); + DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Report an error the target doesn't support Mips32r2 or later. + // The epilogue relies on the use of the "ehb" to clear execution + // hazards. Pre R2 Mips relies on an implementation defined number + // of "ssnop"s to clear the execution hazard. Support for ssnop hazard + // clearing is not provided so reject that configuration. + if (!STI.hasMips32r2()) + report_fatal_error( + "\"interrupt\" attribute is not supported on pre-MIPS32R2 or " + "MIPS16 targets."); + + // The GP register contains the "user" value, so we cannot perform + // any gp relative loads until we restore the "kernel" or "system" gp + // value. Until support is written we shall only accept the static + // relocation model. + if ((STI.getRelocationModel() != Reloc::Static)) + report_fatal_error("\"interrupt\" attribute is only supported for the " + "static relocation model on MIPS at the present time."); + + if (!STI.isABI_O32() || STI.hasMips64()) + report_fatal_error("\"interrupt\" attribute is only supported for the " + "O32 ABI on MIPS32R2+ at the present time."); + + // Perform ISR handling like GCC + StringRef IntKind = + MF.getFunction()->getFnAttribute("interrupt").getValueAsString(); + const TargetRegisterClass *PtrRC = &Mips::GPR32RegClass; + + // EIC interrupt handling needs to read the Cause register to disable + // interrupts. + if (IntKind == "eic") { + // Coprocessor registers are always live per se. + MBB.addLiveIn(Mips::COP013); + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K0) + .addReg(Mips::COP013) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EXT), Mips::K0) + .addReg(Mips::K0) + .addImm(10) + .addImm(6) + .setMIFlag(MachineInstr::FrameSetup); + } + + // Fetch and spill EPC + MBB.addLiveIn(Mips::COP014); + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K1) + .addReg(Mips::COP014) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + + STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false, + MipsFI->getISRRegFI(0), PtrRC, + STI.getRegisterInfo(), 0); + + // Fetch and Spill Status + MBB.addLiveIn(Mips::COP012); + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MFC0), Mips::K1) + .addReg(Mips::COP012) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + + STI.getInstrInfo()->storeRegToStack(MBB, MBBI, Mips::K1, false, + MipsFI->getISRRegFI(1), PtrRC, + STI.getRegisterInfo(), 0); + + // Build the configuration for disabling lower priority interrupts. Non EIC + // interrupts need to be masked off with zero, EIC from the Cause register. 
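The Status update that follows is built entirely out of ins (insert-bit-field) operations: for non-EIC modes a zero field whose width depends on the interrupt kind is inserted at bit 8 to mask the equal-or-lower-priority IM bits, for EIC the 6-bit IPL copied out of Cause is inserted at bit 10, then KSU/ERL/EXL (bits 1..4) are cleared, and CU1 (bit 29) is cleared when not building soft-float. The self-contained sketch below replays that arithmetic on a plain uint32_t; the helper name insertField and the sample Status value are made up for illustration.

#include <cstdint>
#include <cstdio>
#include <string>

// Software model of the MIPS "ins rt, rs, pos, size" semantics:
// replace Value[Pos+Size-1 : Pos] with the low Size bits of Field.
static uint32_t insertField(uint32_t Value, uint32_t Field, unsigned Pos,
                            unsigned Size) {
  uint32_t Mask = ((Size == 32 ? 0u : (1u << Size)) - 1u) << Pos;
  return (Value & ~Mask) | ((Field << Pos) & Mask);
}

// Width of the zero field inserted at bit 8 for each non-EIC interrupt kind,
// mirroring the StringSwitch in emitInterruptPrologueStub.
static unsigned maskWidthFor(const std::string &IntKind) {
  if (IntKind == "sw0") return 1;
  if (IntKind == "sw1") return 2;
  if (IntKind == "hw0") return 3;
  if (IntKind == "hw1") return 4;
  if (IntKind == "hw2") return 5;
  if (IntKind == "hw3") return 6;
  if (IntKind == "hw4") return 7;
  if (IntKind == "hw5") return 8;
  return 0; // unknown; EIC is handled via the IPL field instead
}

int main() {
  uint32_t Status = 0x0000ff13;                             // invented example value
  Status = insertField(Status, 0, 8, maskWidthFor("hw0"));  // mask IM0..IM2
  Status = insertField(Status, 0, 1, 4);                    // clear KSU, ERL, EXL
  Status = insertField(Status, 0, 29, 1);                   // clear CU1 (hard-float only)
  std::printf("new Status = 0x%08x\n", (unsigned)Status);
  return 0;
}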
+ unsigned InsPosition = 8; + unsigned InsSize = 0; + unsigned SrcReg = Mips::ZERO; + + // If the interrupt we're tied to is the EIC, switch the source for the + // masking off interrupts to the cause register. + if (IntKind == "eic") { + SrcReg = Mips::K0; + InsPosition = 10; + InsSize = 6; + } else + InsSize = StringSwitch<unsigned>(IntKind) + .Case("sw0", 1) + .Case("sw1", 2) + .Case("hw0", 3) + .Case("hw1", 4) + .Case("hw2", 5) + .Case("hw3", 6) + .Case("hw4", 7) + .Case("hw5", 8) + .Default(0); + assert(InsSize != 0 && "Unknown interrupt type!"); + + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1) + .addReg(SrcReg) + .addImm(InsPosition) + .addImm(InsSize) + .addReg(Mips::K1) + .setMIFlag(MachineInstr::FrameSetup); + + // Mask off KSU, ERL, EXL + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1) + .addReg(Mips::ZERO) + .addImm(1) + .addImm(4) + .addReg(Mips::K1) + .setMIFlag(MachineInstr::FrameSetup); + + // Disable the FPU as we are not spilling those register sets. + if (!STI.useSoftFloat()) + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::INS), Mips::K1) + .addReg(Mips::ZERO) + .addImm(29) + .addImm(1) + .addReg(Mips::K1) + .setMIFlag(MachineInstr::FrameSetup); + + // Set the new status + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012) + .addReg(Mips::K1) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); +} + void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); @@ -533,12 +674,12 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, const MipsRegisterInfo &RegInfo = *static_cast<const MipsRegisterInfo *>(STI.getRegisterInfo()); - DebugLoc dl = MBBI->getDebugLoc(); + DebugLoc DL = MBBI->getDebugLoc(); MipsABIInfo ABI = STI.getABI(); unsigned SP = ABI.GetStackPtr(); unsigned FP = ABI.GetFramePtr(); unsigned ZERO = ABI.GetNullPtr(); - unsigned ADDu = ABI.GetPtrAdduOp(); + unsigned MOVE = ABI.GetGPRMoveOp(); // if framepointer enabled, restore the stack pointer. if (hasFP(MF)) { @@ -549,7 +690,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, --I; // Insert instruction "move $sp, $fp" at this location. - BuildMI(MBB, I, dl, TII.get(ADDu), SP).addReg(FP).addReg(ZERO); + BuildMI(MBB, I, DL, TII.get(MOVE), SP).addReg(FP).addReg(ZERO); } if (MipsFI->callsEhReturn()) { @@ -568,6 +709,9 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, } } + if (MF.getFunction()->hasFnAttribute("interrupt")) + emitInterruptEpilogueStub(MF, MBB); + // Get the number of bytes from FrameInfo uint64_t StackSize = MFI->getStackSize(); @@ -578,13 +722,59 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF, TII.adjustStackPtr(SP, StackSize, MBB, MBBI); } +void MipsSEFrameLowering::emitInterruptEpilogueStub( + MachineFunction &MF, MachineBasicBlock &MBB) const { + + MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); + MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>(); + DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Perform ISR handling like GCC + const TargetRegisterClass *PtrRC = &Mips::GPR32RegClass; + + // Disable Interrupts. 
+ BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::DI), Mips::ZERO); + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::EHB)); + + // Restore EPC + STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1, + MipsFI->getISRRegFI(0), PtrRC, + STI.getRegisterInfo()); + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP014) + .addReg(Mips::K1) + .addImm(0); + + // Restore Status + STI.getInstrInfo()->loadRegFromStackSlot(MBB, MBBI, Mips::K1, + MipsFI->getISRRegFI(1), PtrRC, + STI.getRegisterInfo()); + BuildMI(MBB, MBBI, DL, STI.getInstrInfo()->get(Mips::MTC0), Mips::COP012) + .addReg(Mips::K1) + .addImm(0); +} + +int MipsSEFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + MipsABIInfo ABI = STI.getABI(); + + if (MFI->isFixedObjectIndex(FI)) + FrameReg = hasFP(MF) ? ABI.GetFramePtr() : ABI.GetStackPtr(); + else + FrameReg = hasBP(MF) ? ABI.GetBasePtr() : ABI.GetStackPtr(); + + return MFI->getObjectOffset(FI) + MFI->getStackSize() - + getOffsetOfLocalArea() + MFI->getOffsetAdjustment(); +} + bool MipsSEFrameLowering:: spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); - MachineBasicBlock *EntryBlock = MF->begin(); + MachineBasicBlock *EntryBlock = &MF->front(); const TargetInstrInfo &TII = *STI.getInstrInfo(); for (unsigned i = 0, e = CSI.size(); i != e; ++i) { @@ -599,6 +789,26 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, if (!IsRAAndRetAddrIsTaken) EntryBlock->addLiveIn(Reg); + // ISRs require HI/LO to be spilled into kernel registers to be then + // spilled to the stack frame. + bool IsLOHI = (Reg == Mips::LO0 || Reg == Mips::LO0_64 || + Reg == Mips::HI0 || Reg == Mips::HI0_64); + const Function *Func = MBB.getParent()->getFunction(); + if (IsLOHI && Func->hasFnAttribute("interrupt")) { + DebugLoc DL = MI->getDebugLoc(); + + unsigned Op = 0; + if (!STI.getABI().ArePtrs64bit()) { + Op = (Reg == Mips::HI0) ? Mips::MFHI : Mips::MFLO; + Reg = Mips::K0; + } else { + Op = (Reg == Mips::HI0) ? Mips::MFHI64 : Mips::MFLO64; + Reg = Mips::K0_64; + } + BuildMI(MBB, MI, DL, TII.get(Op), Mips::K0) + .setMIFlag(MachineInstr::FrameSetup); + } + // Insert the spill to the stack frame. bool IsKill = !IsRAAndRetAddrIsTaken; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); @@ -622,7 +832,8 @@ MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { } /// Mark \p Reg and all registers aliasing it in the bitset. -void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs, unsigned Reg) { +static void setAliasRegs(MachineFunction &MF, BitVector &SavedRegs, + unsigned Reg) { const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) SavedRegs.set(*AI); @@ -648,6 +859,10 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF, if (MipsFI->callsEhReturn()) MipsFI->createEhDataRegsFI(); + // Create spill slots for Coprocessor 0 registers if function is an ISR. + if (MipsFI->isISR()) + MipsFI->createISRRegFI(); + // Expand pseudo instructions which load, store or copy accumulators. // Add an emergency spill slot if a pseudo was expanded. 
if (ExpandPseudo(MF).expand()) { diff --git a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h index 9cb32e6..63cd3ce 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h +++ b/contrib/llvm/lib/Target/Mips/MipsSEFrameLowering.h @@ -27,6 +27,9 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI, @@ -37,8 +40,13 @@ public: void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; unsigned ehDataReg(unsigned I) const; -}; +private: + void emitInterruptEpilogueStub(MachineFunction &MF, + MachineBasicBlock &MBB) const; + void emitInterruptPrologueStub(MachineFunction &MF, + MachineBasicBlock &MBB) const; +}; } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 2ebfbd1..6f001ea 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -136,7 +136,7 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) { MachineBasicBlock::iterator I = MBB.begin(); MachineRegisterInfo &RegInfo = MF.getRegInfo(); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; unsigned V0, V1, GlobalBaseReg = MipsFI->getGlobalBaseReg(); const TargetRegisterClass *RC; const MipsABIInfo &ABI = static_cast<const MipsTargetMachine &>(TM).getABI(); diff --git a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index b319fd0..efe22fb 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -1181,6 +1181,10 @@ bool MipsSETargetLowering::isEligibleForTailCallOptimization( if (!EnableMipsTailCalls) return false; + // Exception has to be cleared with eret. + if (FI.isISR()) + return false; + // Return false if either the callee or caller has a byval argument. if (CCInfo.getInRegsParamsCount() > 0 || FI.hasByvalArg()) return false; @@ -1786,9 +1790,11 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::UDIV, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); case Intrinsic::mips_fadd_w: - case Intrinsic::mips_fadd_d: + case Intrinsic::mips_fadd_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. return DAG.getNode(ISD::FADD, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); + } // Don't lower mips_fcaf_[wd] since LLVM folds SETFALSE condcodes away case Intrinsic::mips_fceq_w: case Intrinsic::mips_fceq_d: @@ -1831,9 +1837,11 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2), ISD::SETUNE); case Intrinsic::mips_fdiv_w: - case Intrinsic::mips_fdiv_d: + case Intrinsic::mips_fdiv_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. 
return DAG.getNode(ISD::FDIV, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); + } case Intrinsic::mips_ffint_u_w: case Intrinsic::mips_ffint_u_d: return DAG.getNode(ISD::UINT_TO_FP, DL, Op->getValueType(0), @@ -1856,6 +1864,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::mips_fexp2_w: case Intrinsic::mips_fexp2_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. EVT ResTy = Op->getValueType(0); return DAG.getNode( ISD::FMUL, SDLoc(Op), ResTy, Op->getOperand(1), @@ -1869,11 +1878,14 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(ISD::FMA, SDLoc(Op), Op->getValueType(0), Op->getOperand(1), Op->getOperand(2), Op->getOperand(3)); case Intrinsic::mips_fmul_w: - case Intrinsic::mips_fmul_d: + case Intrinsic::mips_fmul_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. return DAG.getNode(ISD::FMUL, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); + } case Intrinsic::mips_fmsub_w: case Intrinsic::mips_fmsub_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. EVT ResTy = Op->getValueType(0); return DAG.getNode(ISD::FSUB, SDLoc(Op), ResTy, Op->getOperand(1), DAG.getNode(ISD::FMUL, SDLoc(Op), ResTy, @@ -1886,9 +1898,11 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::mips_fsqrt_d: return DAG.getNode(ISD::FSQRT, DL, Op->getValueType(0), Op->getOperand(1)); case Intrinsic::mips_fsub_w: - case Intrinsic::mips_fsub_d: + case Intrinsic::mips_fsub_d: { + // TODO: If intrinsics have fast-math-flags, propagate them. return DAG.getNode(ISD::FSUB, DL, Op->getValueType(0), Op->getOperand(1), Op->getOperand(2)); + } case Intrinsic::mips_ftrunc_u_w: case Intrinsic::mips_ftrunc_u_d: return DAG.getNode(ISD::FP_TO_UINT, DL, Op->getValueType(0), diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp index 786307b..e6f7fe9 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp @@ -88,7 +88,7 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (isMicroMips) Opc = Mips::MOVE16_MM; else - Opc = Mips::ADDu, ZeroReg = Mips::ZERO; + Opc = Mips::OR, ZeroReg = Mips::ZERO; } else if (Mips::CCRRegClass.contains(SrcReg)) Opc = Mips::CFC1; else if (Mips::FGR32RegClass.contains(SrcReg)) @@ -141,7 +141,7 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB, Opc = Mips::FMOV_D64; else if (Mips::GPR64RegClass.contains(DestReg)) { // Copy to CPU64 Reg. 
if (Mips::GPR64RegClass.contains(SrcReg)) - Opc = Mips::DADDu, ZeroReg = Mips::ZERO_64; + Opc = Mips::OR64, ZeroReg = Mips::ZERO_64; else if (Mips::HI64RegClass.contains(SrcReg)) Opc = Mips::MFHI64, SrcReg = 0; else if (Mips::LO64RegClass.contains(SrcReg)) @@ -182,7 +182,6 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, int64_t Offset) const { DebugLoc DL; - if (I != MBB.end()) DL = I->getDebugLoc(); MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore); unsigned Opc = 0; @@ -213,6 +212,33 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Opc = Mips::ST_W; else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64)) Opc = Mips::ST_D; + else if (Mips::LO32RegClass.hasSubClassEq(RC)) + Opc = Mips::SW; + else if (Mips::LO64RegClass.hasSubClassEq(RC)) + Opc = Mips::SD; + else if (Mips::HI32RegClass.hasSubClassEq(RC)) + Opc = Mips::SW; + else if (Mips::HI64RegClass.hasSubClassEq(RC)) + Opc = Mips::SD; + + // Hi, Lo are normally caller save but they are callee save + // for interrupt handling. + const Function *Func = MBB.getParent()->getFunction(); + if (Func->hasFnAttribute("interrupt")) { + if (Mips::HI32RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Mips::MFHI), Mips::K0); + SrcReg = Mips::K0; + } else if (Mips::HI64RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Mips::MFHI64), Mips::K0_64); + SrcReg = Mips::K0_64; + } else if (Mips::LO32RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Mips::MFLO), Mips::K0); + SrcReg = Mips::K0; + } else if (Mips::LO64RegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(Mips::MFLO64), Mips::K0_64); + SrcReg = Mips::K0_64; + } + } assert(Opc && "Register class not handled!"); BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill)) @@ -228,6 +254,11 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad); unsigned Opc = 0; + const Function *Func = MBB.getParent()->getFunction(); + bool ReqIndirectLoad = Func->hasFnAttribute("interrupt") && + (DestReg == Mips::LO0 || DestReg == Mips::LO0_64 || + DestReg == Mips::HI0 || DestReg == Mips::HI0_64); + if (Mips::GPR32RegClass.hasSubClassEq(RC)) Opc = Mips::LW; else if (Mips::GPR64RegClass.hasSubClassEq(RC)) @@ -254,10 +285,44 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Opc = Mips::LD_W; else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64)) Opc = Mips::LD_D; + else if (Mips::HI32RegClass.hasSubClassEq(RC)) + Opc = Mips::LW; + else if (Mips::HI64RegClass.hasSubClassEq(RC)) + Opc = Mips::LD; + else if (Mips::LO32RegClass.hasSubClassEq(RC)) + Opc = Mips::LW; + else if (Mips::LO64RegClass.hasSubClassEq(RC)) + Opc = Mips::LD; assert(Opc && "Register class not handled!"); - BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(Offset) - .addMemOperand(MMO); + + if (!ReqIndirectLoad) + BuildMI(MBB, I, DL, get(Opc), DestReg) + .addFrameIndex(FI) + .addImm(Offset) + .addMemOperand(MMO); + else { + // Load HI/LO through K0. Notably the DestReg is encoded into the + // instruction itself. 
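// Illustrative sketch, not part of this patch: the HI/LO save/restore through
// $k0 in the surrounding code is only reached for functions carrying the MIPS
// "interrupt" attribute, which makes HI/LO callee-saved. A minimal handler
// that exercises it might look like this (names are hypothetical; attribute
// argument forms vary by toolchain):
__attribute__((interrupt)) void timer_isr(void) {
  extern volatile unsigned ticks, scale;
  ticks = ticks * scale; // the multiply clobbers HI/LO, so the prologue and
                         // epilogue preserve them via $k0 as emitted here
}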
+ unsigned Reg = Mips::K0; + unsigned LdOp = Mips::MTLO; + if (DestReg == Mips::HI0) + LdOp = Mips::MTHI; + + if (Subtarget.getABI().ArePtrs64bit()) { + Reg = Mips::K0_64; + if (DestReg == Mips::HI0_64) + LdOp = Mips::MTHI64; + else + LdOp = Mips::MTLO64; + } + + BuildMI(MBB, I, DL, get(Opc), Reg) + .addFrameIndex(FI) + .addImm(Offset) + .addMemOperand(MMO); + BuildMI(MBB, I, DL, get(LdOp)).addReg(Reg); + } } bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { @@ -271,6 +336,9 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case Mips::RetRA: expandRetRA(MBB, MI); break; + case Mips::ERet: + expandERet(MBB, MI); + break; case Mips::PseudoMFHI: Opc = isMicroMips ? Mips::MFHI16_MM : Mips::MFHI; expandPseudoMFHiLo(MBB, MI, Opc); @@ -360,7 +428,7 @@ void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { MipsABIInfo ABI = Subtarget.getABI(); - DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc(); + DebugLoc DL; unsigned ADDu = ABI.GetPtrAdduOp(); unsigned ADDiu = ABI.GetPtrAddiuOp(); @@ -438,6 +506,11 @@ void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB, BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn)).addReg(Mips::RA); } +void MipsSEInstrInfo::expandERet(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const { + BuildMI(MBB, I, I->getDebugLoc(), get(Mips::ERET)); +} + std::pair<bool, bool> MipsSEInstrInfo::compareOpndSize(unsigned Opc, const MachineFunction &MF) const { diff --git a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h index bebbabf..5d73545 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h +++ b/contrib/llvm/lib/Target/Mips/MipsSEInstrInfo.h @@ -82,6 +82,8 @@ private: void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; + void expandERet(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; + std::pair<bool, bool> compareOpndSize(unsigned Opc, const MachineFunction &MF) const; diff --git a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp index 132c3a1..b1e2885 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp @@ -126,17 +126,19 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II, } bool EhDataRegFI = MipsFI->isEhDataRegFI(FrameIndex); - + bool IsISRRegFI = MipsFI->isISRRegFI(FrameIndex); // The following stack frame objects are always referenced relative to $sp: // 1. Outgoing arguments. // 2. Pointer to dynamically allocated stack space. // 3. Locations for callee-saved registers. // 4. Locations for eh data registers. + // 5. Locations for ISR saved Coprocessor 0 registers 12 & 14. // Everything else is referenced relative to whatever register // getFrameRegister() returns. 
unsigned FrameReg; - if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI) + if ((FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI) || EhDataRegFI || + IsISRRegFI) FrameReg = ABI.GetStackPtr(); else if (RegInfo->needsStackRealignment(MF)) { if (MFI->hasVarSizedObjects() && !MFI->isFixedObjectIndex(FrameIndex)) diff --git a/contrib/llvm/lib/Target/Mips/MipsSchedule.td b/contrib/llvm/lib/Target/Mips/MipsSchedule.td index 54b5d28..37f9e49 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSchedule.td +++ b/contrib/llvm/lib/Target/Mips/MipsSchedule.td @@ -16,8 +16,8 @@ def IMULDIV : FuncUnit; //===----------------------------------------------------------------------===// // Instruction Itinerary classes used for Mips //===----------------------------------------------------------------------===// -def IIAlu : InstrItinClass; -def IIBranch : InstrItinClass; +// IIM16Alu is a placeholder class for most MIPS16 instructions. +def IIM16Alu : InstrItinClass; def IIPseudo : InstrItinClass; def II_ABS : InstrItinClass; @@ -28,7 +28,19 @@ def II_ADD_D : InstrItinClass; def II_ADD_S : InstrItinClass; def II_AND : InstrItinClass; def II_ANDI : InstrItinClass; +def II_B : InstrItinClass; def II_BADDU : InstrItinClass; +def II_BBIT : InstrItinClass; // bbit[01], bbit[01]32 +def II_BC : InstrItinClass; +def II_BC1F : InstrItinClass; +def II_BC1FL : InstrItinClass; +def II_BC1T : InstrItinClass; +def II_BC1TL : InstrItinClass; +def II_BCC : InstrItinClass; // beq and bne +def II_BCCZ : InstrItinClass; // b[gl][et]z +def II_BCCZAL : InstrItinClass; // bgezal and bltzal +def II_BCCZALS : InstrItinClass; // bgezals and bltzals +def II_BCCZC : InstrItinClass; // beqzc, bnezc def II_CEIL : InstrItinClass; def II_CFC1 : InstrItinClass; def II_CLO : InstrItinClass; @@ -68,21 +80,39 @@ def II_DSUB : InstrItinClass; def II_EXT : InstrItinClass; // Any EXT instruction def II_FLOOR : InstrItinClass; def II_INS : InstrItinClass; // Any INS instruction +def II_IndirectBranchPseudo : InstrItinClass; // Indirect branch pseudo. +def II_J : InstrItinClass; +def II_JAL : InstrItinClass; +def II_JALR : InstrItinClass; +def II_JALRC : InstrItinClass; +def II_JALRS : InstrItinClass; +def II_JALS : InstrItinClass; +def II_JR : InstrItinClass; +def II_JRADDIUSP : InstrItinClass; +def II_JRC : InstrItinClass; +def II_ReturnPseudo : InstrItinClass; // Return pseudo. 
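// Illustrative note, not from this diff (SomeCpuWrite* are placeholder
// SchedWriteRes names): splitting the old catch-all IIBranch into the specific
// II_* classes above lets a per-CPU scheduling model cost branch flavours
// separately, e.g.
//
//   def : ItinRW<[SomeCpuWriteBranch],   [II_BCC, II_BCCZ]>;
//   def : ItinRW<[SomeCpuWriteIndirect], [II_JR, II_JALR]>;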
def II_LB : InstrItinClass; +def II_LBE : InstrItinClass; def II_LBU : InstrItinClass; +def II_LBUE : InstrItinClass; def II_LD : InstrItinClass; def II_LDC1 : InstrItinClass; def II_LDL : InstrItinClass; def II_LDR : InstrItinClass; def II_LDXC1 : InstrItinClass; def II_LH : InstrItinClass; +def II_LHE : InstrItinClass; def II_LHU : InstrItinClass; +def II_LHUE : InstrItinClass; def II_LUI : InstrItinClass; def II_LUXC1 : InstrItinClass; def II_LW : InstrItinClass; +def II_LWE : InstrItinClass; def II_LWC1 : InstrItinClass; def II_LWL : InstrItinClass; +def II_LWLE : InstrItinClass; def II_LWR : InstrItinClass; +def II_LWRE : InstrItinClass; def II_LWU : InstrItinClass; def II_LWXC1 : InstrItinClass; def II_MADD : InstrItinClass; @@ -134,6 +164,7 @@ def II_ROTRV : InstrItinClass; def II_ROUND : InstrItinClass; def II_SAVE : InstrItinClass; def II_SB : InstrItinClass; +def II_SBE : InstrItinClass; def II_SD : InstrItinClass; def II_SDC1 : InstrItinClass; def II_SDL : InstrItinClass; @@ -144,6 +175,7 @@ def II_SEH : InstrItinClass; def II_SEQ_SNE : InstrItinClass; // seq and sne def II_SEQI_SNEI : InstrItinClass; // seqi and snei def II_SH : InstrItinClass; +def II_SHE : InstrItinClass; def II_SLL : InstrItinClass; def II_SLLV : InstrItinClass; def II_SLTI_SLTIU : InstrItinClass; // slti and sltiu @@ -159,11 +191,15 @@ def II_SUB_D : InstrItinClass; def II_SUB_S : InstrItinClass; def II_SUXC1 : InstrItinClass; def II_SW : InstrItinClass; +def II_SWE : InstrItinClass; def II_SWC1 : InstrItinClass; def II_SWL : InstrItinClass; +def II_SWLE : InstrItinClass; def II_SWR : InstrItinClass; +def II_SWRE : InstrItinClass; def II_SWXC1 : InstrItinClass; def II_TRUNC : InstrItinClass; +def II_WSBH : InstrItinClass; def II_XOR : InstrItinClass; def II_XORI : InstrItinClass; @@ -171,7 +207,7 @@ def II_XORI : InstrItinClass; // Mips Generic instruction itineraries. 
//===----------------------------------------------------------------------===// def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ - InstrItinData<IIAlu , [InstrStage<1, [ALU]>]>, + InstrItinData<IIM16Alu , [InstrStage<1, [ALU]>]>, InstrItinData<II_ADDI , [InstrStage<1, [ALU]>]>, InstrItinData<II_ADDIU , [InstrStage<1, [ALU]>]>, InstrItinData<II_ADDU , [InstrStage<1, [ALU]>]>, @@ -240,7 +276,29 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData<II_SAVE , [InstrStage<1, [ALU]>]>, InstrItinData<II_SEQ_SNE , [InstrStage<1, [ALU]>]>, InstrItinData<II_SEQI_SNEI , [InstrStage<1, [ALU]>]>, - InstrItinData<IIBranch , [InstrStage<1, [ALU]>]>, + InstrItinData<II_B , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BBIT , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC1F , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC1FL , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC1T , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BC1TL , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCCZ , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCCZAL , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCCZALS , [InstrStage<1, [ALU]>]>, + InstrItinData<II_BCCZC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_IndirectBranchPseudo, [InstrStage<1, [ALU]>]>, + InstrItinData<II_J , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JAL , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JALR , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JALRC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JALRS , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JALS , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JR , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JRADDIUSP , [InstrStage<1, [ALU]>]>, + InstrItinData<II_JRC , [InstrStage<1, [ALU]>]>, + InstrItinData<II_ReturnPseudo , [InstrStage<1, [ALU]>]>, InstrItinData<II_DMUL , [InstrStage<17, [IMULDIV]>]>, InstrItinData<II_DMULT , [InstrStage<17, [IMULDIV]>]>, InstrItinData<II_DMULTU , [InstrStage<17, [IMULDIV]>]>, @@ -313,3 +371,5 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData<II_MFHC1 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTHC1 , [InstrStage<2, [ALU]>]> ]>; + +include "MipsScheduleP5600.td" diff --git a/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td new file mode 100644 index 0000000..d32ae4f --- /dev/null +++ b/contrib/llvm/lib/Target/Mips/MipsScheduleP5600.td @@ -0,0 +1,392 @@ +//==- MipsScheduleP5600.td - P5600 Scheduling Definitions --*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +def MipsP5600Model : SchedMachineModel { + int IssueWidth = 2; // 2x dispatched per cycle + int MicroOpBufferSize = 48; // min(48, 48, 64) + int LoadLatency = 4; + int MispredictPenalty = 8; // TODO: Estimated + + let CompleteModel = 1; +} + +let SchedModel = MipsP5600Model in { + +// ALQ Pipelines +// ============= + +def P5600ALQ : ProcResource<1> { let BufferSize = 16; } +def P5600IssueALU : ProcResource<1> { let Super = P5600ALQ; } + +// ALU Pipeline +// ------------ + +def P5600WriteALU : SchedWriteRes<[P5600IssueALU]>; + +// and, lui, nor, or, slti, sltiu, sub, subu, xor +def : ItinRW<[P5600WriteALU], + [II_AND, II_LUI, II_NOR, II_OR, II_SLTI_SLTIU, II_SUBU, II_XOR]>; + +// AGQ Pipelines +// ============= + +def P5600AGQ : ProcResource<3> { let BufferSize = 16; } +def P5600IssueAL2 : ProcResource<1> { let Super = P5600AGQ; } +def P5600IssueCTISTD : ProcResource<1> { let Super = P5600AGQ; } +def P5600IssueLDST : ProcResource<1> { let Super = P5600AGQ; } + +def P5600AL2Div : ProcResource<1>; +// Pseudo-resource used to block CTISTD when handling multi-pipeline splits. +def P5600CTISTD : ProcResource<1>; + +// CTISTD Pipeline +// --------------- + +def P5600WriteJump : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]>; +def P5600WriteJumpAndLink : SchedWriteRes<[P5600IssueCTISTD, P5600CTISTD]> { + let Latency = 2; +} + +// b, beq, beql, bg[et]z, bl[et]z, bne, bnel, j, syscall, jal, bltzal, jalx, +// jalr, jr.hb, jr +def : ItinRW<[P5600WriteJump], [II_B, II_BCC, II_BCCZ, II_BCCZAL, II_J, II_JR]>; +def : ItinRW<[P5600WriteJumpAndLink], [II_JAL, II_JALR]>; + +// LDST Pipeline +// ------------- + +def P5600WriteLoad : SchedWriteRes<[P5600IssueLDST]> { + let Latency = 4; +} + +def P5600WriteLoadShifted : SchedWriteRes<[P5600IssueLDST, P5600CTISTD]> { + let Latency = 4; +} + +def P5600WritePref : SchedWriteRes<[P5600IssueLDST]>; + +def P5600WriteStore : SchedWriteRes<[P5600IssueLDST, P5600CTISTD]> { + // FIXME: This is a bit pessimistic. P5600CTISTD is only used during cycle 2 + // not during 0, 1, and 2. + let ResourceCycles = [ 1, 3 ]; +} + +def P5600WriteGPRFromBypass : SchedWriteRes<[P5600IssueLDST]> { + let Latency = 2; +} + +def P5600WriteStoreFromOtherUnits : SchedWriteRes<[P5600IssueLDST]>; +def P5600WriteLoadToOtherUnits : SchedWriteRes<[P5600IssueLDST]> { + let Latency = 0; +} + +// l[bhw], l[bh]u, ll +def : ItinRW<[P5600WriteLoad], [II_LB, II_LBU, II_LH, II_LHU, II_LW, II_LWU]>; + +// lw[lr] +def : ItinRW<[P5600WriteLoadShifted], [II_LWL, II_LWR]>; + +// s[bhw], sw[lr] +def : ItinRW<[P5600WriteStore], [II_SB, II_SH, II_SW, II_SWL, II_SWR]>; + +// pref +// (this instruction does not exist in the backend yet) +def : ItinRW<[P5600WritePref], []>; + +// sc +// (this instruction does not exist in the backend yet) +def : ItinRW<[P5600WriteStore], []>; + +// LDST is also used in moves from general purpose registers to floating point +// and MSA. 
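// Illustrative sketch, not part of this file: the model above only takes
// effect once a CPU definition in Mips.td points at it, roughly as follows
// (the exact feature list is assumed here):
//
//   def ImplP5600 : SubtargetFeature<"p5600", "ProcImpl",
//                                    "MipsSubtarget::CPU::P5600",
//                                    "The P5600 Processor", [FeatureMips32r5]>;
//   def : ProcessorModel<"p5600", MipsP5600Model, [ImplP5600]>;
//
// after which llc -march=mips -mcpu=p5600 schedules against these resources
// instead of the generic itineraries.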
+def P5600WriteMoveGPRToOtherUnits : SchedWriteRes<[P5600IssueLDST]> { + let Latency = 0; +} + +// AL2 Pipeline +// ------------ + +def P5600WriteAL2 : SchedWriteRes<[P5600IssueAL2]>; +def P5600WriteAL2BitExt : SchedWriteRes<[P5600IssueAL2]> { let Latency = 2; } +def P5600WriteAL2ShadowMov : SchedWriteRes<[P5600IssueAL2]> { let Latency = 2; } +def P5600WriteAL2CondMov : SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> { + let Latency = 2; +} +def P5600WriteAL2Div : SchedWriteRes<[P5600IssueAL2, P5600AL2Div]> { + // Estimated worst case + let Latency = 34; + let ResourceCycles = [1, 34]; +} +def P5600WriteAL2DivU : SchedWriteRes<[P5600IssueAL2, P5600AL2Div]> { + // Estimated worst case + let Latency = 34; + let ResourceCycles = [1, 34]; +} +def P5600WriteAL2Mul : SchedWriteRes<[P5600IssueAL2]> { let Latency = 3; } +def P5600WriteAL2Mult: SchedWriteRes<[P5600IssueAL2]> { let Latency = 5; } +def P5600WriteAL2MAdd: SchedWriteRes<[P5600IssueAL2, P5600CTISTD]> { + let Latency = 5; +} + +// clo, clz, di, mfhi, mflo +def : ItinRW<[P5600WriteAL2], [II_CLO, II_CLZ, II_MFHI_MFLO]>; + +// ehb, rdhwr, rdpgpr, wrpgpr, wsbh +def : ItinRW<[P5600WriteAL2ShadowMov], [II_RDHWR]>; + +// mov[nz] +def : ItinRW<[P5600WriteAL2CondMov], [II_MOVN, II_MOVZ]>; + +// divu? +def : ItinRW<[P5600WriteAL2Div], [II_DIV]>; +def : ItinRW<[P5600WriteAL2DivU], [II_DIVU]>; + +// mul +def : ItinRW<[P5600WriteAL2Mul], [II_MUL]>; +// multu?, multu? +def : ItinRW<[P5600WriteAL2Mult], [II_MULT, II_MULTU]>; +// maddu?, msubu?, mthi, mtlo +def : ItinRW<[P5600WriteAL2MAdd], + [II_MADD, II_MADDU, II_MSUB, II_MSUBU, II_MTHI_MTLO]>; + +// ext, ins +def : ItinRW<[P5600WriteAL2BitExt], + [II_EXT, II_INS]>; + +// Either ALU or AL2 Pipelines +// --------------------------- +// +// Some instructions can choose between ALU and AL2, but once dispatched to +// ALQ or AGQ respectively they are committed to that path. +// The decision is based on the outcome of the most recent selection when the +// choice was last available. For now, we assume ALU is always chosen. 
+ +def P5600WriteEitherALU : SchedWriteVariant< + // FIXME: Implement selection predicate + [SchedVar<SchedPredicate<[{1}]>, [P5600WriteALU]>, + SchedVar<SchedPredicate<[{0}]>, [P5600WriteAL2]> + ]>; + +// add, addi, addiu, addu, andi, ori, rotr, se[bh], sllv?, sr[al]v?, slt, sltu, +// xori +def : ItinRW<[P5600WriteEitherALU], + [II_ADDI, II_ADDIU, II_ANDI, II_ORI, II_ROTR, II_SEB, II_SEH, + II_SLT_SLTU, II_SLL, II_SRA, II_SRL, II_XORI, II_ADDU, II_SLLV, + II_SRAV, II_SRLV]>; + +// FPU Pipelines +// ============= + +def P5600FPQ : ProcResource<3> { let BufferSize = 16; } +def P5600IssueFPUS : ProcResource<1> { let Super = P5600FPQ; } +def P5600IssueFPUL : ProcResource<1> { let Super = P5600FPQ; } +def P5600IssueFPULoad : ProcResource<1> { let Super = P5600FPQ; } + +def P5600FPUDivSqrt : ProcResource<2>; + +def P5600WriteFPUS : SchedWriteRes<[P5600IssueFPUS]>; +def P5600WriteFPUL : SchedWriteRes<[P5600IssueFPUL]> { let Latency = 4; } +def P5600WriteFPUL_MADDSUB : SchedWriteRes<[P5600IssueFPUL]> { let Latency = 6; } +def P5600WriteFPUDivS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 23 / 27 + let Latency = 23; // Using common case + let ResourceCycles = [ 1, 23 ]; +} +def P5600WriteFPUDivD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 31 / 35 + let Latency = 31; // Using common case + let ResourceCycles = [ 1, 31 ]; +} +def P5600WriteFPURcpS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 19 / 23 + let Latency = 19; // Using common case + let ResourceCycles = [ 1, 19 ]; +} +def P5600WriteFPURcpD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 27 / 31 + let Latency = 27; // Using common case + let ResourceCycles = [ 1, 27 ]; +} +def P5600WriteFPURsqrtS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 27 / 27 + let Latency = 27; // Using common case + let ResourceCycles = [ 1, 27 ]; +} +def P5600WriteFPURsqrtD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 27 / 31 + let Latency = 27; // Using common case + let ResourceCycles = [ 1, 27 ]; +} +def P5600WriteFPUSqrtS : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 27 / 31 + let Latency = 27; // Using common case + let ResourceCycles = [ 1, 27 ]; +} +def P5600WriteFPUSqrtD : SchedWriteRes<[P5600IssueFPUL, P5600FPUDivSqrt]> { + // Best/Common/Worst case = 7 / 35 / 39 + let Latency = 35; // Using common case + let ResourceCycles = [ 1, 35 ]; +} +def P5600WriteMSAShortLogic : SchedWriteRes<[P5600IssueFPUS]>; +def P5600WriteMSAShortInt : SchedWriteRes<[P5600IssueFPUS]> { let Latency = 2; } +def P5600WriteMoveOtherUnitsToFPU : SchedWriteRes<[P5600IssueFPUS]>; + +// FPUS is also used in moves from floating point and MSA registers to general +// purpose registers. +def P5600WriteMoveFPUSToOtherUnits : SchedWriteRes<[P5600IssueFPUS]> { + let Latency = 0; +} + +// FPUL is also used in moves from floating point and MSA registers to general +// purpose registers. 
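// Illustrative note, not part of this file: ResourceCycles pairs up
// positionally with the resource list of its SchedWriteRes, so a write like
//
//   def P5600WriteFPUDivExample : SchedWriteRes<[P5600IssueFPUL,
//                                                P5600FPUDivSqrt]> {
//     let Latency = 23;             // result ready after 23 cycles
//     let ResourceCycles = [1, 23]; // issue port held 1 cycle, the
//                                   // non-pipelined divider held for 23
//   }
//
// models a unit that can start at most one divide every ~23 cycles while
// independent FPU work keeps flowing through the other resources.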
+def P5600WriteMoveFPULToOtherUnits : SchedWriteRes<[P5600IssueFPUL]>; + +// Short Pipe +// ---------- +// +// abs.[ds], abs.ps, bc1[tf]l?, mov[tf].[ds], mov[tf], mov.[ds], [cm][ft]c1, +// m[ft]hc1, neg.[ds], neg.ps, nor.v, nori.b, or.v, ori.b, xor.v, xori.b, +// sdxc1, sdc1, st.[bhwd], swc1, swxc1 +def : ItinRW<[P5600WriteFPUS], [II_ABS, II_MOVF_D, II_MOVF_S, II_MOVT_D, + II_MOVT_S, II_MOV_D, II_MOV_S, II_NEG]>; + +// adds_a.[bhwd], adds_[asu].[bhwd], addvi?.[bhwd], asub_[us].[bhwd], +// aver?_[us].[bhwd] +def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADD_A_[BHWD]$")>; +def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADDS_[ASU]_[BHWD]$")>; +// TODO: ADDVI_[BHW] might be 1 cycle latency rather than 2. Need to confirm it. +def : InstRW<[P5600WriteMSAShortInt], (instregex "^ADDVI?_[BHWD]$")>; +def : InstRW<[P5600WriteMSAShortInt], (instregex "^ASUB_[US].[BHWD]$")>; +def : InstRW<[P5600WriteMSAShortInt], (instregex "^AVER?_[US].[BHWD]$")>; + +// and.v, andi.b, move.v, ldi.[bhwd] +def : InstRW<[P5600WriteMSAShortLogic], (instregex "^MOVE_V$")>; +def : InstRW<[P5600WriteMSAShortLogic], (instregex "^LDI_[BHWD]$")>; +def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)_V$")>; +def : InstRW<[P5600WriteMSAShortLogic], (instregex "^(AND|OR|[XN]OR)I_B$")>; + +// Long Pipe +// ---------- +// +// add.[ds], add.ps, cvt.d.[sw], cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps, +// cvt.ps.[sw], c.<cc>.[ds], c.<cc>.ps, mul.[ds], mul.ps, sub.[ds], sub.ps, +// trunc.w.[ds], trunc.w.ps +def : ItinRW<[P5600WriteFPUL], + [II_ADD_D, II_ADD_S, II_CVT, II_C_CC_D, II_C_CC_S, II_MUL_D, + II_MUL_S, II_SUB_D, II_SUB_S, II_TRUNC]>; + +// div.[ds], div.ps +def : ItinRW<[P5600WriteFPUDivS], [II_DIV_S]>; +def : ItinRW<[P5600WriteFPUDivD], [II_DIV_D]>; + +// sqrt.[ds], sqrt.ps +def : ItinRW<[P5600WriteFPUSqrtS], [II_SQRT_S]>; +def : ItinRW<[P5600WriteFPUSqrtD], [II_SQRT_D]>; + +// madd.[ds], msub.[ds], nmadd.[ds], nmsub.[ds], +// Operand 0 is read on cycle 5. All other operands are read on operand 0. +def : ItinRW<[SchedReadAdvance<5>, P5600WriteFPUL_MADDSUB], + [II_MADD_D, II_MADD_S, II_MSUB_D, II_MSUB_S, II_NMADD_D, + II_NMADD_S, II_NMSUB_D, II_NMSUB_S]>; + +// madd.ps, msub.ps, nmadd.ps, nmsub.ps +// Operand 0 and 1 are read on cycle 5. All others are read on operand 0. +// (none of these instructions exist in the backend yet) + +// Load Pipe +// --------- +// +// This is typically used in conjunction with the load pipeline under the AGQ +// All the instructions are in the 'Tricky Instructions' section. + +def P5600WriteLoadOtherUnitsToFPU : SchedWriteRes<[P5600IssueFPULoad]> { + let Latency = 4; +} + +// Tricky Instructions +// =================== +// +// These instructions are split across multiple uops (in different pipelines) +// that must cooperate to complete the operation + +// FIXME: This isn't quite right since the implementation of WriteSequence +// current aggregates the resources and ignores the exact cycle they are +// used. +def P5600WriteMoveGPRToFPU : WriteSequence<[P5600WriteMoveGPRToOtherUnits, + P5600WriteMoveOtherUnitsToFPU]>; + +// FIXME: This isn't quite right since the implementation of WriteSequence +// current aggregates the resources and ignores the exact cycle they are +// used. +def P5600WriteMoveFPUToGPR : WriteSequence<[P5600WriteMoveFPUSToOtherUnits, + P5600WriteGPRFromBypass]>; + +// FIXME: This isn't quite right since the implementation of WriteSequence +// current aggregates the resources and ignores the exact cycle they are +// used. 
+def P5600WriteStoreFPUS : WriteSequence<[P5600WriteMoveFPUSToOtherUnits, + P5600WriteStoreFromOtherUnits]>; + +// FIXME: This isn't quite right since the implementation of WriteSequence +// current aggregates the resources and ignores the exact cycle they are +// used. +def P5600WriteStoreFPUL : WriteSequence<[P5600WriteMoveFPULToOtherUnits, + P5600WriteStoreFromOtherUnits]>; + +// FIXME: This isn't quite right since the implementation of WriteSequence +// current aggregates the resources and ignores the exact cycle they are +// used. +def P5600WriteLoadFPU : WriteSequence<[P5600WriteLoadToOtherUnits, + P5600WriteLoadOtherUnitsToFPU]>; + +// ctc1, mtc1, mthc1 +def : ItinRW<[P5600WriteMoveGPRToFPU], [II_CTC1, II_MTC1, II_MTHC1]>; + +// bc1[ft], cfc1, mfc1, mfhc1, movf, movt +def : ItinRW<[P5600WriteMoveFPUToGPR], + [II_BC1F, II_BC1T, II_CFC1, II_MFC1, II_MFHC1, II_MOVF, II_MOVT]>; + +// swc1, swxc1, st.[bhwd] +def : ItinRW<[P5600WriteStoreFPUS], [II_SWC1, II_SWXC1]>; +def : InstRW<[P5600WriteStoreFPUS], (instregex "^ST_[BHWD]$")>; + +// movn.[ds], movz.[ds] +def : ItinRW<[P5600WriteStoreFPUL], [II_MOVN_D, II_MOVN_S, II_MOVZ_D, II_MOVZ_S]>; + +// l[dw]x?c1, ld.[bhwd] +def : ItinRW<[P5600WriteLoadFPU], [II_LDC1, II_LDXC1, II_LWC1, II_LWXC1]>; +def : InstRW<[P5600WriteLoadFPU], (instregex "LD_[BHWD]")>; + +// Unsupported Instructions +// ======================== +// +// The following instruction classes are never valid on P5600. +// II_DADDIU, II_DADDU, II_DMFC1, II_DMTC1, II_DMULT, II_DMULTU, II_DROTR, +// II_DROTR32, II_DROTRV, II_DDIV, II_DSLL, II_DSLL32, II_DSLLV, II_DSRA, +// II_DSRA32, II_DSRAV, II_DSRL, II_DSRL32, II_DSRLV, II_DSUBU, II_DDIVU, +// II_JALRC, II_LD, II_LD[LR], II_LUXC1, II_RESTORE, II_SAVE, II_SD, II_SDC1, +// II_SDL, II_SDR, II_SDXC1 +// +// The following instructions are never valid on P5600. +// addq.ph, rdhwr, repl.ph, repl.qb, subq.ph, subu_s.qb +// +// Guesswork +// ========= +// +// This section is largely temporary guesswork. 
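// Illustrative note, not part of this file: a WriteSequence concatenates its
// member writes, so for example
//
//   def P5600WriteMoveFPUToGPR : WriteSequence<[P5600WriteMoveFPUSToOtherUnits,
//                                               P5600WriteGPRFromBypass]>;
//
// charges the FPUS issue slot and then the 2-cycle bypass through the LDST
// port, giving mfc1-style moves their combined latency. As the FIXMEs note,
// the resources are aggregated rather than assigned to exact cycles, so these
// sequences only approximate the real multi-pipeline split.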
+ +// ceil.[lw].[ds], floor.[lw].[ds] +// Reason behind guess: trunc.[lw].ds and the various cvt's are in FPUL +def : ItinRW<[P5600WriteFPUL], [II_CEIL, II_FLOOR, II_ROUND]>; + +// rotrv +// Reason behind guess: rotr is in the same category and the two register forms +// generally follow the immediate forms in this category +def : ItinRW<[P5600WriteEitherALU], [II_ROTRV]>; +} diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp index 471b6e1..8a18b51 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.cpp @@ -69,8 +69,9 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU, HasMips3_32(false), HasMips3_32r2(false), HasMips4_32(false), HasMips4_32r2(false), HasMips5_32r2(false), InMips16Mode(false), InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false), - HasDSPR2(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), - HasMSA(false), TM(TM), TargetTriple(TT), TSInfo(), + HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), + Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasEVA(false), TM(TM), + TargetTriple(TT), TSInfo(), InstrInfo( MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))), FrameLowering(MipsFrameLowering::create(*this)), diff --git a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h index 1db8881..fbb01fe 100644 --- a/contrib/llvm/lib/Target/Mips/MipsSubtarget.h +++ b/contrib/llvm/lib/Target/Mips/MipsSubtarget.h @@ -42,9 +42,15 @@ class MipsSubtarget : public MipsGenSubtargetInfo { Mips3, Mips4, Mips5, Mips64, Mips64r2, Mips64r3, Mips64r5, Mips64r6 }; + enum class CPU { P5600 }; + // Mips architecture version MipsArchEnum MipsArchVersion; + // Processor implementation (unused but required to exist by + // tablegen-erated code). + CPU ProcImpl; + // IsLittle - The target is Little Endian bool IsLittle; @@ -116,8 +122,8 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // InMicroMips -- can process MicroMips instructions bool InMicroMipsMode; - // HasDSP, HasDSPR2 -- supports DSP ASE. - bool HasDSP, HasDSPR2; + // HasDSP, HasDSPR2, HasDSPR3 -- supports DSP ASE. + bool HasDSP, HasDSPR2, HasDSPR3; // Allow mixed Mips16 and Mips32 in one source file bool AllowMixed16_32; @@ -130,6 +136,12 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // HasMSA -- supports MSA ASE. bool HasMSA; + // UseTCCInDIV -- Enables the use of trapping in the assembler. + bool UseTCCInDIV; + + // HasEVA -- supports EVA ASE. 
+ bool HasEVA; + InstrItineraryData InstrItins; // We can override the determination of whether we are in mips16 mode @@ -189,7 +201,7 @@ public: } bool hasMips32r5() const { return (MipsArchVersion >= Mips32r5 && MipsArchVersion < Mips32Max) || - hasMips64r2(); + hasMips64r5(); } bool hasMips32r6() const { return (MipsArchVersion >= Mips32r6 && MipsArchVersion < Mips32Max) || @@ -228,9 +240,12 @@ public: } bool inMicroMipsMode() const { return InMicroMipsMode; } bool inMicroMips32r6Mode() const { return InMicroMipsMode && hasMips32r6(); } + bool inMicroMips64r6Mode() const { return InMicroMipsMode && hasMips64r6(); } bool hasDSP() const { return HasDSP; } bool hasDSPR2() const { return HasDSPR2; } + bool hasDSPR3() const { return HasDSPR3; } bool hasMSA() const { return HasMSA; } + bool hasEVA() const { return HasEVA; } bool useSmallSection() const { return UseSmallSection; } bool hasStandardEncoding() const { return !inMips16Mode(); } diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp index 1c77745..3e63872 100644 --- a/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsTargetMachine.cpp @@ -233,7 +233,7 @@ void MipsPassConfig::addPreRegAlloc() { } TargetIRAnalysis MipsTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { if (Subtarget->allowMixed16_32()) { DEBUG(errs() << "No Target Transform Info Pass Added\n"); // FIXME: This is no longer necessary as the TTI returned is per-function. diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp index 0f2db60..146f33b 100644 --- a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp @@ -76,7 +76,7 @@ bool MipsTargetObjectFile:: IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM, SectionKind Kind) const { return (IsGlobalInSmallSectionImpl(GV, TM) && - (Kind.isDataRel() || Kind.isBSS() || Kind.isCommon())); + (Kind.isData() || Kind.isBSS() || Kind.isCommon())); } /// Return true if this global address should be placed into small data/bss @@ -107,7 +107,8 @@ IsGlobalInSmallSectionImpl(const GlobalValue *GV, return false; Type *Ty = GV->getType()->getElementType(); - return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty)); + return IsInSmallSection( + GV->getParent()->getDataLayout().getTypeAllocSize(Ty)); } MCSection * @@ -120,7 +121,7 @@ MipsTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, // Handle Small Section classification here. if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind)) return SmallBSSSection; - if (Kind.isDataRel() && IsGlobalInSmallSection(GV, TM, Kind)) + if (Kind.isData() && IsGlobalInSmallSection(GV, TM, Kind)) return SmallDataSection; // Otherwise, we work the same as ELF. @@ -128,21 +129,20 @@ MipsTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, } /// Return true if this constant should be placed into small data section. 
-bool MipsTargetObjectFile:: -IsConstantInSmallSection(const Constant *CN, const TargetMachine &TM) const { +bool MipsTargetObjectFile::IsConstantInSmallSection( + const DataLayout &DL, const Constant *CN, const TargetMachine &TM) const { return (static_cast<const MipsTargetMachine &>(TM) .getSubtargetImpl() ->useSmallSection() && - LocalSData && IsInSmallSection(TM.getDataLayout()->getTypeAllocSize( - CN->getType()))); + LocalSData && IsInSmallSection(DL.getTypeAllocSize(CN->getType()))); } -MCSection * -MipsTargetObjectFile::getSectionForConstant(SectionKind Kind, - const Constant *C) const { - if (IsConstantInSmallSection(C, *TM)) +/// Return true if this constant should be placed into small data section. +MCSection *MipsTargetObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { + if (IsConstantInSmallSection(DL, C, *TM)) return SmallDataSection; // Otherwise, we work the same as ELF. - return TargetLoweringObjectFileELF::getSectionForConstant(Kind, C); + return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C); } diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h index 725f2ff..ba04343 100644 --- a/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h +++ b/contrib/llvm/lib/Target/Mips/MipsTargetObjectFile.h @@ -36,10 +36,10 @@ class MipsTargetMachine; const TargetMachine &TM) const override; /// Return true if this constant should be placed into small data section. - bool IsConstantInSmallSection(const Constant *CN, + bool IsConstantInSmallSection(const DataLayout &DL, const Constant *CN, const TargetMachine &TM) const; - MCSection *getSectionForConstant(SectionKind Kind, + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, const Constant *C) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h index 6ce1be7..b3222f5 100644 --- a/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h +++ b/contrib/llvm/lib/Target/Mips/MipsTargetStreamer.h @@ -12,6 +12,7 @@ #include "MCTargetDesc/MipsABIFlagsSection.h" #include "MCTargetDesc/MipsABIInfo.h" +#include "llvm/ADT/Optional.h" #include "llvm/MC/MCELFStreamer.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" @@ -77,8 +78,12 @@ public: // PIC support virtual void emitDirectiveCpLoad(unsigned RegNo); + virtual void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts, + int Offset); virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg); + virtual void emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister); // FP abiflags directives virtual void emitDirectiveModuleFP(); @@ -97,18 +102,18 @@ public: // structure values. 
template <class PredicateLibrary> void updateABIInfo(const PredicateLibrary &P) { - ABI = &P.getABI(); + ABI = P.getABI(); ABIFlagsSection.setAllFromPredicates(P); } MipsABIFlagsSection &getABIFlagsSection() { return ABIFlagsSection; } const MipsABIInfo &getABI() const { - assert(ABI && "ABI hasn't been set!"); + assert(ABI.hasValue() && "ABI hasn't been set!"); return *ABI; } protected: - const MipsABIInfo *ABI; + llvm::Optional<MipsABIInfo> ABI; MipsABIFlagsSection ABIFlagsSection; bool GPRInfoSet; @@ -188,8 +193,12 @@ public: // PIC support void emitDirectiveCpLoad(unsigned RegNo) override; + void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts, + int Offset) override; void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; + void emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) override; // FP abiflags directives void emitDirectiveModuleFP() override; @@ -237,8 +246,12 @@ public: // PIC support void emitDirectiveCpLoad(unsigned RegNo) override; + void emitDirectiveCpRestore(SmallVector<MCInst, 3> &StoreInsts, + int Offset) override; void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset, const MCSymbol &Sym, bool IsReg) override; + void emitDirectiveCpreturn(unsigned SaveLocation, + bool SaveLocationIsRegister) override; void emitMipsAbiFlags(); }; diff --git a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h index 02c5a21..f0f223a 100644 --- a/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h +++ b/contrib/llvm/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h @@ -15,11 +15,9 @@ #define LLVM_LIB_TARGET_NVPTX_INSTPRINTER_NVPTXINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" namespace llvm { -class MCOperand; class MCSubtargetInfo; class NVPTXInstPrinter : public MCInstPrinter { diff --git a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h index b432e06..9ac3c88 100644 --- a/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h @@ -22,6 +22,7 @@ class Triple; class NVPTXMCAsmInfo : public MCAsmInfo { virtual void anchor(); + public: explicit NVPTXMCAsmInfo(const Triple &TheTriple); }; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTX.h b/contrib/llvm/lib/Target/NVPTX/NVPTX.h index fe28214..e5fae85 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTX.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTX.h @@ -41,24 +41,6 @@ enum CondCodes { }; } -inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) { - switch (CC) { - case NVPTXCC::NE: - return "ne"; - case NVPTXCC::EQ: - return "eq"; - case NVPTXCC::LT: - return "lt"; - case NVPTXCC::LE: - return "le"; - case NVPTXCC::GT: - return "gt"; - case NVPTXCC::GE: - return "ge"; - } - llvm_unreachable("Unknown condition code"); -} - FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel); ModulePass *createNVPTXAssignValidGlobalNamesPass(); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index ecb0f0a..e8c3608 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -355,7 +355,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) { if (isABI) { if (Ty->isFloatingPointTy() || Ty->isIntegerTy()) { unsigned 
size = 0; - if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty)) { + if (auto *ITy = dyn_cast<IntegerType>(Ty)) { size = ITy->getBitWidth(); if (size < 32) size = 32; @@ -635,9 +635,7 @@ static bool usedInGlobalVarDef(const Constant *C) { return false; if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) { - if (GV->getName() == "llvm.used") - return false; - return true; + return GV->getName() != "llvm.used"; } for (const User *U : C->users()) @@ -682,7 +680,7 @@ static bool usedInOneFunc(const User *U, Function const *&oneFunc) { static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) { if (!gv->hasInternalLinkage()) return false; - const PointerType *Pty = gv->getType(); + PointerType *Pty = gv->getType(); if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED) return false; @@ -720,7 +718,7 @@ static bool useFuncSeen(const Constant *C, void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { llvm::DenseMap<const Function *, bool> seenMap; for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { - const Function *F = FI; + const Function *F = &*FI; if (F->isDeclaration()) { if (F->use_empty()) @@ -870,9 +868,8 @@ void NVPTXAsmPrinter::emitGlobals(const Module &M) { DenseSet<const GlobalVariable *> GVVisiting; // Visit each global variable, in order - for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); - I != E; ++I) - VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting); + for (const GlobalVariable &I : M.globals()) + VisitGlobalVariableForEmission(&I, Globals, GVVisited, GVVisiting); assert(GVVisited.size() == M.getGlobalList().size() && "Missed a global variable"); @@ -1029,10 +1026,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, GVar->getName().startswith("nvvm.")) return; - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // GlobalVariables are always constant pointers themselves. - const PointerType *PTy = GVar->getType(); + PointerType *PTy = GVar->getType(); Type *ETy = PTy->getElementType(); if (GVar->hasExternalLinkage()) { @@ -1159,7 +1156,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, } if (GVar->getAlignment() == 0) - O << " .align " << (int) TD->getPrefTypeAlignment(ETy); + O << " .align " << (int)DL.getPrefTypeAlignment(ETy); else O << " .align " << GVar->getAlignment(); @@ -1185,9 +1182,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, printScalarConstant(Initializer, O); } } else { - // The frontend adds zero-initializer to variables that don't have an - // initial value, so skip warning for this case. - if (!GVar->getInitializer()->isNullValue()) { + // The frontend adds zero-initializer to device and constant variables + // that don't have an initial value, and UndefValue to shared + // variables, so skip warning for this case. + if (!GVar->getInitializer()->isNullValue() && + !isa<UndefValue>(GVar->getInitializer())) { report_fatal_error("initial value of '" + GVar->getName() + "' is not allowed in addrspace(" + Twine(PTy->getAddressSpace()) + ")"); @@ -1205,7 +1204,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, case Type::StructTyID: case Type::ArrayTyID: case Type::VectorTyID: - ElementSize = TD->getTypeStoreSize(ETy); + ElementSize = DL.getTypeStoreSize(ETy); // Ptx allows variable initilization only for constant and // global state spaces. 
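// Illustrative example, not part of this patch: the UndefValue initializers
// admitted by the check above come from source-level shared memory, e.g.
//
//   __shared__ float tile[128];
//
// which the front end lowers to an addrspace(3) global with an undef
// initializer; PTX only allows real initial values in the constant and global
// state spaces, so such variables must not trigger the fatal error.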
if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) || @@ -1299,7 +1298,7 @@ void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace, } std::string -NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const { +NVPTXAsmPrinter::getPTXFundamentalTypeStr(Type *Ty, bool useB4PTR) const { switch (Ty->getTypeID()) { default: llvm_unreachable("unexpected type"); @@ -1339,16 +1338,16 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const { void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // GlobalVariables are always constant pointers themselves. - const PointerType *PTy = GVar->getType(); + PointerType *PTy = GVar->getType(); Type *ETy = PTy->getElementType(); O << "."; emitPTXAddressSpace(PTy->getAddressSpace(), O); if (GVar->getAlignment() == 0) - O << " .align " << (int) TD->getPrefTypeAlignment(ETy); + O << " .align " << (int)DL.getPrefTypeAlignment(ETy); else O << " .align " << GVar->getAlignment(); @@ -1370,7 +1369,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, case Type::StructTyID: case Type::ArrayTyID: case Type::VectorTyID: - ElementSize = TD->getTypeStoreSize(ETy); + ElementSize = DL.getTypeStoreSize(ETy); O << " .b8 "; getSymbol(GVar)->print(O, MAI); O << "["; @@ -1385,32 +1384,32 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar, return; } -static unsigned int getOpenCLAlignment(const DataLayout *TD, Type *Ty) { +static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) { if (Ty->isSingleValueType()) - return TD->getPrefTypeAlignment(Ty); + return DL.getPrefTypeAlignment(Ty); - const ArrayType *ATy = dyn_cast<ArrayType>(Ty); + auto *ATy = dyn_cast<ArrayType>(Ty); if (ATy) - return getOpenCLAlignment(TD, ATy->getElementType()); + return getOpenCLAlignment(DL, ATy->getElementType()); - const StructType *STy = dyn_cast<StructType>(Ty); + auto *STy = dyn_cast<StructType>(Ty); if (STy) { unsigned int alignStruct = 1; // Go through each element of the struct and find the // largest alignment. 
for (unsigned i = 0, e = STy->getNumElements(); i != e; i++) { Type *ETy = STy->getElementType(i); - unsigned int align = getOpenCLAlignment(TD, ETy); + unsigned int align = getOpenCLAlignment(DL, ETy); if (align > alignStruct) alignStruct = align; } return alignStruct; } - const FunctionType *FTy = dyn_cast<FunctionType>(Ty); + auto *FTy = dyn_cast<FunctionType>(Ty); if (FTy) - return TD->getPointerPrefAlignment(); - return TD->getPrefTypeAlignment(Ty); + return DL.getPointerPrefAlignment(); + return DL.getPrefTypeAlignment(Ty); } void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I, @@ -1419,13 +1418,8 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I, O << "_param_" << paramIndex; } -void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) { - CurrentFnSym->print(O, MAI); - O << "_param_" << paramIndex; -} - void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const AttributeSet &PAL = F->getAttributes(); const TargetLowering *TLI = nvptxSubtarget->getTargetLowering(); Function::const_arg_iterator I, E; @@ -1433,7 +1427,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { bool first = true; bool isKernelFunc = llvm::isKernelFunction(*F); bool isABI = (nvptxSubtarget->getSmVersion() >= 20); - MVT thePointerTy = TLI->getPointerTy(*TD); + MVT thePointerTy = TLI->getPointerTy(DL); O << "(\n"; @@ -1485,9 +1479,9 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // size = typeallocsize of element type unsigned align = PAL.getParamAlignment(paramIndex + 1); if (align == 0) - align = TD->getABITypeAlignment(Ty); + align = DL.getABITypeAlignment(Ty); - unsigned sz = TD->getTypeAllocSize(Ty); + unsigned sz = DL.getTypeAllocSize(Ty); O << "\t.param .align " << align << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; @@ -1495,7 +1489,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { continue; } // Just a scalar - const PointerType *PTy = dyn_cast<PointerType>(Ty); + auto *PTy = dyn_cast<PointerType>(Ty); if (isKernelFunc) { if (PTy) { // Special handling for pointer arguments to kernel @@ -1519,7 +1513,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { O << ".ptr .global "; break; } - O << ".align " << (int) getOpenCLAlignment(TD, ETy) << " "; + O << ".align " << (int)getOpenCLAlignment(DL, ETy) << " "; } printParamName(I, paramIndex, O); continue; @@ -1556,7 +1550,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { } // param has byVal attribute. 
So should be a pointer - const PointerType *PTy = dyn_cast<PointerType>(Ty); + auto *PTy = dyn_cast<PointerType>(Ty); assert(PTy && "Param with byval attribute should be a pointer type"); Type *ETy = PTy->getElementType(); @@ -1566,9 +1560,9 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // size = typeallocsize of element type unsigned align = PAL.getParamAlignment(paramIndex + 1); if (align == 0) - align = TD->getABITypeAlignment(ETy); + align = DL.getABITypeAlignment(ETy); - unsigned sz = TD->getTypeAllocSize(ETy); + unsigned sz = DL.getTypeAllocSize(ETy); O << "\t.param .align " << align << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; @@ -1579,7 +1573,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // Further, if a part is vector, print the above for // each vector element. SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*TLI, getDataLayout(), ETy, vtparts); + ComputeValueVTs(*TLI, DL, ETy, vtparts); for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { unsigned elems = 1; EVT elemtype = vtparts[i]; @@ -1786,10 +1780,10 @@ static void ConvertDoubleToBytes(unsigned char *p, double val) { void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); if (isa<UndefValue>(CPV) || CPV->isNullValue()) { - int s = TD->getTypeAllocSize(CPV->getType()); + int s = DL.getTypeAllocSize(CPV->getType()); if (s < Bytes) s = Bytes; aggBuffer->addZeros(s); @@ -1800,7 +1794,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, switch (CPV->getType()->getTypeID()) { case Type::IntegerTyID: { - const Type *ETy = CPV->getType(); + Type *ETy = CPV->getType(); if (ETy == Type::getInt8Ty(CPV->getContext())) { unsigned char c = (unsigned char)cast<ConstantInt>(CPV)->getZExtValue(); ConvertIntToBytes<>(ptr, c); @@ -1817,7 +1811,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, break; } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { if (const ConstantInt *constInt = dyn_cast<ConstantInt>( - ConstantFoldConstantExpression(Cexpr, *TD))) { + ConstantFoldConstantExpression(Cexpr, DL))) { int int32 = (int)(constInt->getZExtValue()); ConvertIntToBytes<>(ptr, int32); aggBuffer->addBytes(ptr, 4, Bytes); @@ -1839,7 +1833,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, break; } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { if (const ConstantInt *constInt = dyn_cast<ConstantInt>( - ConstantFoldConstantExpression(Cexpr, *TD))) { + ConstantFoldConstantExpression(Cexpr, DL))) { long long int64 = (long long)(constInt->getZExtValue()); ConvertIntToBytes<>(ptr, int64); aggBuffer->addBytes(ptr, 8, Bytes); @@ -1860,7 +1854,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, case Type::FloatTyID: case Type::DoubleTyID: { const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV); - const Type *Ty = CFP->getType(); + Type *Ty = CFP->getType(); if (Ty == Type::getFloatTy(CPV->getContext())) { float float32 = (float) CFP->getValueAPF().convertToFloat(); ConvertFloatToBytes(ptr, float32); @@ -1881,7 +1875,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, const Value *v = Cexpr->stripPointerCasts(); aggBuffer->addSymbol(v, Cexpr); } - unsigned int s = TD->getTypeAllocSize(CPV->getType()); + unsigned int s = DL.getTypeAllocSize(CPV->getType()); aggBuffer->addZeros(s); break; } @@ -1891,7 
+1885,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, case Type::StructTyID: { if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) || isa<ConstantStruct>(CPV) || isa<ConstantDataSequential>(CPV)) { - int ElementSize = TD->getTypeAllocSize(CPV->getType()); + int ElementSize = DL.getTypeAllocSize(CPV->getType()); bufferAggregateConstant(CPV, aggBuffer); if (Bytes > ElementSize) aggBuffer->addZeros(Bytes - ElementSize); @@ -1909,7 +1903,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, AggBuffer *aggBuffer) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); int Bytes; // Old constants @@ -1934,12 +1928,12 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, StructType *ST = cast<StructType>(CPV->getType()); for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) { if (i == (e - 1)) - Bytes = TD->getStructLayout(ST)->getElementOffset(0) + - TD->getTypeAllocSize(ST) - - TD->getStructLayout(ST)->getElementOffset(i); + Bytes = DL.getStructLayout(ST)->getElementOffset(0) + + DL.getTypeAllocSize(ST) - + DL.getStructLayout(ST)->getElementOffset(i); else - Bytes = TD->getStructLayout(ST)->getElementOffset(i + 1) - - TD->getStructLayout(ST)->getElementOffset(i); + Bytes = DL.getStructLayout(ST)->getElementOffset(i + 1) - + DL.getStructLayout(ST)->getElementOffset(i); bufferLEByte(cast<Constant>(CPV->getOperand(i)), Bytes, aggBuffer); } } @@ -1951,18 +1945,6 @@ void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, // buildTypeNameMap - Run through symbol table looking for type names. // -bool NVPTXAsmPrinter::isImageType(const Type *Ty) { - - std::map<const Type *, std::string>::iterator PI = TypeNameMap.find(Ty); - - if (PI != TypeNameMap.end() && (!PI->second.compare("struct._image1d_t") || - !PI->second.compare("struct._image2d_t") || - !PI->second.compare("struct._image3d_t"))) - return true; - - return false; -} - bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) { switch (MI.getOpcode()) { @@ -2054,7 +2036,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) // If the code isn't optimized, there may be outstanding folding // opportunities. Attempt to fold the expression using DataLayout as a // last resort before giving up. - if (Constant *C = ConstantFoldConstantExpression(CE, *TM.getDataLayout())) + if (Constant *C = ConstantFoldConstantExpression(CE, getDataLayout())) if (C != CE) return lowerConstantForGV(C, ProcessingGeneric); @@ -2083,7 +2065,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) } case Instruction::GetElementPtr: { - const DataLayout &DL = *TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // Generate a symbolic expression for the byte address APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0); @@ -2109,7 +2091,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) return lowerConstantForGV(CE->getOperand(0), ProcessingGeneric); case Instruction::IntToPtr: { - const DataLayout &DL = *TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // Handle casts to pointers by changing them into casts to the appropriate // integer type. This promotes constant folding and simplifies this code. 
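// Illustrative helper, not part of this patch (the function name is
// hypothetical): the Bytes computation in bufferAggregateConstant above
// reduces to "distance to the next field's offset", with the last field also
// absorbing the struct's tail padding.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include <cstdint>

static uint64_t bytesForField(const llvm::DataLayout &DL, llvm::StructType *ST,
                              unsigned Idx) {
  const llvm::StructLayout *SL = DL.getStructLayout(ST);
  uint64_t Begin = SL->getElementOffset(Idx);
  uint64_t End = (Idx + 1 == ST->getNumElements())
                     ? DL.getTypeAllocSize(ST) // alloc size includes padding
                     : SL->getElementOffset(Idx + 1);
  return End - Begin;
}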
@@ -2120,7 +2102,7 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric) } case Instruction::PtrToInt: { - const DataLayout &DL = *TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); // Support only foldable casts to/from pointers that can be eliminated by // changing the pointer to the appropriately sized integer type. diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h index f6f7685..76bf179 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -212,28 +212,21 @@ private: MCOperand GetSymbolRef(const MCSymbol *Symbol); unsigned encodeVirtualRegister(unsigned Reg); - void EmitAlignment(unsigned NumBits, const GlobalValue *GV = nullptr) const {} - void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier, raw_ostream &O); void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O, const char *Modifier = nullptr); - void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const; void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, bool = false); - void printParamName(int paramIndex, raw_ostream &O); void printParamName(Function::const_arg_iterator I, int paramIndex, raw_ostream &O); void emitGlobals(const Module &M); void emitHeader(Module &M, raw_ostream &O, const NVPTXSubtarget &STI); void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const; void emitVirtualRegister(unsigned int vr, raw_ostream &); - void emitFunctionExternParamList(const MachineFunction &MF); void emitFunctionParamList(const Function *, raw_ostream &O); void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O); void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF); - void emitFunctionTempData(const MachineFunction &MF, unsigned &FrameSize); - bool isImageType(const Type *Ty); void printReturnValStr(const Function *, raw_ostream &O); void printReturnValStr(const MachineFunction &MF, raw_ostream &O); bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, @@ -271,7 +264,7 @@ private: // Build the map between type name and ID based on module's type // symbol table. - std::map<const Type *, std::string> TypeNameMap; + std::map<Type *, std::string> TypeNameMap; // List of variables demoted to a function scope. 
std::map<const Function *, std::vector<const GlobalVariable *> > localDecls; @@ -282,19 +275,15 @@ private: void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O); void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const; - std::string getPTXFundamentalTypeStr(const Type *Ty, bool = true) const; + std::string getPTXFundamentalTypeStr(Type *Ty, bool = true) const; void printScalarConstant(const Constant *CPV, raw_ostream &O); void printFPConstant(const ConstantFP *Fp, raw_ostream &O); void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer); void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer); - void printOperandProper(const MachineOperand &MO); - void emitLinkageDirective(const GlobalValue *V, raw_ostream &O); void emitDeclarations(const Module &, raw_ostream &O); void emitDeclaration(const Function *, raw_ostream &O); - - static const char *getRegisterName(unsigned RegNo); void emitDemotedVars(const Function *, raw_ostream &); bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo, diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp index 69a229e..95813c8 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp @@ -98,7 +98,7 @@ private: /// This reordering exposes to optimizeMemoryInstruction more /// optimization opportunities on loads and stores. /// - /// If this function succesfully hoists an eliminable addrspacecast or V is + /// If this function successfully hoists an eliminable addrspacecast or V is /// already such an addrspacecast, it returns the transformed value (which is /// guaranteed to be an addrspacecast); otherwise, it returns nullptr. 
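// Illustrative example, not from this patch: the overall effect of this pass
// on IR (typed-pointer syntax of this LLVM version) is to rewrite
//
//   %p = addrspacecast float addrspace(3)* %shared to float*
//   %a = getelementptr float, float* %p, i64 %i
//   %v = load float, float* %a
//
// into a load that stays in the specific address space,
//
//   %a2 = getelementptr float, float addrspace(3)* %shared, i64 %i
//   %v  = load float, float addrspace(3)* %a2
//
// so that instruction selection can emit ld.shared instead of a generic ld.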
Value *hoistAddrSpaceCastFrom(Value *V, int Depth = 0); @@ -267,14 +267,14 @@ bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) { return false; bool Changed = false; - for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) { + for (BasicBlock &B : F) { + for (Instruction &I : B) { if (isa<LoadInst>(I)) { // V = load P - Changed |= optimizeMemoryInstruction(I, 0); + Changed |= optimizeMemoryInstruction(&I, 0); } else if (isa<StoreInst>(I)) { // store V, P - Changed |= optimizeMemoryInstruction(I, 1); + Changed |= optimizeMemoryInstruction(&I, 1); } } } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp index 6fd09c4..62ca5e9 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -81,7 +81,7 @@ bool GenericToNVVM::runOnModule(Module &M) { for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E;) { - GlobalVariable *GV = I++; + GlobalVariable *GV = &*I++; if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC && !llvm::isTexture(*GV) && !llvm::isSurface(*GV) && !llvm::isSampler(*GV) && !GV->getName().startswith("llvm.")) { @@ -117,7 +117,7 @@ bool GenericToNVVM::runOnModule(Module &M) { Value *Operand = II->getOperand(i); if (isa<Constant>(Operand)) { II->setOperand( - i, remapConstant(&M, I, cast<Constant>(Operand), Builder)); + i, remapConstant(&M, &*I, cast<Constant>(Operand), Builder)); } } } @@ -132,10 +132,8 @@ bool GenericToNVVM::runOnModule(Module &M) { // Walk through the metadata section and update the debug information // associated with the global variables in the default address space. 
- for (Module::named_metadata_iterator I = M.named_metadata_begin(), - E = M.named_metadata_end(); - I != E; I++) { - remapNamedMDNode(VM, I); + for (NamedMDNode &I : M.named_metadata()) { + remapNamedMDNode(VM, &I); } // Walk through the global variable initializers, and replace any use of @@ -318,9 +316,8 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C, NewOperands[0], NewOperands[1]); case Instruction::FCmp: // CompareConstantExpr (fcmp) - assert(false && "Address space conversion should have no effect " - "on float point CompareConstantExpr (fcmp)!"); - return C; + llvm_unreachable("Address space conversion should have no effect " + "on float point CompareConstantExpr (fcmp)!"); case Instruction::ExtractElement: // ExtractElementConstantExpr return Builder.CreateExtractElement(NewOperands[0], NewOperands[1]); @@ -364,8 +361,7 @@ Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C, return Builder.CreateCast(Instruction::CastOps(C->getOpcode()), NewOperands[0], C->getType()); } - assert(false && "GenericToNVVM encountered an unsupported ConstantExpr"); - return C; + llvm_unreachable("GenericToNVVM encountered an unsupported ConstantExpr"); } } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 232a611..2d0098b 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -12,6 +12,8 @@ //===----------------------------------------------------------------------===// #include "NVPTXISelDAGToDAG.h" +#include "NVPTXUtilities.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/CommandLine.h" @@ -530,7 +532,7 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { if (!Src) return NVPTX::PTXLdStInstCode::GENERIC; - if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) { + if (auto *PT = dyn_cast<PointerType>(Src->getType())) { switch (PT->getAddressSpace()) { case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL; case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL; @@ -544,6 +546,39 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) { return NVPTX::PTXLdStInstCode::GENERIC; } +static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget, + unsigned CodeAddrSpace, MachineFunction *F) { + // To use non-coherent caching, the load has to be from global + // memory and we have to prove that the memory area is not written + // to anywhere for the duration of the kernel call, not even after + // the load. + // + // To ensure that there are no writes to the memory, we require the + // underlying pointer to be a noalias (__restrict) kernel parameter + // that is never used for a write. We can only do this for kernel + // functions since from within a device function, we cannot know if + // there were or will be writes to the memory from the caller - or we + // could, but then we would have to do inter-procedural analysis. + if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL || + !isKernelFunction(*F->getFunction())) { + return false; + } + + // We use GetUnderlyingObjects() here instead of + // GetUnderlyingObject() mainly because the former looks through phi + // nodes while the latter does not. We need to look through phi + // nodes to handle pointer induction variables. 
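[Editorial sketch] The canLowerToLDG hunk above (its body continues below) permits the non-coherent, read-only load path only when the access is a .global load in a kernel function and every underlying object of the address is a noalias (__restrict__) parameter that is never written. A minimal standalone model of that check, with a hypothetical Arg type standing in for llvm::Argument, might look like the following; the real pass obtains the objects from GetUnderlyingObjects, which also walks phi nodes.

#include <vector>

// Hypothetical stand-in for llvm::Argument: only the two attributes the check needs.
struct Arg {
  bool NoAlias;         // parameter carries the noalias (__restrict__) attribute
  bool OnlyReadsMemory; // parameter is never written through
};

// Mirrors the loop in the hunk: the load must target the global space inside a
// kernel, and every underlying object must be a noalias, read-only argument.
bool canUseNonCoherentLoad(bool IsGlobalSpace, bool IsKernel,
                           const std::vector<const Arg *> &UnderlyingObjects) {
  if (!IsGlobalSpace || !IsKernel)
    return false;
  for (const Arg *A : UnderlyingObjects)
    if (!A || !A->NoAlias || !A->OnlyReadsMemory)
      return false;
  return true;
}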
+ SmallVector<Value *, 8> Objs; + GetUnderlyingObjects(const_cast<Value *>(N->getMemOperand()->getValue()), + Objs, F->getDataLayout()); + for (Value *Obj : Objs) { + auto *A = dyn_cast<const Argument>(Obj); + if (!A || !A->onlyReadsMemory() || !A->hasNoAliasAttr()) return false; + } + + return true; +} + SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) { unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); switch (IID) { @@ -638,6 +673,10 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) { // Address Space Setting unsigned int codeAddrSpace = getCodeAddrSpace(LD); + if (canLowerToLDG(LD, *Subtarget, codeAddrSpace, MF)) { + return SelectLDGLDU(N); + } + // Volatile Setting // - .volatile is only availalble for .global and .shared bool isVolatile = LD->isVolatile(); @@ -872,6 +911,10 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) { // Address Space Setting unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD); + if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) { + return SelectLDGLDU(N); + } + // Volatile Setting // - .volatile is only availalble for .global and .shared bool IsVolatile = MemSD->isVolatile(); @@ -1425,6 +1468,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { @@ -1474,6 +1518,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } } break; + case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1522,6 +1567,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { break; } break; + case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1563,6 +1609,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { @@ -1612,6 +1659,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } } break; + case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1660,6 +1708,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { break; } break; + case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1707,6 +1756,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { @@ -1756,6 +1806,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } } break; + case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1804,6 +1855,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { break; } break; + case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1845,6 +1897,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { switch (N->getOpcode()) { default: return nullptr; + case ISD::LOAD: case ISD::INTRINSIC_W_CHAIN: if (IsLDG) { switch (EltVT.getSimpleVT().SimpleTy) { @@ -1894,6 +1947,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { } } break; + case NVPTXISD::LoadV2: case NVPTXISD::LDGV2: switch (EltVT.getSimpleVT().SimpleTy) { default: @@ -1942,6 +1996,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDU(SDNode *N) { break; } break; + case NVPTXISD::LoadV4: case NVPTXISD::LDGV4: switch (EltVT.getSimpleVT().SimpleTy) { default: 
@@ -5039,7 +5094,7 @@ bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N, } if (!Src) return false; - if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) + if (auto *PT = dyn_cast<PointerType>(Src->getType())) return (PT->getAddressSpace() == spN); return false; } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index b75cf40..7663696 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -124,6 +124,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // condition branches. setJumpIsExpensive(true); + // Wide divides are _very_ slow. Try to reduce the width of the divide if + // possible. + addBypassSlowDiv(64, 32); + // By default, use the Source scheduling if (sched4reg) setSchedulingPreference(Sched::RegPressure); @@ -275,6 +279,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SELECT); // Now deduce the information based on the above mentioned // actions @@ -910,7 +915,7 @@ std::string NVPTXTargetLowering::getPrototype( O << "("; if (retTy->isFloatingPointTy() || retTy->isIntegerTy()) { unsigned size = 0; - if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) { + if (auto *ITy = dyn_cast<IntegerType>(retTy)) { size = ITy->getBitWidth(); if (size < 32) size = 32; @@ -981,7 +986,7 @@ std::string NVPTXTargetLowering::getPrototype( O << "_"; continue; } - const PointerType *PTy = dyn_cast<PointerType>(Ty); + auto *PTy = dyn_cast<PointerType>(Ty); assert(PTy && "Param with byval attribute should be a pointer type"); Type *ETy = PTy->getElementType(); @@ -1318,7 +1323,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // struct or vector SmallVector<EVT, 16> vtparts; SmallVector<uint64_t, 16> Offsets; - const PointerType *PTy = dyn_cast<PointerType>(Args[i].Ty); + auto *PTy = dyn_cast<PointerType>(Args[i].Ty); assert(PTy && "Type of a byval parameter should be pointer"); ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(), vtparts, &Offsets, 0); @@ -2007,15 +2012,6 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { return Result; } -SDValue NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname, - int idx, EVT v) const { - std::string *name = nvTM->getManagedStrPool()->getManagedString(inname); - std::stringstream suffix; - suffix << idx; - *name += suffix.str(); - return DAG.getTargetExternalSymbol(name->c_str(), v); -} - SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { std::string ParamSym; @@ -2029,10 +2025,6 @@ NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const { return DAG.getTargetExternalSymbol(SavedStr->c_str(), v); } -SDValue NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) { - return getExtSymb(DAG, ".HLPPARAM", idx); -} - // Check to see if the kernel argument is image*_t or sampler_t bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { @@ -2040,8 +2032,8 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { "struct._image3d_t", "struct._sampler_t" }; - const Type *Ty = arg->getType(); - const PointerType *PTy = dyn_cast<PointerType>(Ty); + Type *Ty = arg->getType(); + auto *PTy = dyn_cast<PointerType>(Ty); if (!PTy) return 
false; @@ -2049,14 +2041,11 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) { if (!context) return false; - const StructType *STy = dyn_cast<StructType>(PTy->getElementType()); + auto *STy = dyn_cast<StructType>(PTy->getElementType()); const std::string TypeName = STy && !STy->isLiteral() ? STy->getName() : ""; - for (int i = 0, e = array_lengthof(specialTypes); i != e; ++i) - if (TypeName == specialTypes[i]) - return true; - - return false; + return std::find(std::begin(specialTypes), std::end(specialTypes), + TypeName) != std::end(specialTypes); } SDValue NVPTXTargetLowering::LowerFormalArguments( @@ -2082,10 +2071,9 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( std::vector<Type *> argTypes; std::vector<const Argument *> theArgs; - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) { - theArgs.push_back(I); - argTypes.push_back(I->getType()); + for (const Argument &I : F->args()) { + theArgs.push_back(&I); + argTypes.push_back(I.getType()); } // argTypes.size() (or theArgs.size()) and Ins.size() need not match. // Ins.size() will be larger @@ -2545,20 +2533,6 @@ void NVPTXTargetLowering::LowerAsmOperandForConstraint( TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } -// NVPTX suuport vector of legal types of any length in Intrinsics because the -// NVPTX specific type legalizer -// will legalize them to the PTX supported length. -bool NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const { - if (isTypeLegal(VT)) - return true; - if (VT.isVector()) { - MVT eVT = VT.getVectorElementType(); - if (isTypeLegal(eVT)) - return true; - } - return false; -} - static unsigned getOpcForTextureInstr(unsigned Intrinsic) { switch (Intrinsic) { default: @@ -3747,9 +3721,7 @@ bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL, // - [immAddr] if (AM.BaseGV) { - if (AM.BaseOffs || AM.HasBaseReg || AM.Scale) - return false; - return true; + return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale; } switch (AM.Scale) { @@ -3820,11 +3792,6 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } -/// getFunctionAlignment - Return the Log2 alignment of this function. -unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const { - return 4; -} - //===----------------------------------------------------------------------===// // NVPTX DAG Combining //===----------------------------------------------------------------------===// @@ -4057,6 +4024,67 @@ static SDValue PerformANDCombine(SDNode *N, return SDValue(); } +static SDValue PerformSELECTCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + // Currently this detects patterns for integer min and max and + // lowers them to PTX-specific intrinsics that enable hardware + // support. + + const SDValue Cond = N->getOperand(0); + if (Cond.getOpcode() != ISD::SETCC) return SDValue(); + + const SDValue LHS = Cond.getOperand(0); + const SDValue RHS = Cond.getOperand(1); + const SDValue True = N->getOperand(1); + const SDValue False = N->getOperand(2); + if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) + return SDValue(); + + const EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); + + const ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + SDValue Larger; // The larger of LHS and RHS when condition is true. 
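[Editorial sketch] PerformSELECTCombine, started above and continued in the condition-code switch below, recognizes select(setcc(a, b), a, b) patterns and classifies them as signed or unsigned min/max before emitting the matching nvvm intrinsic. The sketch below reproduces only the classification step with plain enums; the CondCode values and the classifySelect name are illustrative, not LLVM API.

enum CondCode { SETLT, SETLE, SETGT, SETGE, SETULT, SETULE, SETUGT, SETUGE };

struct MinMax {
  bool Valid;    // false if the condition is not an ordering comparison
  bool IsMax;    // true -> max pattern, false -> min pattern
  bool IsSigned; // signed vs. unsigned comparison
};

// TrueIsLHS: the select's "true" value is the setcc's left operand, i.e. the
// pattern is  setcc(LHS, RHS) ? LHS : RHS.
MinMax classifySelect(CondCode CC, bool TrueIsLHS) {
  bool LargerIsLHS; // which setcc operand is larger when the condition holds
  bool IsSigned;
  switch (CC) {
  case SETLT:  case SETLE:  LargerIsLHS = false; IsSigned = true;  break;
  case SETULT: case SETULE: LargerIsLHS = false; IsSigned = false; break;
  case SETGT:  case SETGE:  LargerIsLHS = true;  IsSigned = true;  break;
  case SETUGT: case SETUGE: LargerIsLHS = true;  IsSigned = false; break;
  default: return {false, false, false};
  }
  // The select yields a maximum exactly when the value it picks on "true" is the
  // operand known to be larger under the condition; e.g. (a < b) ? b : a is a max.
  bool IsMax = (TrueIsLHS == LargerIsLHS);
  return {true, IsMax, IsSigned};
}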
+ switch (CC) { + case ISD::SETULT: + case ISD::SETULE: + case ISD::SETLT: + case ISD::SETLE: + Larger = RHS; + break; + + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGT: + case ISD::SETUGE: + Larger = LHS; + break; + + default: + return SDValue(); + } + const bool IsMax = (Larger == True); + const bool IsSigned = ISD::isSignedIntSetCC(CC); + + unsigned IntrinsicId; + if (VT == MVT::i32) { + if (IsSigned) + IntrinsicId = IsMax ? Intrinsic::nvvm_max_i : Intrinsic::nvvm_min_i; + else + IntrinsicId = IsMax ? Intrinsic::nvvm_max_ui : Intrinsic::nvvm_min_ui; + } else { + assert(VT == MVT::i64); + if (IsSigned) + IntrinsicId = IsMax ? Intrinsic::nvvm_max_ll : Intrinsic::nvvm_min_ll; + else + IntrinsicId = IsMax ? Intrinsic::nvvm_max_ull : Intrinsic::nvvm_min_ull; + } + + SDLoc DL(N); + return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + DCI.DAG.getConstant(IntrinsicId, DL, VT), LHS, RHS); +} + enum OperandSignedness { Signed = 0, Unsigned, @@ -4113,25 +4141,16 @@ static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) { APInt Val = CI->getAPIntValue(); if (LHSSign == Unsigned) { - if (Val.isIntN(OptSize)) { - return true; - } - return false; + return Val.isIntN(OptSize); } else { - if (Val.isSignedIntN(OptSize)) { - return true; - } - return false; + return Val.isSignedIntN(OptSize); } } else { OperandSignedness RHSSign; if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign)) return false; - if (LHSSign != RHSSign) - return false; - - return true; + return LHSSign == RHSSign; } } @@ -4247,6 +4266,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, return PerformSHLCombine(N, DCI, OptLevel); case ISD::AND: return PerformANDCombine(N, DCI); + case ISD::SELECT: + return PerformSELECTCombine(N, DCI); } return SDValue(); } @@ -4509,25 +4530,25 @@ void NVPTXTargetLowering::ReplaceNodeResults( void NVPTXSection::anchor() {} NVPTXTargetObjectFile::~NVPTXTargetObjectFile() { - delete TextSection; - delete DataSection; - delete BSSSection; - delete ReadOnlySection; - - delete StaticCtorSection; - delete StaticDtorSection; - delete LSDASection; - delete EHFrameSection; - delete DwarfAbbrevSection; - delete DwarfInfoSection; - delete DwarfLineSection; - delete DwarfFrameSection; - delete DwarfPubTypesSection; - delete DwarfDebugInlineSection; - delete DwarfStrSection; - delete DwarfLocSection; - delete DwarfARangesSection; - delete DwarfRangesSection; + delete static_cast<NVPTXSection *>(TextSection); + delete static_cast<NVPTXSection *>(DataSection); + delete static_cast<NVPTXSection *>(BSSSection); + delete static_cast<NVPTXSection *>(ReadOnlySection); + + delete static_cast<NVPTXSection *>(StaticCtorSection); + delete static_cast<NVPTXSection *>(StaticDtorSection); + delete static_cast<NVPTXSection *>(LSDASection); + delete static_cast<NVPTXSection *>(EHFrameSection); + delete static_cast<NVPTXSection *>(DwarfAbbrevSection); + delete static_cast<NVPTXSection *>(DwarfInfoSection); + delete static_cast<NVPTXSection *>(DwarfLineSection); + delete static_cast<NVPTXSection *>(DwarfFrameSection); + delete static_cast<NVPTXSection *>(DwarfPubTypesSection); + delete static_cast<const NVPTXSection *>(DwarfDebugInlineSection); + delete static_cast<NVPTXSection *>(DwarfStrSection); + delete static_cast<NVPTXSection *>(DwarfLocSection); + delete static_cast<NVPTXSection *>(DwarfARangesSection); + delete static_cast<NVPTXSection *>(DwarfRangesSection); } MCSection * diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h 
b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index e5c3732..60914c1 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -441,13 +441,9 @@ public: SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset, - SelectionDAG &DAG) const; const char *getTargetNodeName(unsigned Opcode) const override; - bool isTypeSupportedInIntrinsic(MVT VT) const; - bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const override; @@ -459,8 +455,13 @@ public: bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; - /// getFunctionAlignment - Return the Log2 alignment of this function. - unsigned getFunctionAlignment(const Function *F) const; + bool isTruncateFree(Type *SrcTy, Type *DstTy) const override { + // Truncating 64-bit to 32-bit is free in SASS. + if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) + return false; + return SrcTy->getPrimitiveSizeInBits() == 64 && + DstTy->getPrimitiveSizeInBits() == 32; + } EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override { @@ -515,11 +516,7 @@ public: private: const NVPTXSubtarget &STI; // cache the subtarget here - - SDValue getExtSymb(SelectionDAG &DAG, const char *name, int idx, - EVT = MVT::i32) const; SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const; - SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx); SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 76d6597..9f3cf45 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -37,30 +37,31 @@ void NVPTXInstrInfo::copyPhysReg( const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); - if (DestRC != SrcRC) - report_fatal_error("Attempted to created cross-class register copy"); - - if (DestRC == &NVPTX::Int32RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Int1RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Float32RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Int16RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Int64RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Float64RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); - else { + if (DestRC->getSize() != SrcRC->getSize()) + report_fatal_error("Copy one register into another with a different width"); + + unsigned Op; + if (DestRC == &NVPTX::Int1RegsRegClass) { + Op = NVPTX::IMOV1rr; + } else if (DestRC == &NVPTX::Int16RegsRegClass) { + Op = NVPTX::IMOV16rr; + } else if (DestRC == &NVPTX::Int32RegsRegClass) { + Op = (SrcRC == &NVPTX::Int32RegsRegClass ? 
NVPTX::IMOV32rr + : NVPTX::BITCONVERT_32_F2I); + } else if (DestRC == &NVPTX::Int64RegsRegClass) { + Op = (SrcRC == &NVPTX::Int64RegsRegClass ? NVPTX::IMOV64rr + : NVPTX::BITCONVERT_64_F2I); + } else if (DestRC == &NVPTX::Float32RegsRegClass) { + Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr + : NVPTX::BITCONVERT_32_I2F); + } else if (DestRC == &NVPTX::Float64RegsRegClass) { + Op = (SrcRC == &NVPTX::Float64RegsRegClass ? NVPTX::FMOV64rr + : NVPTX::BITCONVERT_64_I2F); + } else { llvm_unreachable("Bad register copy"); } + BuildMI(MBB, I, DL, get(Op), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); } bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, @@ -86,27 +87,6 @@ bool NVPTXInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, return false; } -bool NVPTXInstrInfo::isReadSpecialReg(MachineInstr &MI) const { - switch (MI.getOpcode()) { - default: - return false; - case NVPTX::INT_PTX_SREG_NTID_X: - case NVPTX::INT_PTX_SREG_NTID_Y: - case NVPTX::INT_PTX_SREG_NTID_Z: - case NVPTX::INT_PTX_SREG_TID_X: - case NVPTX::INT_PTX_SREG_TID_Y: - case NVPTX::INT_PTX_SREG_TID_Z: - case NVPTX::INT_PTX_SREG_CTAID_X: - case NVPTX::INT_PTX_SREG_CTAID_Y: - case NVPTX::INT_PTX_SREG_CTAID_Z: - case NVPTX::INT_PTX_SREG_NCTAID_X: - case NVPTX::INT_PTX_SREG_NCTAID_Y: - case NVPTX::INT_PTX_SREG_NCTAID_Z: - case NVPTX::INT_PTX_SREG_WARPSIZE: - return true; - } -} - bool NVPTXInstrInfo::isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const { bool isLoad = false; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h index 179c068..3e40722 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXInstrInfo.h @@ -56,7 +56,6 @@ public: unsigned &DestReg) const; bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const; bool isStoreInstr(const MachineInstr &MI, unsigned &AddrSpace) const; - bool isReadSpecialReg(MachineInstr &MI) const; virtual bool CanTailMerge(const MachineInstr *MI) const; // Branch analysis. diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 0bf72fe..f770c2a 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -6,6 +6,8 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// +// \file // Lower aggregate copies, memset, memcpy, memmov intrinsics into loops when // the size is large or is not a compile-time constant. 
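[Editorial sketch] The NVPTXLowerAggrCopies header comment above says the pass expands aggregate copies and mem* intrinsics into explicit loops when the length is large or not a compile-time constant. The loop that convertMemCpyToLoop builds (shown further below) is byte-granular and do/while-shaped, so it assumes a non-zero length; as a plain C++ reference for what the emitted IR computes:

#include <cstddef>

// Byte-wise equivalent of the lowered memcpy loop: an index phi starting at 0,
// one load and one store per iteration, and an unsigned compare against the length.
void *lowered_memcpy(void *Dst, const void *Src, std::size_t CopyLen) {
  unsigned char *D = static_cast<unsigned char *>(Dst);
  const unsigned char *S = static_cast<const unsigned char *>(Src);
  for (std::size_t Index = 0; Index != CopyLen; ++Index)
    D[Index] = S[Index];
  return Dst;
}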
// @@ -18,19 +20,20 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #define DEBUG_TYPE "nvptx" using namespace llvm; namespace { + // actual analysis class, which is a functionpass struct NVPTXLowerAggrCopies : public FunctionPass { static char ID; @@ -50,179 +53,299 @@ struct NVPTXLowerAggrCopies : public FunctionPass { return "Lower aggregate copies/intrinsics into loops"; } }; -} // namespace char NVPTXLowerAggrCopies::ID = 0; -// Lower MemTransferInst or load-store pair to loop -static void convertTransferToLoop( - Instruction *splitAt, Value *srcAddr, Value *dstAddr, Value *len, - bool srcVolatile, bool dstVolatile, LLVMContext &Context, Function &F) { - Type *indType = len->getType(); +// Lower memcpy to loop. +void convertMemCpyToLoop(Instruction *ConvertedInst, Value *SrcAddr, + Value *DstAddr, Value *CopyLen, bool SrcIsVolatile, + bool DstIsVolatile, LLVMContext &Context, + Function &F) { + Type *TypeOfCopyLen = CopyLen->getType(); - BasicBlock *origBB = splitAt->getParent(); - BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split"); - BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB); + BasicBlock *OrigBB = ConvertedInst->getParent(); + BasicBlock *NewBB = + ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split"); + BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB); - origBB->getTerminator()->setSuccessor(0, loopBB); - IRBuilder<> builder(origBB, origBB->getTerminator()); + OrigBB->getTerminator()->setSuccessor(0, LoopBB); + IRBuilder<> Builder(OrigBB->getTerminator()); - // srcAddr and dstAddr are expected to be pointer types, + // SrcAddr and DstAddr are expected to be pointer types, // so no check is made here. - unsigned srcAS = cast<PointerType>(srcAddr->getType())->getAddressSpace(); - unsigned dstAS = cast<PointerType>(dstAddr->getType())->getAddressSpace(); + unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace(); + unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); // Cast pointers to (char *) - srcAddr = builder.CreateBitCast(srcAddr, Type::getInt8PtrTy(Context, srcAS)); - dstAddr = builder.CreateBitCast(dstAddr, Type::getInt8PtrTy(Context, dstAS)); + SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS)); + DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS)); - IRBuilder<> loop(loopBB); - // The loop index (ind) is a phi node. - PHINode *ind = loop.CreatePHI(indType, 0); - // Incoming value for ind is 0 - ind->addIncoming(ConstantInt::get(indType, 0), origBB); + IRBuilder<> LoopBuilder(LoopBB); + PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); + LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB); - // load from srcAddr+ind + // load from SrcAddr+LoopIndex // TODO: we can leverage the align parameter of llvm.memcpy for more efficient // word-sized loads and stores. 
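[Editorial sketch] The TODO just above notes that the alignment operand of llvm.memcpy could be used to emit word-sized loads and stores instead of the byte loop. One possible shape for such a widened copy, purely illustrative and not something this hunk implements, is sketched here; copy_word_then_tail is a made-up name and the word width is an assumption.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical widened copy: move 4-byte words while enough bytes remain, then
// finish the tail byte by byte. Assumes alignment has already been established.
void copy_word_then_tail(void *Dst, const void *Src, std::size_t Len) {
  unsigned char *D = static_cast<unsigned char *>(Dst);
  const unsigned char *S = static_cast<const unsigned char *>(Src);
  std::size_t I = 0;
  for (; I + sizeof(std::uint32_t) <= Len; I += sizeof(std::uint32_t)) {
    std::uint32_t W;
    std::memcpy(&W, S + I, sizeof(W)); // word load
    std::memcpy(D + I, &W, sizeof(W)); // word store
  }
  for (; I < Len; ++I) // remaining 0-3 bytes
    D[I] = S[I];
}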
- Value *val = loop.CreateLoad(loop.CreateGEP(loop.getInt8Ty(), srcAddr, ind), - srcVolatile); - // store at dstAddr+ind - loop.CreateStore(val, loop.CreateGEP(loop.getInt8Ty(), dstAddr, ind), - dstVolatile); - - // The value for ind coming from backedge is (ind + 1) - Value *newind = loop.CreateAdd(ind, ConstantInt::get(indType, 1)); - ind->addIncoming(newind, loopBB); - - loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB); + Value *Element = + LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP( + LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex), + SrcIsVolatile); + // store at DstAddr+LoopIndex + LoopBuilder.CreateStore(Element, + LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(), + DstAddr, LoopIndex), + DstIsVolatile); + + // The value for LoopIndex coming from backedge is (LoopIndex + 1) + Value *NewIndex = + LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1)); + LoopIndex->addIncoming(NewIndex, LoopBB); + + LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB, + NewBB); } -// Lower MemSetInst to loop -static void convertMemSetToLoop(Instruction *splitAt, Value *dstAddr, - Value *len, Value *val, LLVMContext &Context, - Function &F) { - BasicBlock *origBB = splitAt->getParent(); - BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split"); - BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB); +// Lower memmove to IR. memmove is required to correctly copy overlapping memory +// regions; therefore, it has to check the relative positions of the source and +// destination pointers and choose the copy direction accordingly. +// +// The code below is an IR rendition of this C function: +// +// void* memmove(void* dst, const void* src, size_t n) { +// unsigned char* d = dst; +// const unsigned char* s = src; +// if (s < d) { +// // copy backwards +// while (n--) { +// d[n] = s[n]; +// } +// } else { +// // copy forward +// for (size_t i = 0; i < n; ++i) { +// d[i] = s[i]; +// } +// } +// return dst; +// } +void convertMemMoveToLoop(Instruction *ConvertedInst, Value *SrcAddr, + Value *DstAddr, Value *CopyLen, bool SrcIsVolatile, + bool DstIsVolatile, LLVMContext &Context, + Function &F) { + Type *TypeOfCopyLen = CopyLen->getType(); + BasicBlock *OrigBB = ConvertedInst->getParent(); + + // Create the a comparison of src and dst, based on which we jump to either + // the forward-copy part of the function (if src >= dst) or the backwards-copy + // part (if src < dst). + // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else + // structure. Its block terminators (unconditional branches) are replaced by + // the appropriate conditional branches when the loop is built. 
+ ICmpInst *PtrCompare = new ICmpInst(ConvertedInst, ICmpInst::ICMP_ULT, + SrcAddr, DstAddr, "compare_src_dst"); + TerminatorInst *ThenTerm, *ElseTerm; + SplitBlockAndInsertIfThenElse(PtrCompare, ConvertedInst, &ThenTerm, + &ElseTerm); + + // Each part of the function consists of two blocks: + // copy_backwards: used to skip the loop when n == 0 + // copy_backwards_loop: the actual backwards loop BB + // copy_forward: used to skip the loop when n == 0 + // copy_forward_loop: the actual forward loop BB + BasicBlock *CopyBackwardsBB = ThenTerm->getParent(); + CopyBackwardsBB->setName("copy_backwards"); + BasicBlock *CopyForwardBB = ElseTerm->getParent(); + CopyForwardBB->setName("copy_forward"); + BasicBlock *ExitBB = ConvertedInst->getParent(); + ExitBB->setName("memmove_done"); + + // Initial comparison of n == 0 that lets us skip the loops altogether. Shared + // between both backwards and forward copy clauses. + ICmpInst *CompareN = + new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen, + ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0"); + + // Copying backwards. + BasicBlock *LoopBB = + BasicBlock::Create(Context, "copy_backwards_loop", &F, CopyForwardBB); + IRBuilder<> LoopBuilder(LoopBB); + PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); + Value *IndexPtr = LoopBuilder.CreateSub( + LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr"); + Value *Element = LoopBuilder.CreateLoad( + LoopBuilder.CreateInBoundsGEP(SrcAddr, IndexPtr), "element"); + LoopBuilder.CreateStore(Element, + LoopBuilder.CreateInBoundsGEP(DstAddr, IndexPtr)); + LoopBuilder.CreateCondBr( + LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)), + ExitBB, LoopBB); + LoopPhi->addIncoming(IndexPtr, LoopBB); + LoopPhi->addIncoming(CopyLen, CopyBackwardsBB); + BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm); + ThenTerm->eraseFromParent(); + + // Copying forward. + BasicBlock *FwdLoopBB = + BasicBlock::Create(Context, "copy_forward_loop", &F, ExitBB); + IRBuilder<> FwdLoopBuilder(FwdLoopBB); + PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr"); + Value *FwdElement = FwdLoopBuilder.CreateLoad( + FwdLoopBuilder.CreateInBoundsGEP(SrcAddr, FwdCopyPhi), "element"); + FwdLoopBuilder.CreateStore( + FwdElement, FwdLoopBuilder.CreateInBoundsGEP(DstAddr, FwdCopyPhi)); + Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd( + FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment"); + FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen), + ExitBB, FwdLoopBB); + FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB); + FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB); + + BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm); + ElseTerm->eraseFromParent(); +} - origBB->getTerminator()->setSuccessor(0, loopBB); - IRBuilder<> builder(origBB, origBB->getTerminator()); +// Lower memset to loop. 
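[Editorial sketch] convertMemSetToLoop, which follows below, emits the same kind of do/while-shaped loop for memset: the destination is bitcast to a pointer to the store value's type and one element is stored per iteration until the index reaches the length. A plain C++ reference for the byte case:

#include <cstddef>

// Element-wise equivalent of the lowered memset loop; as with the memcpy case,
// the emitted IR checks the index against the length only after the first store.
void *lowered_memset(void *Dst, unsigned char SetValue, std::size_t CopyLen) {
  unsigned char *D = static_cast<unsigned char *>(Dst);
  for (std::size_t Index = 0; Index != CopyLen; ++Index)
    D[Index] = SetValue;
  return Dst;
}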
+void convertMemSetToLoop(Instruction *ConvertedInst, Value *DstAddr, + Value *CopyLen, Value *SetValue, LLVMContext &Context, + Function &F) { + BasicBlock *OrigBB = ConvertedInst->getParent(); + BasicBlock *NewBB = + ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split"); + BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB); - unsigned dstAS = cast<PointerType>(dstAddr->getType())->getAddressSpace(); + OrigBB->getTerminator()->setSuccessor(0, LoopBB); + IRBuilder<> Builder(OrigBB->getTerminator()); // Cast pointer to the type of value getting stored - dstAddr = - builder.CreateBitCast(dstAddr, PointerType::get(val->getType(), dstAS)); + unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); + DstAddr = Builder.CreateBitCast(DstAddr, + PointerType::get(SetValue->getType(), dstAS)); - IRBuilder<> loop(loopBB); - PHINode *ind = loop.CreatePHI(len->getType(), 0); - ind->addIncoming(ConstantInt::get(len->getType(), 0), origBB); + IRBuilder<> LoopBuilder(LoopBB); + PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLen->getType(), 0); + LoopIndex->addIncoming(ConstantInt::get(CopyLen->getType(), 0), OrigBB); - loop.CreateStore(val, loop.CreateGEP(val->getType(), dstAddr, ind), false); + LoopBuilder.CreateStore( + SetValue, + LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex), + false); - Value *newind = loop.CreateAdd(ind, ConstantInt::get(len->getType(), 1)); - ind->addIncoming(newind, loopBB); + Value *NewIndex = + LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLen->getType(), 1)); + LoopIndex->addIncoming(NewIndex, LoopBB); - loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB); + LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB, + NewBB); } bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { - SmallVector<LoadInst *, 4> aggrLoads; - SmallVector<MemTransferInst *, 4> aggrMemcpys; - SmallVector<MemSetInst *, 4> aggrMemsets; + SmallVector<LoadInst *, 4> AggrLoads; + SmallVector<MemIntrinsic *, 4> MemCalls; const DataLayout &DL = F.getParent()->getDataLayout(); LLVMContext &Context = F.getParent()->getContext(); - // - // Collect all the aggrLoads, aggrMemcpys and addrMemsets. - // + // Collect all aggregate loads and mem* calls. 
for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ++II) { - if (LoadInst *load = dyn_cast<LoadInst>(II)) { - if (!load->hasOneUse()) + if (LoadInst *LI = dyn_cast<LoadInst>(II)) { + if (!LI->hasOneUse()) continue; - if (DL.getTypeStoreSize(load->getType()) < MaxAggrCopySize) + if (DL.getTypeStoreSize(LI->getType()) < MaxAggrCopySize) continue; - User *use = load->user_back(); - if (StoreInst *store = dyn_cast<StoreInst>(use)) { - if (store->getOperand(0) != load) + if (StoreInst *SI = dyn_cast<StoreInst>(LI->user_back())) { + if (SI->getOperand(0) != LI) continue; - aggrLoads.push_back(load); - } - } else if (MemTransferInst *intr = dyn_cast<MemTransferInst>(II)) { - Value *len = intr->getLength(); - // If the number of elements being copied is greater - // than MaxAggrCopySize, lower it to a loop - if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) { - if (len_int->getZExtValue() >= MaxAggrCopySize) { - aggrMemcpys.push_back(intr); - } - } else { - // turn variable length memcpy/memmov into loop - aggrMemcpys.push_back(intr); + AggrLoads.push_back(LI); } - } else if (MemSetInst *memsetintr = dyn_cast<MemSetInst>(II)) { - Value *len = memsetintr->getLength(); - if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) { - if (len_int->getZExtValue() >= MaxAggrCopySize) { - aggrMemsets.push_back(memsetintr); + } else if (MemIntrinsic *IntrCall = dyn_cast<MemIntrinsic>(II)) { + // Convert intrinsic calls with variable size or with constant size + // larger than the MaxAggrCopySize threshold. + if (ConstantInt *LenCI = dyn_cast<ConstantInt>(IntrCall->getLength())) { + if (LenCI->getZExtValue() >= MaxAggrCopySize) { + MemCalls.push_back(IntrCall); } } else { - // turn variable length memset into loop - aggrMemsets.push_back(memsetintr); + MemCalls.push_back(IntrCall); } } } } - if ((aggrLoads.size() == 0) && (aggrMemcpys.size() == 0) && - (aggrMemsets.size() == 0)) + + if (AggrLoads.size() == 0 && MemCalls.size() == 0) { return false; + } // // Do the transformation of an aggr load/copy/set to a loop // - for (LoadInst *load : aggrLoads) { - StoreInst *store = dyn_cast<StoreInst>(*load->user_begin()); - Value *srcAddr = load->getOperand(0); - Value *dstAddr = store->getOperand(1); - unsigned numLoads = DL.getTypeStoreSize(load->getType()); - Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads); - - convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(), - store->isVolatile(), Context, F); - - store->eraseFromParent(); - load->eraseFromParent(); + for (LoadInst *LI : AggrLoads) { + StoreInst *SI = dyn_cast<StoreInst>(*LI->user_begin()); + Value *SrcAddr = LI->getOperand(0); + Value *DstAddr = SI->getOperand(1); + unsigned NumLoads = DL.getTypeStoreSize(LI->getType()); + Value *CopyLen = ConstantInt::get(Type::getInt32Ty(Context), NumLoads); + + convertMemCpyToLoop(/* ConvertedInst */ SI, + /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr, + /* CopyLen */ CopyLen, + /* SrcIsVolatile */ LI->isVolatile(), + /* DstIsVolatile */ SI->isVolatile(), + /* Context */ Context, + /* Function F */ F); + + SI->eraseFromParent(); + LI->eraseFromParent(); } - for (MemTransferInst *cpy : aggrMemcpys) { - convertTransferToLoop(/* splitAt */ cpy, - /* srcAddr */ cpy->getSource(), - /* dstAddr */ cpy->getDest(), - /* len */ cpy->getLength(), - /* srcVolatile */ cpy->isVolatile(), - /* dstVolatile */ cpy->isVolatile(), + // Transform mem* intrinsic calls. 
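[Editorial sketch] As collected above, a mem* intrinsic is queued for lowering when its length operand is not a compile-time constant, or when the constant length is at least the MaxAggrCopySize threshold; the per-kind dispatch to the convert* helpers follows below. That filter reduces to a predicate like the one sketched here, where the threshold value and the shouldLowerToLoop name are illustrative assumptions.

#include <cstdint>
#include <optional>

// Illustrative threshold; the pass reads the real value from its command-line option.
constexpr std::uint64_t MaxAggrCopySize = 128;

// Length is std::nullopt when the intrinsic's length operand is not a ConstantInt.
bool shouldLowerToLoop(std::optional<std::uint64_t> Length) {
  if (!Length)
    return true;                     // variable length: always expand to a loop
  return *Length >= MaxAggrCopySize; // large constant length: expand as well
}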
+ for (MemIntrinsic *MemCall : MemCalls) { + if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) { + convertMemCpyToLoop(/* ConvertedInst */ Memcpy, + /* SrcAddr */ Memcpy->getRawSource(), + /* DstAddr */ Memcpy->getRawDest(), + /* CopyLen */ Memcpy->getLength(), + /* SrcIsVolatile */ Memcpy->isVolatile(), + /* DstIsVolatile */ Memcpy->isVolatile(), /* Context */ Context, /* Function F */ F); - cpy->eraseFromParent(); - } - - for (MemSetInst *memsetinst : aggrMemsets) { - Value *len = memsetinst->getLength(); - Value *val = memsetinst->getValue(); - convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context, - F); - memsetinst->eraseFromParent(); + } else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) { + convertMemMoveToLoop(/* ConvertedInst */ Memmove, + /* SrcAddr */ Memmove->getRawSource(), + /* DstAddr */ Memmove->getRawDest(), + /* CopyLen */ Memmove->getLength(), + /* SrcIsVolatile */ Memmove->isVolatile(), + /* DstIsVolatile */ Memmove->isVolatile(), + /* Context */ Context, + /* Function F */ F); + + } else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) { + convertMemSetToLoop(/* ConvertedInst */ Memset, + /* DstAddr */ Memset->getRawDest(), + /* CopyLen */ Memset->getLength(), + /* SetValue */ Memset->getValue(), + /* Context */ Context, + /* Function F */ F); + } + MemCall->eraseFromParent(); } return true; } +} // namespace + +namespace llvm { +void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); +} + +INITIALIZE_PASS(NVPTXLowerAggrCopies, "nvptx-lower-aggr-copies", + "Lower aggregate copies, and llvm.mem* intrinsics into loops", + false, false) + FunctionPass *llvm::createLowerAggrCopies() { return new NVPTXLowerAggrCopies(); } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp index 93d0025..624052e 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp @@ -81,7 +81,7 @@ bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) { // Check Load, Store, GEP, and BitCast Uses on alloca and make them // use the converted generic address, in order to expose non-generic // addrspacecast to NVPTXFavorNonGenericAddrSpace. For other types - // of instructions this is unecessary and may introduce redudant + // of instructions this is unnecessary and may introduce redundant // address cast. const auto &AllocaUse = *UI++; auto LI = dyn_cast<LoadInst>(AllocaUse.getUser()); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp index b533f31..6656077 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp @@ -47,6 +47,36 @@ // ... // } // +// 3. Convert pointers in a byval kernel parameter to pointers in the global +// address space. As #2, it allows NVPTX to emit more ld/st.global. E.g., +// +// struct S { +// int *x; +// int *y; +// }; +// __global__ void foo(S s) { +// int *b = s.y; +// // use b +// } +// +// "b" points to the global address space. 
In the IR level, +// +// define void @foo({i32*, i32*}* byval %input) { +// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1 +// %b = load i32*, i32** %b_ptr +// ; use %b +// } +// +// becomes +// +// define void @foo({i32*, i32*}* byval %input) { +// %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1 +// %b = load i32*, i32** %b_ptr +// %b_global = addrspacecast i32* %b to i32 addrspace(1)* +// %b_generic = addrspacecast i32 addrspace(1)* %b_global to i32* +// ; use %b_generic +// } +// // TODO: merge this pass with NVPTXFavorNonGenericAddrSpace so that other passes // don't cancel the addrspacecast pair this pass emits. //===----------------------------------------------------------------------===// @@ -54,6 +84,7 @@ #include "NVPTX.h" #include "NVPTXUtilities.h" #include "NVPTXTargetMachine.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -71,9 +102,12 @@ class NVPTXLowerKernelArgs : public FunctionPass { bool runOnFunction(Function &F) override; // handle byval parameters - void handleByValParam(Argument *); - // handle non-byval pointer parameters - void handlePointerParam(Argument *); + void handleByValParam(Argument *Arg); + // Knowing Ptr must point to the global address space, this function + // addrspacecasts Ptr to global and then back to generic. This allows + // NVPTXFavorNonGenericAddrSpace to fold the global-to-generic cast into + // loads/stores that appear later. + void markPointerAsGlobal(Value *Ptr); public: static char ID; // Pass identification, replacement for typeid @@ -104,7 +138,7 @@ INITIALIZE_PASS(NVPTXLowerKernelArgs, "nvptx-lower-kernel-args", // // The above code allocates some space in the stack and copies the incoming // struct from param space to local space. -// Then replace all occurences of %d by %temp. +// Then replace all occurrences of %d by %temp. // ============================================================================= void NVPTXLowerKernelArgs::handleByValParam(Argument *Arg) { Function *Func = Arg->getParent(); @@ -128,27 +162,33 @@ void NVPTXLowerKernelArgs::handleByValParam(Argument *Arg) { new StoreInst(LI, AllocA, FirstInst); } -void NVPTXLowerKernelArgs::handlePointerParam(Argument *Arg) { - assert(!Arg->hasByValAttr() && - "byval params should be handled by handleByValParam"); - - // Do nothing if the argument already points to the global address space. - if (Arg->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL) +void NVPTXLowerKernelArgs::markPointerAsGlobal(Value *Ptr) { + if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL) return; - Instruction *FirstInst = Arg->getParent()->getEntryBlock().begin(); - Instruction *ArgInGlobal = new AddrSpaceCastInst( - Arg, PointerType::get(Arg->getType()->getPointerElementType(), + // Deciding where to emit the addrspacecast pair. + BasicBlock::iterator InsertPt; + if (Argument *Arg = dyn_cast<Argument>(Ptr)) { + // Insert at the functon entry if Ptr is an argument. + InsertPt = Arg->getParent()->getEntryBlock().begin(); + } else { + // Insert right after Ptr if Ptr is an instruction. 
+ InsertPt = ++cast<Instruction>(Ptr)->getIterator(); + assert(InsertPt != InsertPt->getParent()->end() && + "We don't call this function with Ptr being a terminator."); + } + + Instruction *PtrInGlobal = new AddrSpaceCastInst( + Ptr, PointerType::get(Ptr->getType()->getPointerElementType(), ADDRESS_SPACE_GLOBAL), - Arg->getName(), FirstInst); - Value *ArgInGeneric = new AddrSpaceCastInst(ArgInGlobal, Arg->getType(), - Arg->getName(), FirstInst); - // Replace with ArgInGeneric all uses of Args except ArgInGlobal. - Arg->replaceAllUsesWith(ArgInGeneric); - ArgInGlobal->setOperand(0, Arg); + Ptr->getName(), &*InsertPt); + Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(), + Ptr->getName(), &*InsertPt); + // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal. + Ptr->replaceAllUsesWith(PtrInGeneric); + PtrInGlobal->setOperand(0, Ptr); } - // ============================================================================= // Main function for this pass. // ============================================================================= @@ -157,12 +197,32 @@ bool NVPTXLowerKernelArgs::runOnFunction(Function &F) { if (!isKernelFunction(F)) return false; + if (TM && TM->getDrvInterface() == NVPTX::CUDA) { + // Mark pointers in byval structs as global. + for (auto &B : F) { + for (auto &I : B) { + if (LoadInst *LI = dyn_cast<LoadInst>(&I)) { + if (LI->getType()->isPointerTy()) { + Value *UO = GetUnderlyingObject(LI->getPointerOperand(), + F.getParent()->getDataLayout()); + if (Argument *Arg = dyn_cast<Argument>(UO)) { + if (Arg->hasByValAttr()) { + // LI is a load from a pointer within a byval kernel parameter. + markPointerAsGlobal(LI); + } + } + } + } + } + } + } + for (Argument &Arg : F.args()) { if (Arg.getType()->isPointerTy()) { if (Arg.hasByValAttr()) handleByValParam(&Arg); else if (TM && TM->getDrvInterface() == NVPTX::CUDA) - handlePointerParam(&Arg); + markPointerAsGlobal(&Arg); } } return true; diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h index 46b4b33..81a606d 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXMCExpr.h @@ -68,7 +68,7 @@ public: return false; } void visitUsedExpr(MCStreamer &Streamer) const override {}; - MCSection *findAssociatedSection() const override { return nullptr; } + MCFragment *findAssociatedFragment() const override { return nullptr; } // There are no TLS NVPTXMCExprs at the moment. void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {} @@ -110,7 +110,7 @@ public: return false; } void visitUsedExpr(MCStreamer &Streamer) const override {}; - MCSection *findAssociatedSection() const override { return nullptr; } + MCFragment *findAssociatedFragment() const override { return nullptr; } // There are no TLS NVPTXMCExprs at the moment. 
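[Editorial sketch] The runOnFunction hunk above (taken for the CUDA driver interface) walks every load in a kernel, asks GetUnderlyingObject whether the loaded pointer ultimately comes from a byval kernel parameter, and if so routes the loaded value through markPointerAsGlobal so later passes can fold in ld.global/st.global. A stripped-down model of that filter, with hypothetical stand-in types rather than LLVM ones:

// Hypothetical stand-ins for llvm::Argument and llvm::LoadInst.
struct Arg { bool IsByVal; };

struct Load {
  bool LoadsAPointer;          // the loaded value has pointer type
  const Arg *UnderlyingObject; // what GetUnderlyingObject resolves the address to
};

// True when a pointer is read out of a byval kernel parameter under CUDA, i.e. a
// pointer that is known to refer to the global address space.
bool shouldMarkAsGlobal(const Load &LI, bool IsCUDA, bool IsKernel) {
  return IsCUDA && IsKernel && LI.LoadsAPointer && LI.UnderlyingObject &&
         LI.UnderlyingObject->IsByVal;
}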
void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {} diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 5fd69a6..17019d7 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -72,7 +72,7 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) { for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { // If last instruction is a return instruction, add an epilogue - if (!I->empty() && I->back().isReturn()) + if (I->isReturnBlock()) TFI.emitEpilogue(MF, *I); } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h index 0d2627d..45a7309 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXSection.h @@ -19,15 +19,14 @@ #include <vector> namespace llvm { -/// NVPTXSection - Represents a section in PTX -/// PTX does not have sections. We create this class in order to use -/// the ASMPrint interface. +/// Represents a section in PTX PTX does not have sections. We create this class +/// in order to use the ASMPrint interface. /// -class NVPTXSection : public MCSection { +class NVPTXSection final : public MCSection { virtual void anchor(); public: NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K, nullptr) {} - virtual ~NVPTXSection() {} + ~NVPTXSection() {} /// Override this as NVPTX has its own way of printing switching /// to a section. diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 248f9e1..aa931b1 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -53,6 +53,7 @@ void initializeGenericToNVVMPass(PassRegistry&); void initializeNVPTXAllocaHoistingPass(PassRegistry &); void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&); void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &); +void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); void initializeNVPTXLowerKernelArgsPass(PassRegistry &); void initializeNVPTXLowerAllocaPass(PassRegistry &); } @@ -64,14 +65,15 @@ extern "C" void LLVMInitializeNVPTXTarget() { // FIXME: This pass is really intended to be invoked during IR optimization, // but it's very NVPTX-specific. 
- initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); - initializeGenericToNVVMPass(*PassRegistry::getPassRegistry()); - initializeNVPTXAllocaHoistingPass(*PassRegistry::getPassRegistry()); - initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry()); - initializeNVPTXFavorNonGenericAddrSpacesPass( - *PassRegistry::getPassRegistry()); - initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry()); - initializeNVPTXLowerAllocaPass(*PassRegistry::getPassRegistry()); + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeNVVMReflectPass(PR); + initializeGenericToNVVMPass(PR); + initializeNVPTXAllocaHoistingPass(PR); + initializeNVPTXAssignValidGlobalNamesPass(PR); + initializeNVPTXFavorNonGenericAddrSpacesPass(PR); + initializeNVPTXLowerKernelArgsPass(PR); + initializeNVPTXLowerAllocaPass(PR); + initializeNVPTXLowerAggrCopiesPass(PR); } static std::string computeDataLayout(bool is64Bit) { @@ -139,6 +141,10 @@ public: FunctionPass *createTargetRegisterAllocator(bool) override; void addFastRegAlloc(FunctionPass *RegAllocPass) override; void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; + +private: + // if the opt level is aggressive, add GVN; otherwise, add EarlyCSE. + void addEarlyCSEOrGVNPass(); }; } // end anonymous namespace @@ -148,11 +154,18 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { } TargetIRAnalysis NVPTXTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(NVPTXTTIImpl(this, F)); }); } +void NVPTXPassConfig::addEarlyCSEOrGVNPass() { + if (getOptLevel() == CodeGenOpt::Aggressive) + addPass(createGVNPass()); + else + addPass(createEarlyCSEPass()); +} + void NVPTXPassConfig::addIRPasses() { // The following passes are known to not play well with virtual regs hanging // around after register allocation (which in our case, is *all* registers). @@ -161,13 +174,14 @@ void NVPTXPassConfig::addIRPasses() { // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp). disablePass(&PrologEpilogCodeInserterID); disablePass(&MachineCopyPropagationID); - disablePass(&BranchFolderPassID); disablePass(&TailDuplicateID); + addPass(createNVVMReflectPass()); addPass(createNVPTXImageOptimizerPass()); - TargetPassConfig::addIRPasses(); addPass(createNVPTXAssignValidGlobalNamesPass()); addPass(createGenericToNVVMPass()); + + // === Propagate special address spaces === addPass(createNVPTXLowerKernelArgsPass(&getNVPTXTargetMachine())); // NVPTXLowerKernelArgs emits alloca for byval parameters which can often // be eliminated by SROA. @@ -178,22 +192,38 @@ void NVPTXPassConfig::addIRPasses() { // them unused. We could remove dead code in an ad-hoc manner, but that // requires manual work and might be error-prone. addPass(createDeadCodeEliminationPass()); + + // === Straight-line scalar optimizations === addPass(createSeparateConstOffsetFromGEPPass()); + addPass(createSpeculativeExecutionPass()); // ReassociateGEPs exposes more opportunites for SLSR. See // the example in reassociate-geps-and-slsr.ll. addPass(createStraightLineStrengthReducePass()); // SeparateConstOffsetFromGEP and SLSR creates common expressions which GVN or // EarlyCSE can reuse. GVN generates significantly better code than EarlyCSE // for some of our benchmarks. 
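[Editorial sketch] addEarlyCSEOrGVNPass, defined above and used in the reorganized addIRPasses, picks the stronger but more expensive GVN at the aggressive optimization level and EarlyCSE otherwise, and the pipeline now runs this cleanup both after the straight-line scalar optimizations and again after the generic IR passes (including LSR), as the hunk below shows. The toy pipeline builder here only mirrors that decision; pass names are plain strings, not real pass objects.

#include <iostream>
#include <string>
#include <vector>

enum class OptLevel { None, Less, Default, Aggressive };

struct PassConfig {
  OptLevel Level;
  std::vector<std::string> Pipeline;

  void addEarlyCSEOrGVN() {
    // GVN is stronger but costlier; only pay for it at the aggressive level.
    Pipeline.push_back(Level == OptLevel::Aggressive ? "GVN" : "EarlyCSE");
  }
};

int main() {
  PassConfig PC{OptLevel::Aggressive, {"SeparateConstOffsetFromGEP",
                                       "SpeculativeExecution",
                                       "StraightLineStrengthReduce"}};
  PC.addEarlyCSEOrGVN();                             // after straight-line scalar opts
  PC.Pipeline.push_back("NaryReassociate");
  PC.Pipeline.push_back("EarlyCSE");
  PC.Pipeline.push_back("(generic IR passes, incl. LSR)");
  PC.addEarlyCSEOrGVN();                             // and again after LSR
  for (const std::string &P : PC.Pipeline)
    std::cout << P << '\n';
}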
- if (getOptLevel() == CodeGenOpt::Aggressive) - addPass(createGVNPass()); - else - addPass(createEarlyCSEPass()); + addEarlyCSEOrGVNPass(); // Run NaryReassociate after EarlyCSE/GVN to be more effective. addPass(createNaryReassociatePass()); // NaryReassociate on GEPs creates redundant common expressions, so run // EarlyCSE after it. addPass(createEarlyCSEPass()); + + // === LSR and other generic IR passes === + TargetPassConfig::addIRPasses(); + // EarlyCSE is not always strong enough to clean up what LSR produces. For + // example, GVN can combine + // + // %0 = add %a, %b + // %1 = add %b, %a + // + // and + // + // %0 = shl nsw %a, 2 + // %1 = shl %a, 2 + // + // but EarlyCSE can do neither of them. + addEarlyCSEOrGVNPass(); } bool NVPTXPassConfig::addInstSelector() { diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h index 5ecdc87..0f88ddf 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h @@ -48,8 +48,7 @@ public: void Initialize(MCContext &ctx, const TargetMachine &TM) override { TargetLoweringObjectFile::Initialize(ctx, TM); TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText()); - DataSection = - new NVPTXSection(MCSection::SV_ELF, SectionKind::getDataRel()); + DataSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getData()); BSSSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getBSS()); ReadOnlySection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getReadOnly()); @@ -84,7 +83,7 @@ public: new NVPTXSection(MCSection::SV_ELF, SectionKind::getMetadata()); } - MCSection *getSectionForConstant(SectionKind Kind, + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, const Constant *C) const override { return ReadOnlySection; } diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index e7250cd..6e679dd 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -89,12 +89,12 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) { return false; } -unsigned NVPTXTTIImpl::getArithmeticInstrCost( +int NVPTXTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. 
- std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index 5bcd1e2..0946a32 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -52,7 +52,7 @@ public: bool isSourceOfDivergence(const Value *V); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 1f178af..578b466 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -335,106 +335,7 @@ bool llvm::getAlign(const CallInst &I, unsigned index, unsigned &align) { return false; } -bool llvm::isBarrierIntrinsic(Intrinsic::ID id) { - if ((id == Intrinsic::nvvm_barrier0) || - (id == Intrinsic::nvvm_barrier0_popc) || - (id == Intrinsic::nvvm_barrier0_and) || - (id == Intrinsic::nvvm_barrier0_or) || - (id == Intrinsic::cuda_syncthreads)) - return true; - return false; -} - -// Interface for checking all memory space transfer related intrinsics -bool llvm::isMemorySpaceTransferIntrinsic(Intrinsic::ID id) { - if (id == Intrinsic::nvvm_ptr_local_to_gen || - id == Intrinsic::nvvm_ptr_shared_to_gen || - id == Intrinsic::nvvm_ptr_global_to_gen || - id == Intrinsic::nvvm_ptr_constant_to_gen || - id == Intrinsic::nvvm_ptr_gen_to_global || - id == Intrinsic::nvvm_ptr_gen_to_shared || - id == Intrinsic::nvvm_ptr_gen_to_local || - id == Intrinsic::nvvm_ptr_gen_to_constant || - id == Intrinsic::nvvm_ptr_gen_to_param) { - return true; - } - - return false; -} - -// consider several special intrinsics in striping pointer casts, and -// provide an option to ignore GEP indicies for find out the base address only -// which could be used in simple alias disambigurate. -const Value * -llvm::skipPointerTransfer(const Value *V, bool ignore_GEP_indices) { - V = V->stripPointerCasts(); - while (true) { - if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) { - if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) { - V = IS->getArgOperand(0)->stripPointerCasts(); - continue; - } - } else if (ignore_GEP_indices) - if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { - V = GEP->getPointerOperand()->stripPointerCasts(); - continue; - } - break; - } - return V; -} - -// consider several special intrinsics in striping pointer casts, and -// - ignore GEP indicies for find out the base address only, and -// - tracking PHINode -// which could be used in simple alias disambigurate. 
-const Value * -llvm::skipPointerTransfer(const Value *V, std::set<const Value *> &processed) { - if (processed.find(V) != processed.end()) - return nullptr; - processed.insert(V); - - const Value *V2 = V->stripPointerCasts(); - if (V2 != V && processed.find(V2) != processed.end()) - return nullptr; - processed.insert(V2); - - V = V2; - - while (true) { - if (const IntrinsicInst *IS = dyn_cast<IntrinsicInst>(V)) { - if (isMemorySpaceTransferIntrinsic(IS->getIntrinsicID())) { - V = IS->getArgOperand(0)->stripPointerCasts(); - continue; - } - } else if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { - V = GEP->getPointerOperand()->stripPointerCasts(); - continue; - } else if (const PHINode *PN = dyn_cast<PHINode>(V)) { - if (V != V2 && processed.find(V) != processed.end()) - return nullptr; - processed.insert(PN); - const Value *common = nullptr; - for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { - const Value *pv = PN->getIncomingValue(i); - const Value *base = skipPointerTransfer(pv, processed); - if (base) { - if (!common) - common = base; - else if (common != base) - return PN; - } - } - if (!common) - return PN; - V = common; - } - break; - } - return V; -} - -// The following are some useful utilities for debuggung +// The following are some useful utilities for debugging BasicBlock *llvm::getParentBlock(Value *v) { if (BasicBlock *B = dyn_cast<BasicBlock>(v)) @@ -466,7 +367,7 @@ void llvm::dumpBlock(Value *v, char *blockName) { return; for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) { - BasicBlock *B = it; + BasicBlock *B = &*it; if (strcmp(B->getName().data(), blockName) == 0) { B->dump(); return; @@ -490,7 +391,7 @@ Instruction *llvm::getInst(Value *base, char *instName) { return nullptr; } -// Dump an instruction by nane +// Dump an instruction by name void llvm::dumpInst(Value *base, char *instName) { Instruction *I = getInst(base, instName); if (I) diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 7e2ce73..a5262cb 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -61,27 +61,6 @@ bool isKernelFunction(const llvm::Function &); bool getAlign(const llvm::Function &, unsigned index, unsigned &); bool getAlign(const llvm::CallInst &, unsigned index, unsigned &); -bool isBarrierIntrinsic(llvm::Intrinsic::ID); - -/// make_vector - Helper function which is useful for building temporary vectors -/// to pass into type construction of CallInst ctors. This turns a null -/// terminated list of pointers (or other value types) into a real live vector. -/// -template <typename T> inline std::vector<T> make_vector(T A, ...) 
{ - va_list Args; - va_start(Args, A); - std::vector<T> Result; - Result.push_back(A); - while (T Val = va_arg(Args, T)) - Result.push_back(Val); - va_end(Args); - return Result; -} - -bool isMemorySpaceTransferIntrinsic(Intrinsic::ID id); -const Value *skipPointerTransfer(const Value *V, bool ignore_GEP_indices); -const Value * -skipPointerTransfer(const Value *V, std::set<const Value *> &processed); BasicBlock *getParentBlock(Value *v); Function *getParentFunction(Value *v); void dumpBlock(Value *v, char *blockName); diff --git a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td index a237247..e69bbba 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td +++ b/contrib/llvm/lib/Target/NVPTX/NVPTXVector.td @@ -26,7 +26,7 @@ let isAsCheapAsAMove=1, VecInstType=isVecExtract.Value in { def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), (ins V2I16Regs:$src, i8imm:$c), "mov.u16 \t$dst, $src${c:vecelem};", - [(set Int16Regs:$dst, (vector_extract + [(set Int16Regs:$dst, (extractelt (v2i16 V2I16Regs:$src), imm:$c))], IMOV16rr>; @@ -34,7 +34,7 @@ def V2i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), (ins V4I16Regs:$src, i8imm:$c), "mov.u16 \t$dst, $src${c:vecelem};", - [(set Int16Regs:$dst, (vector_extract + [(set Int16Regs:$dst, (extractelt (v4i16 V4I16Regs:$src), imm:$c))], IMOV16rr>; @@ -42,7 +42,7 @@ def V4i16Extract : NVPTXVecInst<(outs Int16Regs:$dst), def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), (ins V2I8Regs:$src, i8imm:$c), "mov.u16 \t$dst, $src${c:vecelem};", - [(set Int8Regs:$dst, (vector_extract + [(set Int8Regs:$dst, (extractelt (v2i8 V2I8Regs:$src), imm:$c))], IMOV8rr>; @@ -50,7 +50,7 @@ def V2i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), (ins V4I8Regs:$src, i8imm:$c), "mov.u16 \t$dst, $src${c:vecelem};", - [(set Int8Regs:$dst, (vector_extract + [(set Int8Regs:$dst, (extractelt (v4i8 V4I8Regs:$src), imm:$c))], IMOV8rr>; @@ -58,7 +58,7 @@ def V4i8Extract : NVPTXVecInst<(outs Int8Regs:$dst), def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), (ins V2I32Regs:$src, i8imm:$c), "mov.u32 \t$dst, $src${c:vecelem};", - [(set Int32Regs:$dst, (vector_extract + [(set Int32Regs:$dst, (extractelt (v2i32 V2I32Regs:$src), imm:$c))], IMOV32rr>; @@ -66,7 +66,7 @@ def V2i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst), (ins V2F32Regs:$src, i8imm:$c), "mov.f32 \t$dst, $src${c:vecelem};", - [(set Float32Regs:$dst, (vector_extract + [(set Float32Regs:$dst, (extractelt (v2f32 V2F32Regs:$src), imm:$c))], FMOV32rr>; @@ -74,7 +74,7 @@ def V2f32Extract : NVPTXVecInst<(outs Float32Regs:$dst), def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst), (ins V2I64Regs:$src, i8imm:$c), "mov.u64 \t$dst, $src${c:vecelem};", - [(set Int64Regs:$dst, (vector_extract + [(set Int64Regs:$dst, (extractelt (v2i64 V2I64Regs:$src), imm:$c))], IMOV64rr>; @@ -82,7 +82,7 @@ def V2i64Extract : NVPTXVecInst<(outs Int64Regs:$dst), def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst), (ins V2F64Regs:$src, i8imm:$c), "mov.f64 \t$dst, $src${c:vecelem};", - [(set Float64Regs:$dst, (vector_extract + [(set Float64Regs:$dst, (extractelt (v2f64 V2F64Regs:$src), imm:$c))], FMOV64rr>; @@ -90,7 +90,7 @@ def V2f64Extract : NVPTXVecInst<(outs Float64Regs:$dst), def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), (ins V4I32Regs:$src, i8imm:$c), "mov.u32 \t$dst, $src${c:vecelem};", - [(set Int32Regs:$dst, (vector_extract + 
[(set Int32Regs:$dst, (extractelt (v4i32 V4I32Regs:$src), imm:$c))], IMOV32rr>; @@ -98,7 +98,7 @@ def V4i32Extract : NVPTXVecInst<(outs Int32Regs:$dst), def V4f32Extract : NVPTXVecInst<(outs Float32Regs:$dst), (ins V4F32Regs:$src, i8imm:$c), "mov.f32 \t$dst, $src${c:vecelem};", - [(set Float32Regs:$dst, (vector_extract + [(set Float32Regs:$dst, (extractelt (v4f32 V4F32Regs:$src), imm:$c))], FMOV32rr>; } @@ -110,8 +110,7 @@ def V2i8Insert : NVPTXVecInst<(outs V2I8Regs:$dst), "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u16 \t$dst${c:vecelem}, $val;", [(set V2I8Regs:$dst, - (vector_insert V2I8Regs:$src, Int8Regs:$val, imm:$c))], - IMOV8rr>; + (insertelt V2I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>; // Insert v4i8 def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst), @@ -119,8 +118,7 @@ def V4i8Insert : NVPTXVecInst<(outs V4I8Regs:$dst), "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u16 \t$dst${c:vecelem}, $val;", [(set V4I8Regs:$dst, - (vector_insert V4I8Regs:$src, Int8Regs:$val, imm:$c))], - IMOV8rr>; + (insertelt V4I8Regs:$src, Int8Regs:$val, imm:$c))], IMOV8rr>; // Insert v2i16 def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst), @@ -128,8 +126,8 @@ def V2i16Insert : NVPTXVecInst<(outs V2I16Regs:$dst), "mov.v2.u16 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u16 \t$dst${c:vecelem}, $val;", [(set V2I16Regs:$dst, - (vector_insert V2I16Regs:$src, Int16Regs:$val, imm:$c))], - IMOV16rr>; + (insertelt V2I16Regs:$src, Int16Regs:$val, imm:$c))], + IMOV16rr>; // Insert v4i16 def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst), @@ -137,8 +135,8 @@ def V4i16Insert : NVPTXVecInst<(outs V4I16Regs:$dst), "mov.v4.u16 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u16 \t$dst${c:vecelem}, $val;", [(set V4I16Regs:$dst, - (vector_insert V4I16Regs:$src, Int16Regs:$val, imm:$c))], - IMOV16rr>; + (insertelt V4I16Regs:$src, Int16Regs:$val, imm:$c))], + IMOV16rr>; // Insert v2i32 def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst), @@ -146,8 +144,8 @@ def V2i32Insert : NVPTXVecInst<(outs V2I32Regs:$dst), "mov.v2.u32 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u32 \t$dst${c:vecelem}, $val;", [(set V2I32Regs:$dst, - (vector_insert V2I32Regs:$src, Int32Regs:$val, imm:$c))], - IMOV32rr>; + (insertelt V2I32Regs:$src, Int32Regs:$val, imm:$c))], + IMOV32rr>; // Insert v2f32 def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst), @@ -155,8 +153,8 @@ def V2f32Insert : NVPTXVecInst<(outs V2F32Regs:$dst), "mov.v2.f32 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.f32 \t$dst${c:vecelem}, $val;", [(set V2F32Regs:$dst, - (vector_insert V2F32Regs:$src, Float32Regs:$val, imm:$c))], - FMOV32rr>; + (insertelt V2F32Regs:$src, Float32Regs:$val, imm:$c))], + FMOV32rr>; // Insert v2i64 def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst), @@ -164,8 +162,8 @@ def V2i64Insert : NVPTXVecInst<(outs V2I64Regs:$dst), "mov.v2.u64 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u64 \t$dst${c:vecelem}, $val;", [(set V2I64Regs:$dst, - (vector_insert V2I64Regs:$src, Int64Regs:$val, imm:$c))], - IMOV64rr>; + (insertelt V2I64Regs:$src, Int64Regs:$val, imm:$c))], + IMOV64rr>; // Insert v2f64 def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst), @@ -173,8 +171,8 @@ def V2f64Insert : NVPTXVecInst<(outs V2F64Regs:$dst), "mov.v2.f64 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.f64 \t$dst${c:vecelem}, $val;", [(set V2F64Regs:$dst, - (vector_insert V2F64Regs:$src, Float64Regs:$val, imm:$c))], - FMOV64rr>; + (insertelt V2F64Regs:$src, Float64Regs:$val, imm:$c))], + FMOV64rr>; // Insert v4i32 def V4i32Insert : NVPTXVecInst<(outs 
V4I32Regs:$dst), @@ -182,8 +180,8 @@ def V4i32Insert : NVPTXVecInst<(outs V4I32Regs:$dst), "mov.v4.u32 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.u32 \t$dst${c:vecelem}, $val;", [(set V4I32Regs:$dst, - (vector_insert V4I32Regs:$src, Int32Regs:$val, imm:$c))], - IMOV32rr>; + (insertelt V4I32Regs:$src, Int32Regs:$val, imm:$c))], + IMOV32rr>; // Insert v4f32 def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst), @@ -191,8 +189,8 @@ def V4f32Insert : NVPTXVecInst<(outs V4F32Regs:$dst), "mov.v4.f32 \t${dst:vecfull}, ${src:vecfull};" "\n\tmov.f32 \t$dst${c:vecelem}, $val;", [(set V4F32Regs:$dst, - (vector_insert V4F32Regs:$src, Float32Regs:$val, imm:$c))], - FMOV32rr>; + (insertelt V4F32Regs:$src, Float32Regs:$val, imm:$c))], + FMOV32rr>; } class BinOpAsmString<string c> { diff --git a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp index 5e375b7..20ab5db 100644 --- a/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp +++ b/contrib/llvm/lib/Target/NVPTX/NVVMReflect.cpp @@ -109,10 +109,10 @@ void NVVMReflect::setVarMap() { for (unsigned i = 0, e = ReflectList.size(); i != e; ++i) { DEBUG(dbgs() << "Option : " << ReflectList[i] << "\n"); SmallVector<StringRef, 4> NameValList; - StringRef(ReflectList[i]).split(NameValList, ","); + StringRef(ReflectList[i]).split(NameValList, ','); for (unsigned j = 0, ej = NameValList.size(); j != ej; ++j) { SmallVector<StringRef, 2> NameValPair; - NameValList[j].split(NameValPair, "="); + NameValList[j].split(NameValPair, '='); assert(NameValPair.size() == 2 && "name=val expected"); std::stringstream ValStream(NameValPair[1]); int Val; diff --git a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index a699a55..220c70a 100644 --- a/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/contrib/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -243,7 +243,6 @@ namespace { struct PPCOperand; class PPCAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; bool IsPPC64; bool IsDarwin; @@ -291,9 +290,9 @@ class PPCAsmParser : public MCTargetAsmParser { public: - PPCAsmParser(MCSubtargetInfo &STI, MCAsmParser &, const MCInstrInfo &MII, - const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(STI), MII(MII) { + PPCAsmParser(const MCSubtargetInfo &STI, MCAsmParser &, + const MCInstrInfo &MII, const MCTargetOptions &Options) + : MCTargetAsmParser(Options, STI), MII(MII) { // Check for 64-bit vs. 32-bit pointer mode. Triple TheTriple(STI.getTargetTriple()); IsPPC64 = (TheTriple.getArch() == Triple::ppc64 || @@ -1185,7 +1184,7 @@ void PPCAsmParser::ProcessInstruction(MCInst &Inst, break; } case PPC::MFTB: { - if (STI.getFeatureBits()[PPC::FeatureMFTB]) { + if (getSTI().getFeatureBits()[PPC::FeatureMFTB]) { assert(Inst.getNumOperands() == 2 && "Expecting two operands"); Inst.setOpcode(PPC::MFSPR); } @@ -1205,7 +1204,7 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, // Post-process instructions (typically extended mnemonics) ProcessInstruction(Inst, Operands); Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: return Error(IDLoc, "instruction use requires an option to be enabled"); @@ -1690,7 +1689,7 @@ bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // where th can be omitted when it is 0. dcbtst is the same. 
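Referring back to the NVVMReflect::setVarMap hunk above, which switches StringRef::split to the single-character overloads: a minimal, self-contained sketch of that style of "name=val,name=val" parsing follows. The helper name and return type are illustrative assumptions, not the NVVMReflect code.

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include <utility>
#include <vector>

// Hypothetical helper: split "foo=1,bar=0" on ',' and each piece on '='.
static std::vector<std::pair<llvm::StringRef, llvm::StringRef>>
parseNameValList(llvm::StringRef Opts) {
  std::vector<std::pair<llvm::StringRef, llvm::StringRef>> Result;
  llvm::SmallVector<llvm::StringRef, 4> Pieces;
  Opts.split(Pieces, ',');                  // {"foo=1", "bar=0"}
  for (llvm::StringRef P : Pieces)
    Result.push_back(P.split('='));         // ("foo","1"), ("bar","0")
  return Result;
}

The character-separator overloads take the same arguments as the StringRef ones; the switch presumably just avoids building a one-character StringRef separator and doing a substring search per split.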
We take the // server form to be the default, so swap the operands if we're parsing for // an embedded core (they'll be swapped again upon printing). - if (STI.getFeatureBits()[PPC::FeatureBookE] && + if (getSTI().getFeatureBits()[PPC::FeatureBookE] && Operands.size() == 4 && (Name == "dcbt" || Name == "dcbtst")) { std::swap(Operands[1], Operands[3]); @@ -1730,10 +1729,19 @@ bool PPCAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; + SMLoc ExprLoc = getLexer().getLoc(); if (getParser().parseExpression(Value)) return false; - getParser().getStreamer().EmitValue(Value, Size); + if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) { + assert(Size <= 8 && "Invalid size"); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue)) + return Error(ExprLoc, "literal value out of range for directive"); + getStreamer().EmitIntValue(IntValue, Size); + } else { + getStreamer().EmitValue(Value, Size, ExprLoc); + } if (getLexer().is(AsmToken::EndOfStatement)) break; diff --git a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 93a503c..1fc84fb 100644 --- a/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/contrib/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -401,8 +401,6 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size, decodeInstruction(DecoderTableQPX32, MI, Inst, Address, this, STI); if (result != MCDisassembler::Fail) return result; - - MI.clear(); } return decodeInstruction(DecoderTable32, MI, Inst, Address, this, STI); diff --git a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h index 8e18783..53eb727 100644 --- a/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h +++ b/contrib/llvm/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h @@ -18,8 +18,6 @@ namespace llvm { -class MCOperand; - class PPCInstPrinter : public MCInstPrinter { bool IsDarwin; public: diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 992be5b..dd99495 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -113,6 +113,10 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, break; } break; + case PPC::fixup_ppc_half16ds: + Target.print(errs()); + errs() << '\n'; + report_fatal_error("Invalid PC-relative half16ds relocation"); case FK_Data_4: case FK_PCRel_4: Type = ELF::R_PPC_REL32; @@ -305,13 +309,13 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, break; case MCSymbolRefExpr::VK_GOT: Type = ELF::R_PPC64_GOT16_DS; - break; + break; case MCSymbolRefExpr::VK_PPC_GOT_LO: Type = ELF::R_PPC64_GOT16_LO_DS; break; case MCSymbolRefExpr::VK_PPC_TOC: Type = ELF::R_PPC64_TOC16_DS; - break; + break; case MCSymbolRefExpr::VK_PPC_TOC_LO: Type = ELF::R_PPC64_TOC16_LO_DS; break; @@ -372,16 +376,16 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, break; case MCSymbolRefExpr::VK_None: Type = ELF::R_PPC64_ADDR64; - break; + break; case MCSymbolRefExpr::VK_PPC_DTPMOD: Type = ELF::R_PPC64_DTPMOD64; - break; + break; case MCSymbolRefExpr::VK_PPC_TPREL: Type = ELF::R_PPC64_TPREL64; - break; + break; case 
MCSymbolRefExpr::VK_PPC_DTPREL: Type = ELF::R_PPC64_DTPREL64; - break; + break; } break; case FK_Data_4: diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h index 86ad385..e252ac9 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h @@ -20,18 +20,19 @@ namespace llvm { class Triple; - class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin { - virtual void anchor(); - - public: - explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple&); - }; - - class PPCELFMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit PPCELFMCAsmInfo(bool is64Bit, const Triple&); - }; +class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple &); +}; + +class PPCELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit PPCELFMCAsmInfo(bool is64Bit, const Triple &); +}; } // namespace llvm diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h index a641780..d42a111 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h @@ -82,8 +82,8 @@ public: const MCAsmLayout *Layout, const MCFixup *Fixup) const override; void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override { - return getSubExpr()->findAssociatedSection(); + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); } // There are no TLS PPCMCExprs at the moment. diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp index 9d72896..b54a0e1 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp @@ -241,12 +241,12 @@ bool PPCMachObjectWriter::recordScatteredRelocation( if (FixupOffset > 0xffffff) { char Buffer[32]; format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), Twine("Section too large, can't encode " "r_address (") + Buffer + ") into 24 bits of scattered " "relocation entry."); - llvm_unreachable("fatal error returned?!"); + return false; } // Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()? diff --git a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h index 6075631..acea600 100644 --- a/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h +++ b/contrib/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h @@ -56,6 +56,14 @@ namespace PPC { PRED_BIT_UNSET = 1025 }; + // Bit for branch taken (plus) or not-taken (minus) hint + enum BranchHintBit { + BR_NO_HINT = 0x0, + BR_NONTAKEN_HINT = 0x2, + BR_TAKEN_HINT = 0x3, + BR_HINT_MASK = 0X3 + }; + /// Invert the specified predicate. != -> ==, < -> >=. 
Predicate InvertPredicate(Predicate Opcode); diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.h b/contrib/llvm/lib/Target/PowerPC/PPC.h index ae8d8b4..a259ed3 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPC.h +++ b/contrib/llvm/lib/Target/PowerPC/PPC.h @@ -41,13 +41,16 @@ namespace llvm { FunctionPass *createPPCVSXCopyPass(); FunctionPass *createPPCVSXFMAMutatePass(); FunctionPass *createPPCVSXSwapRemovalPass(); + FunctionPass *createPPCMIPeepholePass(); FunctionPass *createPPCBranchSelectionPass(); FunctionPass *createPPCISelDag(PPCTargetMachine &TM); FunctionPass *createPPCTLSDynamicCallPass(); + FunctionPass *createPPCBoolRetToIntPass(); void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, AsmPrinter &AP, bool isDarwin); void initializePPCVSXFMAMutatePass(PassRegistry&); + void initializePPCBoolRetToIntPass(PassRegistry&); extern char &PPCVSXFMAMutateID; namespace PPCII { diff --git a/contrib/llvm/lib/Target/PowerPC/PPC.td b/contrib/llvm/lib/Target/PowerPC/PPC.td index 641b237..b03be12 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPC.td +++ b/contrib/llvm/lib/Target/PowerPC/PPC.td @@ -50,6 +50,8 @@ def DirectivePwr8: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR8", "">; def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", "Enable 64-bit instructions">; +def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", + "Use software emulation for floating point">; def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true", "Enable 64-bit registers usage for ppc32 [beta]">; def FeatureCRBits : SubtargetFeature<"crbits", "UseCRBits", "true", @@ -137,6 +139,12 @@ def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true", "Enable Hardware Transactional Memory instructions">; def FeatureMFTB : SubtargetFeature<"", "FeatureMFTB", "true", "Implement mftb using the mfspr instruction">; +def FeatureFusion : SubtargetFeature<"fusion", "HasFusion", "true", + "Target supports add/load integer fusion.">; +def FeatureFloat128 : + SubtargetFeature<"float128", "HasFloat128", "true", + "Enable the __float128 data type for IEEE-754R Binary128.", + [FeatureVSX]>; def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true", "Treat vector data stream cache control instructions as deprecated">; @@ -168,7 +176,8 @@ def ProcessorFeatures { FeatureMFTB, DeprecatedDST]; list<SubtargetFeature> Power8SpecificFeatures = [DirectivePwr8, FeatureP8Altivec, FeatureP8Vector, FeatureP8Crypto, - FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic]; + FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic, + FeatureFusion]; list<SubtargetFeature> Power8FeatureList = !listconcat(Power7FeatureList, Power8SpecificFeatures); } @@ -309,7 +318,7 @@ def : ProcessorModel<"g5", G5Model, Feature64Bit /*, Feature64BitRegs */, FeatureMFTB, DeprecatedDST]>; def : ProcessorModel<"e500mc", PPCE500mcModel, - [DirectiveE500mc, FeatureMFOCRF, + [DirectiveE500mc, FeatureSTFIWX, FeatureICBT, FeatureBookE, FeatureISEL, FeatureMFTB]>; def : ProcessorModel<"e5500", PPCE5500Model, @@ -403,6 +412,7 @@ def PPCAsmParserVariant : AsmParserVariant { // InstAlias definitions use immediate literals. Set RegisterPrefix // so that those are not misinterpreted as registers. 
string RegisterPrefix = "%"; + string BreakCharacters = "."; } def PPC : Target { diff --git a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 8e118ec..9a63c14 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -65,19 +65,20 @@ using namespace llvm; #define DEBUG_TYPE "asmprinter" namespace { - class PPCAsmPrinter : public AsmPrinter { - protected: - MapVector<MCSymbol*, MCSymbol*> TOC; - const PPCSubtarget *Subtarget; - StackMaps SM; - public: - explicit PPCAsmPrinter(TargetMachine &TM, - std::unique_ptr<MCStreamer> Streamer) - : AsmPrinter(TM, std::move(Streamer)), SM(*this) {} - - const char *getPassName() const override { - return "PowerPC Assembly Printer"; - } +class PPCAsmPrinter : public AsmPrinter { +protected: + MapVector<MCSymbol *, MCSymbol *> TOC; + const PPCSubtarget *Subtarget; + StackMaps SM; + +public: + explicit PPCAsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), SM(*this) {} + + const char *getPassName() const override { + return "PowerPC Assembly Printer"; + } MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym); @@ -94,10 +95,8 @@ namespace { void EmitEndOfAsmFile(Module &M) override; - void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); - void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI); + void LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI); + void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI); void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK); bool runOnMachineFunction(MachineFunction &MF) override { Subtarget = &MF.getSubtarget<PPCSubtarget>(); @@ -157,15 +156,15 @@ static const char *stripRegisterPrefix(const char *RegName) { return RegName + 1; case 'c': if (RegName[1] == 'r') return RegName + 2; } - + return RegName; } void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O) { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const MachineOperand &MO = MI->getOperand(OpNo); - + switch (MO.getType()) { case MachineOperand::MO_Register: { const char *RegName = PPCInstPrinter::getRegisterName(MO.getReg()); @@ -184,8 +183,8 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, MO.getMBB()->getSymbol()->print(O, MAI); return; case MachineOperand::MO_ConstantPoolIndex: - O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() - << '_' << MO.getIndex(); + O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_' + << MO.getIndex(); return; case MachineOperand::MO_BlockAddress: GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI); @@ -200,19 +199,19 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo, !GV->isStrongDefinitionForLinker()) { if (!GV->hasHiddenVisibility()) { SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr"); - MachineModuleInfoImpl::StubValueTy &StubSym = - MMI->getObjFileInfo<MachineModuleInfoMachO>() - .getGVStubEntry(SymToPrint); + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry( + SymToPrint); if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); } else if (GV->isDeclaration() || GV->hasCommonLinkage() || GV->hasAvailableExternallyLinkage()) { SymToPrint = getSymbolWithGlobalValueBase(GV, 
"$non_lazy_ptr"); - - MachineModuleInfoImpl::StubValueTy &StubSym = - MMI->getObjFileInfo<MachineModuleInfoMachO>(). - getHiddenGVStubEntry(SymToPrint); + + MachineModuleInfoImpl::StubValueTy &StubSym = + MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry( + SymToPrint); if (!StubSym.getPointer()) StubSym = MachineModuleInfoImpl:: StubValueTy(getSymbol(GV), !GV->hasInternalLinkage()); @@ -295,16 +294,16 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, } case 'U': // Print 'u' for update form. case 'X': // Print 'x' for indexed form. - { - // FIXME: Currently for PowerPC memory operands are always loaded - // into a register, so we never get an update or indexed form. - // This is bad even for offset forms, since even if we know we - // have a value in -16(r1), we will generate a load into r<n> - // and then load from 0(r<n>). Until that issue is fixed, - // tolerate 'U' and 'X' but don't output anything. - assert(MI->getOperand(OpNo).isReg()); - return false; - } + { + // FIXME: Currently for PowerPC memory operands are always loaded + // into a register, so we never get an update or indexed form. + // This is bad even for offset forms, since even if we know we + // have a value in -16(r1), we will generate a load into r<n> + // and then load from 0(r<n>). Until that issue is fixed, + // tolerate 'U' and 'X' but don't output anything. + assert(MI->getOperand(OpNo).isReg()); + return false; + } } } @@ -315,7 +314,6 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, return false; } - /// lookUpOrCreateTOCEntry -- Given a symbol, look up whether a TOC entry /// exists for it. If not, create one. Then return a symbol that references /// the TOC entry. @@ -330,8 +328,7 @@ void PPCAsmPrinter::EmitEndOfAsmFile(Module &M) { SM.serializeToStackMapSection(); } -void PPCAsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { +void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) { unsigned NumNOPBytes = MI.getOperand(1).getImm(); SM.recordStackMap(MI); @@ -353,13 +350,12 @@ void PPCAsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, // Emit nops. 
for (unsigned i = 0; i < NumNOPBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::NOP)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP)); } // Lower a patchpoint of the form: // [<def>], <id>, <numBytes>, <target>, <numArgs> -void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, - const MachineInstr &MI) { +void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) { SM.recordPatchPoint(MI); PatchPointOpers Opers(&MI); @@ -375,60 +371,59 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); EncodedBytes = 0; // Materialize the jump address: - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LI8) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI8) .addReg(ScratchReg) .addImm((CallTarget >> 32) & 0xFFFF)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::RLDIC) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::RLDIC) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(32).addImm(16)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORIS8) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORIS8) .addReg(ScratchReg) .addReg(ScratchReg) .addImm((CallTarget >> 16) & 0xFFFF)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::ORI8) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ORI8) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(CallTarget & 0xFFFF)); // Save the current TOC pointer before the remote call. int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::STD) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD) .addReg(PPC::X2) .addImm(TOCSaveOffset) .addReg(PPC::X1)); ++EncodedBytes; - // If we're on ELFv1, then we need to load the actual function pointer // from the function descriptor. if (!Subtarget->isELFv2ABI()) { - // Load the new TOC pointer and the function address, but not r11 - // (needing this is rare, and loading it here would prevent passing it - // via a 'nest' parameter. - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + // Load the new TOC pointer and the function address, but not r11 + // (needing this is rare, and loading it here would prevent passing it + // via a 'nest' parameter. + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) .addReg(PPC::X2) .addImm(8) .addReg(ScratchReg)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) .addReg(ScratchReg) .addImm(0) .addReg(ScratchReg)); ++EncodedBytes; } - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::MTCTR8) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTCTR8) .addReg(ScratchReg)); ++EncodedBytes; - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BCTRL8)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BCTRL8)); ++EncodedBytes; // Restore the TOC pointer after the call. 
- EmitToStreamer(OutStreamer, MCInstBuilder(PPC::LD) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LD) .addReg(PPC::X2) .addImm(TOCSaveOffset) .addReg(PPC::X1)); @@ -439,7 +434,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, MCSymbol *MOSymbol = getSymbol(GValue); const MCExpr *SymVar = MCSymbolRefExpr::create(MOSymbol, OutContext); - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::BL8_NOP) + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL8_NOP) .addExpr(SymVar)); EncodedBytes += 2; } @@ -454,7 +449,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, assert((NumBytes - EncodedBytes) % 4 == 0 && "Invalid number of NOP bytes requested!"); for (unsigned i = EncodedBytes; i < NumBytes; i += 4) - EmitToStreamer(OutStreamer, MCInstBuilder(PPC::NOP)); + EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP)); } /// EmitTlsCall -- Given a GETtls[ld]ADDR[32] instruction, print a @@ -499,16 +494,16 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { bool isDarwin = TM.getTargetTriple().isOSDarwin(); const Module *M = MF->getFunction()->getParent(); PICLevel::Level PL = M->getPICLevel(); - + // Lower multi-instruction pseudo operations. switch (MI->getOpcode()) { default: break; case TargetOpcode::DBG_VALUE: llvm_unreachable("Should be handled target independently"); case TargetOpcode::STACKMAP: - return LowerSTACKMAP(*OutStreamer, SM, *MI); + return LowerSTACKMAP(SM, *MI); case TargetOpcode::PATCHPOINT: - return LowerPATCHPOINT(*OutStreamer, SM, *MI); + return LowerPATCHPOINT(SM, *MI); case PPC::MoveGOTtoLR: { // Transform %LR = MoveGOTtoLR @@ -533,17 +528,18 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { case PPC::MovePCtoLR: case PPC::MovePCtoLR8: { // Transform %LR = MovePCtoLR - // Into this, where the label is the PIC base: + // Into this, where the label is the PIC base: // bl L1$pb // L1$pb: MCSymbol *PICBase = MF->getPICBaseSymbol(); - + // Emit the 'bl'. - EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::BL) - // FIXME: We would like an efficient form for this, so we don't have to do - // a lot of extra uniquing. - .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); - + EmitToStreamer(*OutStreamer, + MCInstBuilder(PPC::BL) + // FIXME: We would like an efficient form for this, so we + // don't have to do a lot of extra uniquing. + .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); + // Emit the label. 
OutStreamer->EmitLabel(PICBase); return; @@ -654,7 +650,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); return; } - + case PPC::ADDIStocHA: { // Transform %Xd = ADDIStocHA %X2, <ga:@sym> LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, isDarwin); @@ -669,28 +665,22 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MO.isBlockAddress()) && "Invalid operand for ADDIStocHA!"); MCSymbol *MOSymbol = nullptr; - bool IsExternal = false; - bool IsNonLocalFunction = false; - bool IsCommon = false; - bool IsAvailExt = false; + bool GlobalToc = false; if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); MOSymbol = getSymbol(GV); - IsExternal = GV->isDeclaration(); - IsCommon = GV->hasCommonLinkage(); - IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && - !GV->isStrongDefinitionForLinker(); - IsAvailExt = GV->hasAvailableExternallyLinkage(); - } else if (MO.isCPI()) + unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); + GlobalToc = (GVFlags & PPCII::MO_NLP_FLAG); + } else if (MO.isCPI()) { MOSymbol = GetCPISymbol(MO.getIndex()); - else if (MO.isJTI()) + } else if (MO.isJTI()) { MOSymbol = GetJTISymbol(MO.getIndex()); - else if (MO.isBlockAddress()) + } else if (MO.isBlockAddress()) { MOSymbol = GetBlockAddressSymbol(MO.getBlockAddress()); + } - if (IsExternal || IsNonLocalFunction || IsCommon || IsAvailExt || - MO.isJTI() || MO.isBlockAddress() || + if (GlobalToc || MO.isJTI() || MO.isBlockAddress() || TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); @@ -727,13 +717,14 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); } else if (MO.isGlobal()) { - const GlobalValue *GValue = MO.getGlobal(); - MOSymbol = getSymbol(GValue); - if (GValue->getType()->getElementType()->isFunctionTy() || - GValue->isDeclaration() || GValue->hasCommonLinkage() || - GValue->hasAvailableExternallyLinkage() || - TM.getCodeModel() == CodeModel::Large) - MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + const GlobalValue *GV = MO.getGlobal(); + MOSymbol = getSymbol(GV); + DEBUG( + unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); + assert((GVFlags & PPCII::MO_NLP_FLAG) && + "LDtocL used on symbol that could be accessed directly is " + "invalid. 
Must match ADDIStocHA.")); + MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); } const MCExpr *Exp = @@ -754,21 +745,18 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { const MachineOperand &MO = MI->getOperand(2); assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL"); MCSymbol *MOSymbol = nullptr; - bool IsExternal = false; - bool IsNonLocalFunction = false; if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); + DEBUG( + unsigned char GVFlags = Subtarget->classifyGlobalReference(GV); + assert ( + !(GVFlags & PPCII::MO_NLP_FLAG) && + "Interposable definitions must use indirect access.")); MOSymbol = getSymbol(GV); - IsExternal = GV->isDeclaration(); - IsNonLocalFunction = GV->getType()->getElementType()->isFunctionTy() && - !GV->isStrongDefinitionForLinker(); - } else if (MO.isCPI()) + } else if (MO.isCPI()) { MOSymbol = GetCPISymbol(MO.getIndex()); - - if (IsNonLocalFunction || IsExternal || - TM.getCodeModel() == CodeModel::Large) - MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); + } const MCExpr *Exp = MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO, @@ -840,13 +828,12 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } case PPC::PPC32GOT: { - MCSymbol *GOTSymbol = OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_")); - const MCExpr *SymGotTlsL = - MCSymbolRefExpr::create(GOTSymbol, MCSymbolRefExpr::VK_PPC_LO, - OutContext); - const MCExpr *SymGotTlsHA = - MCSymbolRefExpr::create(GOTSymbol, MCSymbolRefExpr::VK_PPC_HA, - OutContext); + MCSymbol *GOTSymbol = + OutContext.getOrCreateSymbol(StringRef("_GLOBAL_OFFSET_TABLE_")); + const MCExpr *SymGotTlsL = MCSymbolRefExpr::create( + GOTSymbol, MCSymbolRefExpr::VK_PPC_LO, OutContext); + const MCExpr *SymGotTlsHA = MCSymbolRefExpr::create( + GOTSymbol, MCSymbolRefExpr::VK_PPC_HA, OutContext); EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LI) .addReg(MI->getOperand(0).getReg()) .addExpr(SymGotTlsL)); @@ -1079,14 +1066,14 @@ void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) { void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { // linux/ppc32 - Normal entry label. - if (!Subtarget->isPPC64() && - (TM.getRelocationModel() != Reloc::PIC_ || + if (!Subtarget->isPPC64() && + (TM.getRelocationModel() != Reloc::PIC_ || MF->getFunction()->getParent()->getPICLevel() == PICLevel::Small)) return AsmPrinter::EmitFunctionEntryLabel(); if (!Subtarget->isPPC64()) { const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>(); - if (PPCFI->usesPICBase()) { + if (PPCFI->usesPICBase()) { MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol(); MCSymbol *PICBase = MF->getPICBaseSymbol(); OutStreamer->EmitLabel(RelocSymbol); @@ -1130,11 +1117,10 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { OutStreamer->SwitchSection(Current.first, Current.second); } - bool PPCLinuxAsmPrinter::doFinalization(Module &M) { - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); - bool isPPC64 = TD->getPointerSizeInBits() == 64; + bool isPPC64 = DL.getPointerSizeInBits() == 64; PPCTargetStreamer &TS = static_cast<PPCTargetStreamer &>(*OutStreamer->getTargetStreamer()); @@ -1293,8 +1279,8 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) { // Prime text sections so they are adjacent. This reduces the likelihood a // large data or debug section causes a branch to exceed 16M limit. 
- const TargetLoweringObjectFileMachO &TLOFMacho = - static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); OutStreamer->SwitchSection(TLOFMacho.getTextCoalSection()); if (TM.getRelocationModel() == Reloc::PIC_) { OutStreamer->SwitchSection( @@ -1325,7 +1311,7 @@ static MCSymbol *GetAnonSym(MCSymbol *Sym, MCContext &Ctx) { void PPCDarwinAsmPrinter:: EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) { - bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64; + bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64; // Construct a local MCSubtargetInfo and shadow EmitToStreamer here. // This is because the MachineFunction won't exist (but have not yet been @@ -1338,8 +1324,8 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) { S.EmitInstruction(Inst, *STI); }; - const TargetLoweringObjectFileMachO &TLOFMacho = - static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); // .lazy_symbol_pointer MCSection *LSPSection = TLOFMacho.getLazySymbolPointerSection(); @@ -1353,12 +1339,12 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) { for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { OutStreamer->SwitchSection(StubSection); EmitAlignment(4); - + MCSymbol *Stub = Stubs[i].first; MCSymbol *RawSym = Stubs[i].second.getPointer(); MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext); MCSymbol *AnonSymbol = GetAnonSym(Stub, OutContext); - + OutStreamer->EmitLabel(Stub); OutStreamer->EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol); @@ -1463,20 +1449,19 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) { OutStreamer->EmitSymbolValue(DyldStubBindingHelper, 4); } } - + OutStreamer->AddBlankLine(); } - bool PPCDarwinAsmPrinter::doFinalization(Module &M) { - bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64; + bool isPPC64 = getDataLayout().getPointerSizeInBits() == 64; // Darwin/PPC always uses mach-o. - const TargetLoweringObjectFileMachO &TLOFMacho = - static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); + const TargetLoweringObjectFileMachO &TLOFMacho = + static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering()); MachineModuleInfoMachO &MMIMacho = - MMI->getObjFileInfo<MachineModuleInfoMachO>(); - + MMI->getObjFileInfo<MachineModuleInfoMachO>(); + MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetFnStubList(); if (!Stubs.empty()) EmitFunctionStubs(Stubs); @@ -1484,27 +1469,27 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { if (MAI->doesSupportExceptionHandling() && MMI) { // Add the (possibly multiple) personalities to the set of global values. // Only referenced functions get into the Personalities list. 
- const std::vector<const Function*> &Personalities = MMI->getPersonalities(); - for (std::vector<const Function*>::const_iterator I = Personalities.begin(), - E = Personalities.end(); I != E; ++I) { - if (*I) { - MCSymbol *NLPSym = getSymbolWithGlobalValueBase(*I, "$non_lazy_ptr"); + for (const Function *Personality : MMI->getPersonalities()) { + if (Personality) { + MCSymbol *NLPSym = + getSymbolWithGlobalValueBase(Personality, "$non_lazy_ptr"); MachineModuleInfoImpl::StubValueTy &StubSym = - MMIMacho.getGVStubEntry(NLPSym); - StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(*I), true); + MMIMacho.getGVStubEntry(NLPSym); + StubSym = + MachineModuleInfoImpl::StubValueTy(getSymbol(Personality), true); } } } // Output stubs for dynamically-linked functions. Stubs = MMIMacho.GetGVStubList(); - + // Output macho stubs for external and common global variables. if (!Stubs.empty()) { // Switch with ".non_lazy_symbol_pointer" directive. OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection()); EmitAlignment(isPPC64 ? 3 : 2); - + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { // L_foo$stub: OutStreamer->EmitLabel(Stubs[i].first); @@ -1535,7 +1520,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) { if (!Stubs.empty()) { OutStreamer->SwitchSection(getObjFileLowering().getDataSection()); EmitAlignment(isPPC64 ? 3 : 2); - + for (unsigned i = 0, e = Stubs.size(); i != e; ++i) { // L_foo$stub: OutStreamer->EmitLabel(Stubs[i].first); @@ -1573,7 +1558,7 @@ createPPCAsmPrinterPass(TargetMachine &tm, } // Force static initialization. -extern "C" void LLVMInitializePowerPCAsmPrinter() { +extern "C" void LLVMInitializePowerPCAsmPrinter() { TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass); TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass); TargetRegistry::RegisterAsmPrinter(ThePPC64LETarget, createPPCAsmPrinterPass); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp new file mode 100644 index 0000000..7920240 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp @@ -0,0 +1,253 @@ +//===- PPCBoolRetToInt.cpp - Convert bool literals to i32 if they are returned ==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements converting i1 values to i32 if they could be more +// profitably allocated as GPRs rather than CRs. This pass will become totally +// unnecessary if Register Bank Allocation and Global Instruction Selection ever +// go upstream. +// +// Presently, the pass converts i1 Constants, and Arguments to i32 if the +// transitive closure of their uses includes only PHINodes, CallInsts, and +// ReturnInsts. The rational is that arguments are generally passed and returned +// in GPRs rather than CRs, so casting them to i32 at the LLVM IR level will +// actually save casts at the Machine Instruction level. +// +// It might be useful to expand this pass to add bit-wise operations to the list +// of safe transitive closure types. Also, we miss some opportunities when LLVM +// represents logical AND and OR operations with control flow rather than data +// flow. For example by lowering the expression: return (A && B && C) +// +// as: return A ? true : B && C. 
+// +// There's code in SimplifyCFG that code be used to turn control flow in data +// flow using SelectInsts. Selects are slow on some architectures (P7/P8), so +// this probably isn't good in general, but for the special case of i1, the +// Selects could be further lowered to bit operations that are fast everywhere. +// +//===----------------------------------------------------------------------===// + +#include "PPC.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Pass.h" + +using namespace llvm; + +namespace { + +#define DEBUG_TYPE "bool-ret-to-int" + +STATISTIC(NumBoolRetPromotion, + "Number of times a bool feeding a RetInst was promoted to an int"); +STATISTIC(NumBoolCallPromotion, + "Number of times a bool feeding a CallInst was promoted to an int"); +STATISTIC(NumBoolToIntPromotion, + "Total number of times a bool was promoted to an int"); + +class PPCBoolRetToInt : public FunctionPass { + + static SmallPtrSet<Value *, 8> findAllDefs(Value *V) { + SmallPtrSet<Value *, 8> Defs; + SmallVector<Value *, 8> WorkList; + WorkList.push_back(V); + Defs.insert(V); + while (!WorkList.empty()) { + Value *Curr = WorkList.back(); + WorkList.pop_back(); + if (User *CurrUser = dyn_cast<User>(Curr)) + for (auto &Op : CurrUser->operands()) + if (Defs.insert(Op).second) + WorkList.push_back(Op); + } + return Defs; + } + + // Translate a i1 value to an equivalent i32 value: + static Value *translate(Value *V) { + Type *Int32Ty = Type::getInt32Ty(V->getContext()); + if (Constant *C = dyn_cast<Constant>(V)) + return ConstantExpr::getZExt(C, Int32Ty); + if (PHINode *P = dyn_cast<PHINode>(V)) { + // Temporarily set the operands to 0. We'll fix this later in + // runOnUse. + Value *Zero = Constant::getNullValue(Int32Ty); + PHINode *Q = + PHINode::Create(Int32Ty, P->getNumIncomingValues(), P->getName(), P); + for (unsigned i = 0; i < P->getNumOperands(); ++i) + Q->addIncoming(Zero, P->getIncomingBlock(i)); + return Q; + } + + Argument *A = dyn_cast<Argument>(V); + Instruction *I = dyn_cast<Instruction>(V); + assert((A || I) && "Unknown value type"); + + auto InstPt = + A ? &*A->getParent()->getEntryBlock().begin() : I->getNextNode(); + return new ZExtInst(V, Int32Ty, "", InstPt); + } + + typedef SmallPtrSet<const PHINode *, 8> PHINodeSet; + + // A PHINode is Promotable if: + // 1. Its type is i1 AND + // 2. All of its uses are ReturnInt, CallInst, PHINode, or DbgInfoIntrinsic + // AND + // 3. All of its operands are Constant or Argument or + // CallInst or PHINode AND + // 4. All of its PHINode uses are Promotable AND + // 5. 
All of its PHINode operands are Promotable + static PHINodeSet getPromotablePHINodes(const Function &F) { + PHINodeSet Promotable; + // Condition 1 + for (auto &BB : F) + for (auto &I : BB) + if (const PHINode *P = dyn_cast<PHINode>(&I)) + if (P->getType()->isIntegerTy(1)) + Promotable.insert(P); + + SmallVector<const PHINode *, 8> ToRemove; + for (const auto &P : Promotable) { + // Condition 2 and 3 + auto IsValidUser = [] (const Value *V) -> bool { + return isa<ReturnInst>(V) || isa<CallInst>(V) || isa<PHINode>(V) || + isa<DbgInfoIntrinsic>(V); + }; + auto IsValidOperand = [] (const Value *V) -> bool { + return isa<Constant>(V) || isa<Argument>(V) || isa<CallInst>(V) || + isa<PHINode>(V); + }; + const auto &Users = P->users(); + const auto &Operands = P->operands(); + if (!std::all_of(Users.begin(), Users.end(), IsValidUser) || + !std::all_of(Operands.begin(), Operands.end(), IsValidOperand)) + ToRemove.push_back(P); + } + + // Iterate to convergence + auto IsPromotable = [&Promotable] (const Value *V) -> bool { + const PHINode *Phi = dyn_cast<PHINode>(V); + return !Phi || Promotable.count(Phi); + }; + while (!ToRemove.empty()) { + for (auto &User : ToRemove) + Promotable.erase(User); + ToRemove.clear(); + + for (const auto &P : Promotable) { + // Condition 4 and 5 + const auto &Users = P->users(); + const auto &Operands = P->operands(); + if (!std::all_of(Users.begin(), Users.end(), IsPromotable) || + !std::all_of(Operands.begin(), Operands.end(), IsPromotable)) + ToRemove.push_back(P); + } + } + + return Promotable; + } + + typedef DenseMap<Value *, Value *> B2IMap; + + public: + static char ID; + PPCBoolRetToInt() : FunctionPass(ID) { + initializePPCBoolRetToIntPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) { + PHINodeSet PromotablePHINodes = getPromotablePHINodes(F); + B2IMap Bool2IntMap; + bool Changed = false; + for (auto &BB : F) { + for (auto &I : BB) { + if (ReturnInst *R = dyn_cast<ReturnInst>(&I)) + if (F.getReturnType()->isIntegerTy(1)) + Changed |= + runOnUse(R->getOperandUse(0), PromotablePHINodes, Bool2IntMap); + + if (CallInst *CI = dyn_cast<CallInst>(&I)) + for (auto &U : CI->operands()) + if (U->getType()->isIntegerTy(1)) + Changed |= runOnUse(U, PromotablePHINodes, Bool2IntMap); + } + } + + return Changed; + } + + static bool runOnUse(Use &U, const PHINodeSet &PromotablePHINodes, + B2IMap &BoolToIntMap) { + auto Defs = findAllDefs(U); + + // If the values are all Constants or Arguments, don't bother + if (!std::any_of(Defs.begin(), Defs.end(), isa<Instruction, Value *>)) + return false; + + // Presently, we only know how to handle PHINode, Constant, and Arguments. + // Potentially, bitwise operations (AND, OR, XOR, NOT) and sign extension + // could also be handled in the future. + for (const auto &V : Defs) + if (!isa<PHINode>(V) && !isa<Constant>(V) && !isa<Argument>(V)) + return false; + + for (const auto &V : Defs) + if (const PHINode *P = dyn_cast<PHINode>(V)) + if (!PromotablePHINodes.count(P)) + return false; + + if (isa<ReturnInst>(U.getUser())) + ++NumBoolRetPromotion; + if (isa<CallInst>(U.getUser())) + ++NumBoolCallPromotion; + ++NumBoolToIntPromotion; + + for (const auto &V : Defs) + if (!BoolToIntMap.count(V)) + BoolToIntMap[V] = translate(V); + + // Replace the operands of the translated instructions. There were set to + // zero in the translate function. 
+ for (auto &Pair : BoolToIntMap) { + User *First = dyn_cast<User>(Pair.first); + User *Second = dyn_cast<User>(Pair.second); + assert((!First || Second) && "translated from user to non-user!?"); + if (First) + for (unsigned i = 0; i < First->getNumOperands(); ++i) + Second->setOperand(i, BoolToIntMap[First->getOperand(i)]); + } + + Value *IntRetVal = BoolToIntMap[U]; + Type *Int1Ty = Type::getInt1Ty(U->getContext()); + Instruction *I = cast<Instruction>(U.getUser()); + Value *BackToBool = new TruncInst(IntRetVal, Int1Ty, "backToBool", I); + U.set(BackToBool); + + return true; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<DominatorTreeWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } +}; +} + +char PPCBoolRetToInt::ID = 0; +INITIALIZE_PASS(PPCBoolRetToInt, "bool-ret-to-int", + "Convert i1 constants to i32 if they are returned", + false, false) + +FunctionPass *llvm::createPPCBoolRetToIntPass() { return new PPCBoolRetToInt(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp index 940d55a..73a5305 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -91,7 +91,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { unsigned FuncSize = 0; for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E; ++MFI) { - MachineBasicBlock *MBB = MFI; + MachineBasicBlock *MBB = &*MFI; // The end of the previous block may have extra nops if this block has an // alignment requirement. diff --git a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp index fd150be..b6ac4d5 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -98,7 +98,7 @@ namespace { AU.addPreserved<LoopInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); } private: @@ -112,6 +112,7 @@ namespace { const DataLayout *DL; DominatorTree *DT; const TargetLibraryInfo *LibInfo; + bool PreserveLCSSA; }; char PPCCTRLoops::ID = 0; @@ -147,7 +148,7 @@ INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops", false, false) @@ -169,11 +170,12 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() { bool PPCCTRLoops::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); DL = &F.getParent()->getDataLayout(); auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); LibInfo = TLIP ? &TLIP->getTLI() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); bool MadeChange = false; @@ -250,8 +252,8 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr // we're definitely using CTR. 
case Intrinsic::ppc_is_decremented_ctr_nonzero: - case Intrinsic::ppc_mtctr: - return true; + case Intrinsic::ppc_mtctr: + return true; // VisualStudio defines setjmp as _setjmp #if defined(_MSC_VER) && defined(setjmp) && \ @@ -369,7 +371,7 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) { true); if (VTy == MVT::Other) return true; - + if (TLI->isOperationLegalOrCustom(Opcode, VTy)) continue; else if (VTy.isVector() && @@ -537,7 +539,7 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // the CTR register because some such uses might be reordered by the // selection DAG after the mtctr instruction). if (!Preheader || mightUseCTR(TT, Preheader)) - Preheader = InsertPreheaderForLoop(L, this); + Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); if (!Preheader) return MadeChange; @@ -554,10 +556,9 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { if (!ExitCount->getType()->isPointerTy() && ExitCount->getType() != CountType) ExitCount = SE->getZeroExtendExpr(ExitCount, CountType); - ExitCount = SE->getAddExpr(ExitCount, - SE->getConstant(CountType, 1)); - Value *ECValue = SCEVE.expandCodeFor(ExitCount, CountType, - Preheader->getTerminator()); + ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType)); + Value *ECValue = + SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator()); IRBuilder<> CountBuilder(Preheader->getTerminator()); Module *M = Preheader->getParent()->getParent(); @@ -677,7 +678,7 @@ bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { // any other instructions that might clobber the ctr register. for (MachineFunction::iterator I = MF.begin(), IE = MF.end(); I != IE; ++I) { - MachineBasicBlock *MBB = I; + MachineBasicBlock *MBB = &*I; if (!MDT->isReachableFromEntry(MBB)) continue; @@ -694,4 +695,3 @@ bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) { return false; } #endif // NDEBUG - diff --git a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp index fc89753..7cb1bb5 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp @@ -71,15 +71,20 @@ protected: for (MachineBasicBlock::pred_iterator PI = ReturnMBB.pred_begin(), PIE = ReturnMBB.pred_end(); PI != PIE; ++PI) { bool OtherReference = false, BlockChanged = false; + + if ((*PI)->empty()) + continue; + for (MachineBasicBlock::iterator J = (*PI)->getLastNonDebugInstr();;) { - MachineInstrBuilder MIB; + if (J == (*PI)->end()) + break; + if (J->getOpcode() == PPC::B) { if (J->getOperand(0).getMBB() == &ReturnMBB) { // This is an unconditional branch to the return. Replace the // branch with a blr. - MIB = - BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode())); - MIB.copyImplicitOps(I); + BuildMI(**PI, J, J->getDebugLoc(), TII->get(I->getOpcode())) + .copyImplicitOps(I); MachineBasicBlock::iterator K = J--; K->eraseFromParent(); BlockChanged = true; @@ -90,10 +95,10 @@ protected: if (J->getOperand(2).getMBB() == &ReturnMBB) { // This is a conditional branch to the return. Replace the branch // with a bclr. 
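Editor's note: in the CTR-loop hunk above, the value loaded into the count register is ExitCount + 1 (the constant is now spelled SE->getOne(CountType)). The exit count is, roughly, how many times the backedge is taken, so the loop body executes one more time than that. A trivial standalone check of that off-by-one relationship, with no SCEV involved:

#include <cstdio>

int main() {
  const int ExitCount = 7;              // backedge-taken count reported by the analysis
  const int TripCount = ExitCount + 1;  // value that would be materialized for mtctr
  int BodyRuns = 0;
  for (int i = 0; i <= ExitCount; ++i)
    ++BodyRuns;                         // body runs once more than the backedge is taken
  std::printf("body ran %d times, CTR gets %d\n", BodyRuns, TripCount); // both print 8
}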
- MIB = BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR)) - .addImm(J->getOperand(0).getImm()) - .addReg(J->getOperand(1).getReg()); - MIB.copyImplicitOps(I); + BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR)) + .addImm(J->getOperand(0).getImm()) + .addReg(J->getOperand(1).getReg()) + .copyImplicitOps(I); MachineBasicBlock::iterator K = J--; K->eraseFromParent(); BlockChanged = true; @@ -104,11 +109,11 @@ protected: if (J->getOperand(1).getMBB() == &ReturnMBB) { // This is a conditional branch to the return. Replace the branch // with a bclr. - MIB = BuildMI(**PI, J, J->getDebugLoc(), - TII->get(J->getOpcode() == PPC::BC ? - PPC::BCLR : PPC::BCLRn)) - .addReg(J->getOperand(0).getReg()); - MIB.copyImplicitOps(I); + BuildMI( + **PI, J, J->getDebugLoc(), + TII->get(J->getOpcode() == PPC::BC ? PPC::BCLR : PPC::BCLRn)) + .addReg(J->getOperand(0).getReg()) + .copyImplicitOps(I); MachineBasicBlock::iterator K = J--; K->eraseFromParent(); BlockChanged = true; @@ -146,7 +151,7 @@ protected: } for (unsigned i = 0, ie = PredToRemove.size(); i != ie; ++i) - PredToRemove[i]->removeSuccessor(&ReturnMBB); + PredToRemove[i]->removeSuccessor(&ReturnMBB, true); if (Changed && !ReturnMBB.hasAddressTaken()) { // We now might be able to merge this blr-only block into its @@ -156,7 +161,7 @@ protected: if (PrevMBB.isLayoutSuccessor(&ReturnMBB) && PrevMBB.canFallThrough()) { // Move the blr into the preceding block. PrevMBB.splice(PrevMBB.end(), &ReturnMBB, I); - PrevMBB.removeSuccessor(&ReturnMBB); + PrevMBB.removeSuccessor(&ReturnMBB, true); } } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 5f236f7..b451ebf 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -164,7 +164,8 @@ class PPCFastISel final : public FastISel { unsigned DestReg, bool IsZExt); unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT); unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT); - unsigned PPCMaterializeInt(const Constant *C, MVT VT, bool UseSExt = true); + unsigned PPCMaterializeInt(const ConstantInt *CI, MVT VT, + bool UseSExt = true); unsigned PPCMaterialize32BitInt(int64_t Imm, const TargetRegisterClass *RC); unsigned PPCMaterialize64BitInt(int64_t Imm, @@ -292,10 +293,7 @@ bool PPCFastISel::isValueAvailable(const Value *V) const { return true; const auto *I = cast<Instruction>(V); - if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) - return true; - - return false; + return FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB; } // Given a value Obj, create an Address object Addr that represents its @@ -527,9 +525,9 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, // VSX only provides an indexed load. if (Is32VSXLoad || Is64VSXLoad) return false; - MachineMemOperand *MMO = - FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset), + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI, + Addr.Offset), MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI), MFI.getObjectAlignment(Addr.Base.FI)); @@ -660,9 +658,9 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) { // VSX only provides an indexed store. 
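Editor's note: the PPCEarlyReturn hunks above rewrite branches that target a return-only block into returns themselves (B becomes BLR, BCC becomes BCCLR, BC/BCn become BCLR/BCLRn) and then drop the edge with removeSuccessor(&ReturnMBB, true) so the remaining successor probabilities are renormalized. A toy model of the unconditional case, with small structs rather than MachineBasicBlocks; the conditional variants follow the same shape:

#include <cstdio>
#include <string>
#include <vector>

struct Block {
  std::vector<std::string> Insts;
  Block *BranchTarget;   // set when the last instruction is an unconditional branch
};

bool foldEarlyReturn(Block &Pred, Block &ReturnBlock) {
  if (Pred.Insts.empty() || Pred.BranchTarget != &ReturnBlock)
    return false;                                      // not a branch to the return block
  if (ReturnBlock.Insts.size() != 1 || ReturnBlock.Insts.back() != "blr")
    return false;                                      // target is not return-only
  Pred.Insts.back() = "blr";                           // branch-to-return becomes a return
  Pred.BranchTarget = nullptr;                         // successor edge goes away
  return true;
}

int main() {
  Block Ret{{"blr"}, nullptr};
  Block Pred{{"addi r3, r3, 1", "b ret"}, &Ret};
  if (foldEarlyReturn(Pred, Ret))
    std::printf("pred now ends in: %s\n", Pred.Insts.back().c_str()); // blr
}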
if (Is32VSXStore || Is64VSXStore) return false; - MachineMemOperand *MMO = - FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset), + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI, + Addr.Offset), MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI), MFI.getObjectAlignment(Addr.Base.FI)); @@ -774,8 +772,7 @@ bool PPCFastISel::SelectBranch(const Instruction *I) { BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC)) .addImm(PPCPred).addReg(CondReg).addMBB(TBB); - fastEmitBranch(FBB, DbgLoc); - FuncInfo.MBB->addSuccessor(TBB); + finishCondBranch(BI->getParent(), TBB, FBB); return true; } } else if (const ConstantInt *CI = @@ -1607,21 +1604,18 @@ bool PPCFastISel::SelectRet(const Instruction *I) { if (ValLocs.size() > 1) return false; - // Special case for returning a constant integer of any size. - // Materialize the constant as an i64 and copy it to the return - // register. We still need to worry about properly extending the sign. E.g: - // If the constant has only one bit, it means it is a boolean. Therefore - // we can't use PPCMaterializeInt because it extends the sign which will - // cause negations of the returned value to be incorrect as they are - // implemented as the flip of the least significant bit. - if (isa<ConstantInt>(*RV)) { - const Constant *C = cast<Constant>(RV); - + // Special case for returning a constant integer of any size - materialize + // the constant as an i64 and copy it to the return register. + if (const ConstantInt *CI = dyn_cast<ConstantInt>(RV)) { CCValAssign &VA = ValLocs[0]; unsigned RetReg = VA.getLocReg(); - unsigned SrcReg = PPCMaterializeInt(C, MVT::i64, - VA.getLocInfo() == CCValAssign::SExt); + // We still need to worry about properly extending the sign. For example, + // we could have only a single bit or a constant that needs zero + // extension rather than sign extension. Make sure we pass the return + // value extension property to integer materialization. + unsigned SrcReg = + PPCMaterializeInt(CI, MVT::i64, VA.getLocInfo() == CCValAssign::SExt); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), RetReg).addReg(SrcReg); @@ -1761,8 +1755,8 @@ bool PPCFastISel::SelectIndirectBr(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCTR8)); const IndirectBrInst *IB = cast<IndirectBrInst>(I); - for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i) - FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]); + for (const BasicBlock *SuccBB : IB->successors()) + FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[SuccBB]); return true; } @@ -1898,10 +1892,9 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) { unsigned DestReg = createResultReg(TLI.getRegClassFor(VT)); CodeModel::Model CModel = TM.getCodeModel(); - MachineMemOperand *MMO = - FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, - (VT == MVT::f32) ? 4 : 8, Align); + MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( + MachinePointerInfo::getConstantPool(*FuncInfo.MF), + MachineMemOperand::MOLoad, (VT == MVT::f32) ? 4 : 8, Align); unsigned Opc = (VT == MVT::f32) ? 
PPC::LFS : PPC::LFD; unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass); @@ -1976,19 +1969,15 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA), HighPartReg).addReg(PPC::X2).addGlobalAddress(GV); - // If/when switches are implemented, jump tables should be handled - // on the "if" path here. - if (CModel == CodeModel::Large || - (GV->getType()->getElementType()->isFunctionTy() && - !GV->isStrongDefinitionForLinker()) || - GV->isDeclaration() || GV->hasCommonLinkage() || - GV->hasAvailableExternallyLinkage()) + unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV); + if (GVFlags & PPCII::MO_NLP_FLAG) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL), DestReg).addGlobalAddress(GV).addReg(HighPartReg); - else + } else { // Otherwise generate the ADDItocL. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDItocL), DestReg).addReg(HighPartReg).addGlobalAddress(GV); + } } return DestReg; @@ -2085,12 +2074,11 @@ unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm, // Materialize an integer constant into a register, and return // the register number (or zero if we failed to handle it). -unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT, - bool UseSExt) { +unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT, + bool UseSExt) { // If we're using CR bit registers for i1 values, handle that as a special // case first. if (VT == MVT::i1 && PPCSubTarget->useCRBits()) { - const ConstantInt *CI = cast<ConstantInt>(C); unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg); @@ -2105,12 +2093,17 @@ unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT, &PPC::GPRCRegClass); // If the constant is in range, use a load-immediate. - const ConstantInt *CI = cast<ConstantInt>(C); - if (isInt<16>(CI->getSExtValue())) { + if (UseSExt && isInt<16>(CI->getSExtValue())) { + unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI; + unsigned ImmReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) + .addImm(CI->getSExtValue()); + return ImmReg; + } else if (!UseSExt && isUInt<16>(CI->getZExtValue())) { unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI; unsigned ImmReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg) - .addImm( (UseSExt) ? CI->getSExtValue() : CI->getZExtValue() ); + .addImm(CI->getZExtValue()); return ImmReg; } @@ -2138,8 +2131,8 @@ unsigned PPCFastISel::fastMaterializeConstant(const Constant *C) { return PPCMaterializeFP(CFP, VT); else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C)) return PPCMaterializeGV(GV, VT); - else if (isa<ConstantInt>(C)) - return PPCMaterializeInt(C, VT, VT != MVT::i1); + else if (const ConstantInt *CI = dyn_cast<ConstantInt>(C)) + return PPCMaterializeInt(CI, VT, VT != MVT::i1); return 0; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 08ae717..beab844 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -30,7 +30,7 @@ using namespace llvm; /// VRRegNo - Map from a numbered VR register to its enum value. 
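Editor's note: PPCMaterializeInt above now takes the ConstantInt directly and emits a single load-immediate only when the value fits the signed 16-bit window (if the caller asked for sign extension) or the unsigned 16-bit window (if it asked for zero extension); anything else falls through to the multi-instruction path. A standalone sketch of that decision, using plain helpers and invented names in place of LLVM's isInt<16>/isUInt<16>:

#include <cstdint>
#include <cstdio>

bool isInt16(int64_t X) { return X >= -32768 && X <= 32767; }
bool isUInt16(uint64_t X) { return X <= 65535; }

const char *materializePlan(int64_t SExtVal, uint64_t ZExtVal, bool UseSExt) {
  if (UseSExt && isInt16(SExtVal))
    return "single li (sign-extended immediate)";
  if (!UseSExt && isUInt16(ZExtVal))
    return "single li (zero-extended immediate)";
  return "lis + ori sequence";
}

int main() {
  // The same 16-bit all-ones pattern is -1 under sign extension and 65535
  // under zero extension; both fit one li, but only via the matching check.
  std::printf("%s\n", materializePlan(-1, 0xFFFF, true));
  std::printf("%s\n", materializePlan(-1, 0xFFFF, false));
  std::printf("%s\n", materializePlan(0x12345, 0x12345, true)); // needs lis+ori
}

This is the same extension concern that the SelectRet comment above spells out for returned booleans.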
/// -static const uint16_t VRRegNo[] = { +static const MCPhysReg VRRegNo[] = { PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 , PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15, PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23, @@ -270,7 +270,7 @@ static void RemoveVRSaveCode(MachineInstr *MI) { // epilog blocks. for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) { // If last instruction is a return instruction, add an epilogue - if (!I->empty() && I->back().isReturn()) { + if (I->isReturnBlock()) { bool FoundIt = false; for (MBBI = I->end(); MBBI != I->begin(); ) { --MBBI; @@ -306,9 +306,10 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); DebugLoc dl = MI->getDebugLoc(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned UsedRegMask = 0; for (unsigned i = 0; i != 32; ++i) - if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i])) + if (MRI.isPhysRegModified(VRRegNo[i])) UsedRegMask |= 1 << (31-i); // Live in and live out values already must be in the mask, so don't bother @@ -325,7 +326,7 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) { for (MachineFunction::const_iterator BI = MF->begin(), BE = MF->end(); UsedRegMask != 0 && BI != BE; ++BI) { const MachineBasicBlock &MBB = *BI; - if (MBB.empty() || !MBB.back().isReturn()) + if (!MBB.isReturnBlock()) continue; const MachineInstr &Ret = MBB.back(); for (unsigned I = 0, E = Ret.getNumOperands(); I != E; ++I) { @@ -555,9 +556,67 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const { } } +bool PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB, + bool UseAtEnd, + unsigned *ScratchRegister) const { + RegScavenger RS; + unsigned R0 = Subtarget.isPPC64() ? PPC::X0 : PPC::R0; + + if (ScratchRegister) + *ScratchRegister = R0; + + // If MBB is an entry or exit block, use R0 as the scratch register + if ((UseAtEnd && MBB->isReturnBlock()) || + (!UseAtEnd && (&MBB->getParent()->front() == MBB))) + return true; + + RS.enterBasicBlock(MBB); + + if (UseAtEnd && !MBB->empty()) { + // The scratch register will be used at the end of the block, so must consider + // all registers used within the block + + MachineBasicBlock::iterator MBBI = MBB->getFirstTerminator(); + // If no terminator, back iterator up to previous instruction. + if (MBBI == MBB->end()) + MBBI = std::prev(MBBI); + + if (MBBI != MBB->begin()) + RS.forward(MBBI); + } + + if (!RS.isRegUsed(R0)) + return true; + + unsigned Reg = RS.FindUnusedReg(Subtarget.isPPC64() ? 
&PPC::G8RCRegClass + : &PPC::GPRCRegClass); + + // Make sure the register scavenger was able to find an available register + // If not, use R0 but return false to indicate no register was available and + // R0 must be used (as recommended by the ABI) + if (Reg == 0) + return false; + + if (ScratchRegister) + *ScratchRegister = Reg; + + return true; +} + +bool PPCFrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const { + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); + + return findScratchRegister(TmpMBB, false, nullptr); +} + +bool PPCFrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); + + return findScratchRegister(TmpMBB, true, nullptr); +} + void PPCFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const PPCInstrInfo &TII = @@ -589,7 +648,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, } } - // Move MBBI back to the beginning of the function. + // Move MBBI back to the beginning of the prologue block. MBBI = MBB.begin(); // Work out frame sizes. @@ -613,7 +672,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, unsigned BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31; unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR; - unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0; + unsigned ScratchReg = 0; unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg // ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.) const MCInstrDesc& MFLRInst = TII.get(isPPC64 ? PPC::MFLR8 @@ -642,6 +701,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) && "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4."); + findScratchRegister(&MBB, false, &ScratchReg); + assert(ScratchReg && "No scratch register!"); + int LROffset = getReturnSaveOffset(); int FPOffset = 0; @@ -916,27 +978,18 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF, } void PPCFrameLowering::emitEpilogue(MachineFunction &MF, - MachineBasicBlock &MBB) const { - MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI != MBB.end() && "Returning block has no terminator"); + MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); + DebugLoc dl; + + if (MBBI != MBB.end()) + dl = MBBI->getDebugLoc(); + const PPCInstrInfo &TII = *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo()); const PPCRegisterInfo *RegInfo = static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo()); - unsigned RetOpcode = MBBI->getOpcode(); - DebugLoc dl; - - assert((RetOpcode == PPC::BLR || - RetOpcode == PPC::BLR8 || - RetOpcode == PPC::TCRETURNri || - RetOpcode == PPC::TCRETURNdi || - RetOpcode == PPC::TCRETURNai || - RetOpcode == PPC::TCRETURNri8 || - RetOpcode == PPC::TCRETURNdi8 || - RetOpcode == PPC::TCRETURNai8) && - "Can only insert epilog into returning blocks"); - // Get alignment info so we know how to restore the SP. const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -959,7 +1012,7 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1; unsigned BPReg = RegInfo->getBaseRegister(MF); unsigned FPReg = isPPC64 ? 
PPC::X31 : PPC::R31; - unsigned ScratchReg = isPPC64 ? PPC::X0 : PPC::R0; + unsigned ScratchReg = 0; unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg const MCInstrDesc& MTLRInst = TII.get( isPPC64 ? PPC::MTLR8 : PPC::MTLR ); @@ -973,10 +1026,14 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, : PPC::ADDI ); const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8 : PPC::ADD4 ); - + int LROffset = getReturnSaveOffset(); int FPOffset = 0; + + findScratchRegister(&MBB, true, &ScratchReg); + assert(ScratchReg && "No scratch register!"); + if (HasFP) { if (isSVR4ABI) { MachineFrameInfo *FFI = MF.getFrameInfo(); @@ -1008,25 +1065,30 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, PBPOffset = FFI->getObjectOffset(PBPIndex); } - bool UsesTCRet = RetOpcode == PPC::TCRETURNri || - RetOpcode == PPC::TCRETURNdi || - RetOpcode == PPC::TCRETURNai || - RetOpcode == PPC::TCRETURNri8 || - RetOpcode == PPC::TCRETURNdi8 || - RetOpcode == PPC::TCRETURNai8; - - if (UsesTCRet) { - int MaxTCRetDelta = FI->getTailCallSPDelta(); - MachineOperand &StackAdjust = MBBI->getOperand(1); - assert(StackAdjust.isImm() && "Expecting immediate value."); - // Adjust stack pointer. - int StackAdj = StackAdjust.getImm(); - int Delta = StackAdj - MaxTCRetDelta; - assert((Delta >= 0) && "Delta must be positive"); - if (MaxTCRetDelta>0) - FrameSize += (StackAdj +Delta); - else - FrameSize += StackAdj; + bool IsReturnBlock = (MBBI != MBB.end() && MBBI->isReturn()); + + if (IsReturnBlock) { + unsigned RetOpcode = MBBI->getOpcode(); + bool UsesTCRet = RetOpcode == PPC::TCRETURNri || + RetOpcode == PPC::TCRETURNdi || + RetOpcode == PPC::TCRETURNai || + RetOpcode == PPC::TCRETURNri8 || + RetOpcode == PPC::TCRETURNdi8 || + RetOpcode == PPC::TCRETURNai8; + + if (UsesTCRet) { + int MaxTCRetDelta = FI->getTailCallSPDelta(); + MachineOperand &StackAdjust = MBBI->getOperand(1); + assert(StackAdjust.isImm() && "Expecting immediate value."); + // Adjust stack pointer. + int StackAdj = StackAdjust.getImm(); + int Delta = StackAdj - MaxTCRetDelta; + assert((Delta >= 0) && "Delta must be positive"); + if (MaxTCRetDelta>0) + FrameSize += (StackAdj +Delta); + else + FrameSize += StackAdj; + } } // Frames of 32KB & larger require special handling because they cannot be @@ -1066,7 +1128,6 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, .addImm(0) .addReg(SPReg); } - } if (MustSaveLR) @@ -1109,52 +1170,55 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF, // Callee pop calling convention. Pop parameter/linkage area. 
Used for tail // call optimization - if (MF.getTarget().Options.GuaranteedTailCallOpt && - (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) && - MF.getFunction()->getCallingConv() == CallingConv::Fast) { - PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); - unsigned CallerAllocatedAmt = FI->getMinReservedArea(); - - if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) { - BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) - .addReg(SPReg).addImm(CallerAllocatedAmt); - } else { - BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) + if (IsReturnBlock) { + unsigned RetOpcode = MBBI->getOpcode(); + if (MF.getTarget().Options.GuaranteedTailCallOpt && + (RetOpcode == PPC::BLR || RetOpcode == PPC::BLR8) && + MF.getFunction()->getCallingConv() == CallingConv::Fast) { + PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>(); + unsigned CallerAllocatedAmt = FI->getMinReservedArea(); + + if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) { + BuildMI(MBB, MBBI, dl, AddImmInst, SPReg) + .addReg(SPReg).addImm(CallerAllocatedAmt); + } else { + BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg) .addImm(CallerAllocatedAmt >> 16); - BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) + BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg) .addReg(ScratchReg, RegState::Kill) .addImm(CallerAllocatedAmt & 0xFFFF); - BuildMI(MBB, MBBI, dl, AddInst) + BuildMI(MBB, MBBI, dl, AddInst) .addReg(SPReg) .addReg(FPReg) .addReg(ScratchReg); - } - } else if (RetOpcode == PPC::TCRETURNdi) { - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)). - addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); - } else if (RetOpcode == PPC::TCRETURNri) { - MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR)); - } else if (RetOpcode == PPC::TCRETURNai) { - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm()); - } else if (RetOpcode == PPC::TCRETURNdi8) { - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)). - addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); - } else if (RetOpcode == PPC::TCRETURNri8) { - MBBI = MBB.getLastNonDebugInstr(); - assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8)); - } else if (RetOpcode == PPC::TCRETURNai8) { - MBBI = MBB.getLastNonDebugInstr(); - MachineOperand &JumpTarget = MBBI->getOperand(0); - BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm()); + } + } else if (RetOpcode == PPC::TCRETURNdi) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)). 
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + } else if (RetOpcode == PPC::TCRETURNri) { + MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR)); + } else if (RetOpcode == PPC::TCRETURNai) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm()); + } else if (RetOpcode == PPC::TCRETURNdi8) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)). + addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset()); + } else if (RetOpcode == PPC::TCRETURNri8) { + MBBI = MBB.getLastNonDebugInstr(); + assert(MBBI->getOperand(0).isReg() && "Expecting register operand."); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8)); + } else if (RetOpcode == PPC::TCRETURNai8) { + MBBI = MBB.getLastNonDebugInstr(); + MachineOperand &JumpTarget = MBBI->getOperand(0); + BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm()); + } } } @@ -1200,8 +1264,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, // Reserve stack space for the PIC Base register (R30). // Only used in SVR4 32-bit. if (FI->usesPICBase()) { - int PBPSI = FI->getPICBasePointerSaveIndex(); - PBPSI = MFI->CreateFixedObject(4, -8, true); + int PBPSI = MFI->CreateFixedObject(4, -8, true); FI->setPICBasePointerSaveIndex(PBPSI); } @@ -1710,3 +1773,8 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, return true; } + +bool PPCFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { + return (MF.getSubtarget<PPCSubtarget>().isSVR4ABI() && + MF.getSubtarget<PPCSubtarget>().isPPC64()); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h index d6a389b..bbe1329 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCFrameLowering.h @@ -29,6 +29,30 @@ class PPCFrameLowering: public TargetFrameLowering { const unsigned LinkageSize; const unsigned BasePointerSaveOffset; + /** + * \brief Find a register that can be used in function prologue and epilogue + * + * Find a register that can be use as the scratch register in function + * prologue and epilogue to save various registers (Link Register, Base + * Pointer, etc.). Prefer R0, if it is available. If it is not available, + * then choose a different register. + * + * This method will return true if an available register was found (including + * R0). If no available registers are found, the method returns false and sets + * ScratchRegister to R0, as per the recommendation in the ABI. + * + * \param[in] MBB The machine basic block to find an available register for + * \param[in] UseAtEnd Specify whether the scratch register will be used at + * the end of the basic block (i.e., will the scratch + * register kill a register defined in the basic block) + * \param[out] ScratchRegister The scratch register to use + * \return true if a scratch register was found. false of a scratch register + * was not found and R0 is being used as the default. 
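Editor's note: the shrink-wrapping support threaded through the PPCFrameLowering hunks above hangs off findScratchRegister, whose doxygen comment concludes just below: prefer R0, accept it unconditionally in entry and return blocks, otherwise consult the register scavenger, and if nothing is free report failure so canUseAsPrologue/canUseAsEpilogue can reject the block. A standalone sketch of that policy, with a set of live register names standing in for the scavenger and an arbitrary candidate pool in place of the GPRC/G8RC class:

#include <cstdio>
#include <set>
#include <string>

bool findScratchRegister(bool EntryOrReturnBlock,
                         const std::set<std::string> &LiveRegs,
                         std::string *Scratch) {
  if (Scratch)
    *Scratch = "r0";                       // default and preferred choice
  if (EntryOrReturnBlock)
    return true;                           // r0 is always usable here
  if (!LiveRegs.count("r0"))
    return true;                           // r0 is free in this block
  for (const char *Candidate : {"r11", "r12", "r29", "r30"}) // illustrative pool
    if (!LiveRegs.count(Candidate)) {
      if (Scratch)
        *Scratch = Candidate;
      return true;
    }
  return false;  // nothing free: caller rejects this block as a prologue/epilogue site
}

int main() {
  std::string Reg;
  bool OK = findScratchRegister(false, {"r0", "r11"}, &Reg);
  std::printf("%s (ok=%d)\n", Reg.c_str(), OK);   // r12 (ok=1)
}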
+ */ + bool findScratchRegister(MachineBasicBlock *MBB, + bool UseAtEnd, + unsigned *ScratchRegister) const; + public: PPCFrameLowering(const PPCSubtarget &STI); @@ -92,6 +116,13 @@ public: const SpillSlot * getCalleeSavedSpillSlots(unsigned &NumEntries) const override; + + bool enableShrinkWrapping(const MachineFunction &MF) const override; + + /// Methods used by shrink wrapping to determine if MBB can be used for the + /// function prologue/epilogue. + bool canUseAsPrologue(const MachineBasicBlock &MBB) const override; + bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 9322268..1eaa811 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -16,6 +16,8 @@ #include "MCTargetDesc/PPCPredicates.h" #include "PPCMachineFunctionInfo.h" #include "PPCTargetMachine.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -52,6 +54,11 @@ static cl::opt<bool> BPermRewriterNoMasking( "bit permutations"), cl::Hidden); +static cl::opt<bool> EnableBranchHint( + "ppc-use-branch-hint", cl::init(true), + cl::desc("Enable static hinting of branches on ppc"), + cl::Hidden); + namespace llvm { void initializePPCDAGToDAGISelPass(PassRegistry&); } @@ -286,7 +293,7 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) { // Find all return blocks, outputting a restore in each epilog. for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { - if (!BB->empty() && BB->back().isReturn()) { + if (BB->isReturnBlock()) { IP = BB->end(); --IP; // Skip over all terminator instructions, which are part of the return @@ -393,6 +400,55 @@ static bool isInt32Immediate(SDValue N, unsigned &Imm) { return isInt32Immediate(N.getNode(), Imm); } +static unsigned getBranchHint(unsigned PCC, FunctionLoweringInfo *FuncInfo, + const SDValue &DestMBB) { + assert(isa<BasicBlockSDNode>(DestMBB)); + + if (!FuncInfo->BPI) return PPC::BR_NO_HINT; + + const BasicBlock *BB = FuncInfo->MBB->getBasicBlock(); + const TerminatorInst *BBTerm = BB->getTerminator(); + + if (BBTerm->getNumSuccessors() != 2) return PPC::BR_NO_HINT; + + const BasicBlock *TBB = BBTerm->getSuccessor(0); + const BasicBlock *FBB = BBTerm->getSuccessor(1); + + auto TProb = FuncInfo->BPI->getEdgeProbability(BB, TBB); + auto FProb = FuncInfo->BPI->getEdgeProbability(BB, FBB); + + // We only want to handle cases which are easy to predict at static time, e.g. + // C++ throw statement, that is very likely not taken, or calling never + // returned function, e.g. stdlib exit(). So we set Threshold to filter + // unwanted cases. + // + // Below is LLVM branch weight table, we only want to handle case 1, 2 + // + // Case Taken:Nontaken Example + // 1. Unreachable 1048575:1 C++ throw, stdlib exit(), + // 2. Invoke-terminating 1:1048575 + // 3. Coldblock 4:64 __builtin_expect + // 4. Loop Branch 124:4 For loop + // 5. 
PH/ZH/FPH 20:12 + const uint32_t Threshold = 10000; + + if (std::max(TProb, FProb) / Threshold < std::min(TProb, FProb)) + return PPC::BR_NO_HINT; + + DEBUG(dbgs() << "Use branch hint for '" << FuncInfo->Fn->getName() << "::" + << BB->getName() << "'\n" + << " -> " << TBB->getName() << ": " << TProb << "\n" + << " -> " << FBB->getName() << ": " << FProb << "\n"); + + const BasicBlockSDNode *BBDN = cast<BasicBlockSDNode>(DestMBB); + + // If Dest BasicBlock is False-BasicBlock (FBB), swap branch probabilities, + // because we want 'TProb' stands for 'branch probability' to Dest BasicBlock + if (BBDN->getBasicBlock()->getBasicBlock() != TBB) + std::swap(TProb, FProb); + + return (TProb > FProb) ? PPC::BR_TAKEN_HINT : PPC::BR_NONTAKEN_HINT; +} // isOpcWithIntImmediate - This method tests to see if the node is a specific // opcode and that it has a immediate integer right operand. @@ -564,7 +620,6 @@ static unsigned SelectInt64CountDirect(int64_t Imm) { // Handle first 32 bits. unsigned Lo = Imm & 0xFFFF; - unsigned Hi = (Imm >> 16) & 0xFFFF; // Simple value. if (isInt<16>(Imm)) { @@ -586,9 +641,9 @@ static unsigned SelectInt64CountDirect(int64_t Imm) { ++Result; // Add in the last bits as required. - if ((Hi = (Remainder >> 16) & 0xFFFF)) + if ((Remainder >> 16) & 0xFFFF) ++Result; - if ((Lo = Remainder & 0xFFFF)) + if (Remainder & 0xFFFF) ++Result; return Result; @@ -1028,7 +1083,7 @@ class BitPermutationSelector { BitGroups[BitGroups.size()-1].EndIdx == Bits.size()-1 && BitGroups[0].V == BitGroups[BitGroups.size()-1].V && BitGroups[0].RLAmt == BitGroups[BitGroups.size()-1].RLAmt) { - DEBUG(dbgs() << "\tcombining final bit group with inital one\n"); + DEBUG(dbgs() << "\tcombining final bit group with initial one\n"); BitGroups[BitGroups.size()-1].EndIdx = BitGroups[0].EndIdx; BitGroups.erase(BitGroups.begin()); } @@ -1557,10 +1612,7 @@ class BitPermutationSelector { return false; } - if (VRI.RLAmt != EffRLAmt) - return false; - - return true; + return VRI.RLAmt == EffRLAmt; }; for (auto &BG : BitGroups) { @@ -2781,7 +2833,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 || N->getValueType(0) == MVT::v2i64)) { ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); - + SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1), Op2 = N->getOperand(SVN->getMaskElt(1) < 2 ? 0 : 1); unsigned DM[2]; @@ -2798,7 +2850,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { LoadSDNode *LD = cast<LoadSDNode>(Op1.getOperand(0)); SDValue Base, Offset; - if (LD->isUnindexed() && + if (LD->isUnindexed() && LD->hasOneUse() && Op1.hasOneUse() && (LD->getMemoryVT() == MVT::f64 || LD->getMemoryVT() == MVT::i64) && SelectAddrIdxOnly(LD->getBasePtr(), Base, Offset)) { @@ -2841,8 +2893,11 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { // Op #3 is the Dest MBB // Op #4 is the Flag. // Prevent PPC::PRED_* from being selected into LI. 
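Editor's note: getBranchHint above only emits a static hint when the edge probabilities are extremely lopsided, that is, the unreachable and invoke-terminating weights from the table, and leaves anything milder to the hardware predictor; the 10000 threshold encodes that cut-off. A standalone model of the decision, with plain doubles in place of BranchProbability:

#include <algorithm>
#include <cstdio>

enum Hint { BR_NO_HINT, BR_TAKEN_HINT, BR_NONTAKEN_HINT };

Hint staticBranchHint(double ProbToDest, double ProbToOther) {
  const double Threshold = 10000.0;
  double Hi = std::max(ProbToDest, ProbToOther);
  double Lo = std::min(ProbToDest, ProbToOther);
  if (Hi / Threshold < Lo)          // not lopsided enough to be worth a hint
    return BR_NO_HINT;
  return ProbToDest > ProbToOther ? BR_TAKEN_HINT : BR_NONTAKEN_HINT;
}

int main() {
  std::printf("%d\n", staticBranchHint(1048575.0 / 1048576, 1.0 / 1048576)); // 1: taken hint
  std::printf("%d\n", staticBranchHint(124.0 / 128, 4.0 / 128));             // 0: no hint
}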
- SDValue Pred = - getI32Imm(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(), dl); + unsigned PCC = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + if (EnableBranchHint) + PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(3)); + + SDValue Pred = getI32Imm(PCC, dl); SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3), N->getOperand(0), N->getOperand(4) }; return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops); @@ -2871,6 +2926,9 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { BitComp, N->getOperand(4), N->getOperand(0)); } + if (EnableBranchHint) + PCC |= getBranchHint(PCC, FuncInfo, N->getOperand(4)); + SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl); SDValue Ops[] = { getI32Imm(PCC, dl), CondCode, N->getOperand(4), N->getOperand(0) }; @@ -2903,9 +2961,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { break; // The first source operand is a TargetGlobalAddress or a TargetJumpTable. - // If it is an externally defined symbol, a symbol with common linkage, - // a non-local function address, or a jump table address, or if we are - // generating code for large code model, we generate: + // If it must be toc-referenced according to PPCSubTarget, we generate: // LDtocL(<ga:@sym>, ADDIStocHA(%X2, <ga:@sym>)) // Otherwise we generate: // ADDItocL(ADDIStocHA(%X2, <ga:@sym>), <ga:@sym>) @@ -2920,13 +2976,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { MVT::i64, GA, SDValue(Tmp, 0))); if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) { - const GlobalValue *GValue = G->getGlobal(); - if ((GValue->getType()->getElementType()->isFunctionTy() && - !GValue->isStrongDefinitionForLinker()) || - GValue->isDeclaration() || GValue->hasCommonLinkage() || - GValue->hasAvailableExternallyLinkage()) + const GlobalValue *GV = G->getGlobal(); + unsigned char GVFlags = PPCSubTarget->classifyGlobalReference(GV); + if (GVFlags & PPCII::MO_NLP_FLAG) { return transferMemOperands(N, CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA, SDValue(Tmp, 0))); + } } return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64, @@ -3110,7 +3165,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) { if (!CurDAG->MaskedValueIsZero(Op0, APInt::getHighBitsSet(Bits, Bits - (b+1)*8))) return false; - + LHS = Op0.getOperand(0); RHS = Op0.getOperand(1); return true; @@ -3305,7 +3360,7 @@ void PPCDAGToDAGISel::PreprocessISelDAG() { bool MadeChange = false; while (Position != CurDAG->allnodes_begin()) { - SDNode *N = --Position; + SDNode *N = &*--Position; if (N->use_empty()) continue; @@ -3989,7 +4044,7 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() { bool MadeChange = false; while (Position != CurDAG->allnodes_begin()) { - SDNode *N = --Position; + SDNode *N = &*--Position; // Skip dead nodes and any non-machine opcodes. if (N->use_empty() || !N->isMachineOpcode()) continue; @@ -4145,7 +4200,7 @@ void PPCDAGToDAGISel::PeepholePPC64() { ++Position; while (Position != CurDAG->allnodes_begin()) { - SDNode *N = --Position; + SDNode *N = &*--Position; // Skip dead nodes and any non-machine opcodes. if (N->use_empty() || !N->isMachineOpcode()) continue; @@ -4184,16 +4239,24 @@ void PPCDAGToDAGISel::PeepholePPC64() { break; } - // If this is a load or store with a zero offset, we may be able to - // fold an add-immediate into the memory operation. 
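Editor's note: both here and in the PPCFastISel hunk earlier, the ad-hoc "does this global need an indirect TOC load" predicate is replaced by a single query, classifyGlobalReference, whose result is tested against the MO_NLP_FLAG bit to choose LDtocL over ADDItocL. A hedged sketch of that pattern; the flag values and the predicate set inside the classifier are illustrative, not the real PPCII encoding:

#include <cstdio>

enum : unsigned { MO_NLP_FLAG = 1u << 0, MO_PIC_FLAG = 1u << 1 };

struct GlobalInfo {
  bool IsFunctionDeclaration;
  bool HasCommonLinkage;
  bool IsLargeCodeModel;
};

unsigned classifyGlobalReference(const GlobalInfo &GV) {
  unsigned Flags = 0;
  if (GV.IsFunctionDeclaration || GV.HasCommonLinkage || GV.IsLargeCodeModel)
    Flags |= MO_NLP_FLAG;          // must be reached indirectly (LDtocL)
  return Flags;
}

int main() {
  GlobalInfo Local{false, false, false}, Extern{true, false, false};
  std::printf("local:  %s\n",
              (classifyGlobalReference(Local) & MO_NLP_FLAG) ? "LDtocL" : "ADDItocL");
  std::printf("extern: %s\n",
              (classifyGlobalReference(Extern) & MO_NLP_FLAG) ? "LDtocL" : "ADDItocL");
}

Centralizing the classification means every use site stays in sync when the linkage rules change.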
- if (!isa<ConstantSDNode>(N->getOperand(FirstOp)) || - N->getConstantOperandVal(FirstOp) != 0) + // If this is a load or store with a zero offset, or within the alignment, + // we may be able to fold an add-immediate into the memory operation. + // The check against alignment is below, as it can't occur until we check + // the arguments to N + if (!isa<ConstantSDNode>(N->getOperand(FirstOp))) continue; SDValue Base = N->getOperand(FirstOp + 1); if (!Base.isMachineOpcode()) continue; + // On targets with fusion, we don't want this to fire and remove a fusion + // opportunity, unless a) it results in another fusion opportunity or + // b) optimizing for size. + if (PPCSubTarget->hasFusion() && + (!MF->getFunction()->optForSize() && !Base.hasOneUse())) + continue; + unsigned Flags = 0; bool ReplaceFlags = true; @@ -4237,6 +4300,17 @@ void PPCDAGToDAGISel::PeepholePPC64() { break; } + SDValue ImmOpnd = Base.getOperand(1); + int MaxDisplacement = 0; + if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) { + const GlobalValue *GV = GA->getGlobal(); + MaxDisplacement = GV->getAlignment() - 1; + } + + int Offset = N->getConstantOperandVal(FirstOp); + if (Offset < 0 || Offset > MaxDisplacement) + continue; + // We found an opportunity. Reverse the operands from the add // immediate and substitute them into the load or store. If // needed, update the target flags for the immediate operand to @@ -4247,8 +4321,6 @@ void PPCDAGToDAGISel::PeepholePPC64() { DEBUG(N->dump(CurDAG)); DEBUG(dbgs() << "\n"); - SDValue ImmOpnd = Base.getOperand(1); - // If the relocation information isn't already present on the // immediate operand, add it now. if (ReplaceFlags) { @@ -4259,17 +4331,17 @@ void PPCDAGToDAGISel::PeepholePPC64() { // is insufficient for the instruction encoding. if (GV->getAlignment() < 4 && (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD || - StorageOpcode == PPC::LWA)) { + StorageOpcode == PPC::LWA || (Offset % 4) != 0)) { DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n"); continue; } - ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags); + ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, Offset, Flags); } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(ImmOpnd)) { const Constant *C = CP->getConstVal(); ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64, CP->getAlignment(), - 0, Flags); + Offset, Flags); } } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 1b8f8fb..af9ad07 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -42,10 +42,6 @@ using namespace llvm; -// FIXME: Remove this once soft-float is supported. -static cl::opt<bool> DisablePPCFloatInVariadic("disable-ppc-float-in-variadic", -cl::desc("disable saving float registers for va_start on PPC"), cl::Hidden); - static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc", cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden); @@ -72,8 +68,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // Set up the register classes. 
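Editor's note: the PeepholePPC64 change above relaxes the add-immediate folding from "offset must be zero" to "offset must stay within the global's known alignment", with an extra alignment check because ld/std/lwa are DS-form and encode a displacement that must be a multiple of four. A simplified standalone version of those checks (the real hunk additionally bails out when folding would destroy an instruction-fusion opportunity):

#include <cstdio>

// DSForm marks ld/std/lwa, whose displacement field drops the low two bits.
bool canFoldOffset(int Offset, unsigned GlobalAlign, bool DSForm) {
  int MaxDisplacement = static_cast<int>(GlobalAlign) - 1;
  if (Offset < 0 || Offset > MaxDisplacement)
    return false;                                  // must stay inside the known alignment
  if (GlobalAlign < 4 && (DSForm || Offset % 4 != 0))
    return false;                                  // not enough alignment for ds-form use
  return true;
}

int main() {
  std::printf("%d %d %d\n",
              canFoldOffset(8, 16, true),   // 1: within align-1, multiple of 4
              canFoldOffset(16, 16, true),  // 0: offset exceeds align-1
              canFoldOffset(0, 2, true));   // 0: ds-form but alignment below 4
}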
addRegisterClass(MVT::i32, &PPC::GPRCRegClass); - addRegisterClass(MVT::f32, &PPC::F4RCRegClass); - addRegisterClass(MVT::f64, &PPC::F8RCRegClass); + if (!Subtarget.useSoftFloat()) { + addRegisterClass(MVT::f32, &PPC::F4RCRegClass); + addRegisterClass(MVT::f64, &PPC::F8RCRegClass); + } // PowerPC has an i16 but no i8 (or i1) SEXTLOAD for (MVT VT : MVT::integer_valuetypes()) { @@ -107,8 +105,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, isPPC64 ? MVT::i64 : MVT::i32); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); - AddPromotedToType (ISD::UINT_TO_FP, MVT::i1, - isPPC64 ? MVT::i64 : MVT::i32); + AddPromotedToType(ISD::UINT_TO_FP, MVT::i1, + isPPC64 ? MVT::i64 : MVT::i32); } else { setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom); @@ -257,10 +255,17 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand); - setOperationAction(ISD::BITCAST, MVT::f32, Expand); - setOperationAction(ISD::BITCAST, MVT::i32, Expand); - setOperationAction(ISD::BITCAST, MVT::i64, Expand); - setOperationAction(ISD::BITCAST, MVT::f64, Expand); + if (Subtarget.hasDirectMove()) { + setOperationAction(ISD::BITCAST, MVT::f32, Legal); + setOperationAction(ISD::BITCAST, MVT::i32, Legal); + setOperationAction(ISD::BITCAST, MVT::i64, Legal); + setOperationAction(ISD::BITCAST, MVT::f64, Legal); + } else { + setOperationAction(ISD::BITCAST, MVT::f32, Expand); + setOperationAction(ISD::BITCAST, MVT::i32, Expand); + setOperationAction(ISD::BITCAST, MVT::i64, Expand); + setOperationAction(ISD::BITCAST, MVT::f64, Expand); + } // We cannot sextinreg(i1). Expand to shifts. setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); @@ -329,6 +334,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STACKRESTORE , MVT::Other, Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64 , Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, MVT::i64, Custom); // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -403,9 +410,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // will selectively turn on ones that can be effectively codegen'd. for (MVT VT : MVT::vector_valuetypes()) { // add/sub are legal for all supported vector VT's. 
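Editor's note: the BITCAST hunk above marks f32/f64 to i32/i64 bitcasts Legal when the subtarget has direct-move (GPR to VSR) instructions, and leaves them Expand, that is, a store and reload through the stack, otherwise. The operation itself is only a reinterpretation of the bits, as this small memcpy-based model shows; mfvsrd is the direct-move instruction assumed here for the f64-to-i64 direction:

#include <cstdint>
#include <cstdio>
#include <cstring>

uint64_t bitcastF64ToI64(double D) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));  // one mfvsrd with direct move,
  return Bits;                           // or a stfd/ld pair through the stack without it
}

int main() {
  std::printf("0x%016llx\n", (unsigned long long)bitcastF64ToI64(1.0)); // 0x3ff0000000000000
}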
- setOperationAction(ISD::ADD , VT, Legal); - setOperationAction(ISD::SUB , VT, Legal); - + setOperationAction(ISD::ADD, VT, Legal); + setOperationAction(ISD::SUB, VT, Legal); + // Vector instructions introduced in P8 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) { setOperationAction(ISD::CTPOP, VT, Legal); @@ -477,6 +484,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::ROTR, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); @@ -519,12 +528,11 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); } - - if (Subtarget.hasP8Altivec()) + if (Subtarget.hasP8Altivec()) setOperationAction(ISD::MUL, MVT::v4i32, Legal); else setOperationAction(ISD::MUL, MVT::v4i32, Custom); - + setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v16i8, Custom); @@ -545,6 +553,21 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (Subtarget.hasVSX()) { setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); + if (Subtarget.hasP8Vector()) { + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal); + } + if (Subtarget.hasDirectMove()) { + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); + } + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); @@ -813,15 +836,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setLibcallName(RTLIB::SRA_I128, nullptr); } - if (isPPC64) { - setStackPointerRegisterToSaveRestore(PPC::X1); - setExceptionPointerRegister(PPC::X3); - setExceptionSelectorRegister(PPC::X4); - } else { - setStackPointerRegisterToSaveRestore(PPC::R1); - setExceptionPointerRegister(PPC::R3); - setExceptionSelectorRegister(PPC::R4); - } + setStackPointerRegisterToSaveRestore(isPPC64 ? 
PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::SINT_TO_FP); @@ -942,9 +957,9 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign, if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast<StructType>(Ty)) { - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; - getMaxByValAlign(STy->getElementType(i), EltAlign, MaxMaxAlign); + getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == MaxMaxAlign) @@ -969,6 +984,10 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty, return Align; } +bool PPCTargetLowering::useSoftFloat() const { + return Subtarget.useSoftFloat(); +} + const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((PPCISD::NodeType)Opcode) { case PPCISD::FIRST_NUMBER: break; @@ -992,6 +1011,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::Lo: return "PPCISD::Lo"; case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; + case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; case PPCISD::SRL: return "PPCISD::SRL"; case PPCISD::SRA: return "PPCISD::SRA"; @@ -1236,7 +1256,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize, /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes). -/// The ShuffleKind distinguishes between big-endian merges with two +/// The ShuffleKind distinguishes between big-endian merges with two /// different inputs (0), either-endian merges with two identical inputs (1), /// and little-endian merges with two different inputs (2). For the latter, /// the input operands are swapped (see PPCInstrAltivec.td). @@ -1261,7 +1281,7 @@ bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize, /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes). -/// The ShuffleKind distinguishes between big-endian merges with two +/// The ShuffleKind distinguishes between big-endian merges with two /// different inputs (0), either-endian merges with two identical inputs (1), /// and little-endian merges with two different inputs (2). For the latter, /// the input operands are swapped (see PPCInstrAltivec.td). @@ -1353,7 +1373,7 @@ static bool isVMerge(ShuffleVectorSDNode *N, unsigned IndexOffset, * - 2 = little-endian merge with two different inputs (inputs are swapped for * little-endian merges). * \param[in] DAG The current SelectionDAG - * \return true iff this shuffle mask + * \return true iff this shuffle mask */ bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, unsigned ShuffleKind, SelectionDAG &DAG) { @@ -1380,7 +1400,7 @@ bool PPC::isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift /// amount, otherwise return -1. -/// The ShuffleKind distinguishes between big-endian operations with two +/// The ShuffleKind distinguishes between big-endian operations with two /// different inputs (0), either-endian operations with two identical inputs /// (1), and little-endian operations with two different inputs (2). 
For the /// latter, the input operands are swapped (see PPCInstrAltivec.td). @@ -1513,8 +1533,8 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) { for (unsigned i = 0; i != Multiple-1; ++i) { if (!UniquedVals[i].getNode()) continue; // Must have been undefs. - LeadingZero &= cast<ConstantSDNode>(UniquedVals[i])->isNullValue(); - LeadingOnes &= cast<ConstantSDNode>(UniquedVals[i])->isAllOnesValue(); + LeadingZero &= isNullConstant(UniquedVals[i]); + LeadingOnes &= isAllOnesConstant(UniquedVals[i]); } // Finally, check the least significant entry. if (LeadingZero) { @@ -1629,7 +1649,6 @@ static bool isIntS16Immediate(SDValue Op, short &Imm) { return isIntS16Immediate(Op.getNode(), Imm); } - /// SelectAddressRegReg - Given the specified addressed, check to see if it /// can be represented as an indexed [r+r] operation. Returns false if it /// can be more efficiently represented with [r+imm]. @@ -1998,10 +2017,10 @@ static SDValue getTOCEntry(SelectionDAG &DAG, SDLoc dl, bool Is64Bit, DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); SDValue Ops[] = { GA, Reg }; - return DAG.getMemIntrinsicNode(PPCISD::TOC_ENTRY, dl, - DAG.getVTList(VT, MVT::Other), Ops, VT, - MachinePointerInfo::getGOT(), 0, false, true, - false, 0); + return DAG.getMemIntrinsicNode( + PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0, false, true, + false, 0); } SDValue PPCTargetLowering::LowerConstantPool(SDValue Op, @@ -2092,6 +2111,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, // large models could be added if users need it, at the cost of // additional complexity. GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + SDLoc dl(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2480,7 +2502,6 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, // */ // } va_list[1]; - SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), dl, MVT::i32); SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), dl, MVT::i32); @@ -2536,7 +2557,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG, #include "PPCGenCallingConv.inc" -// Function whose sole purpose is to kill compiler warnings +// Function whose sole purpose is to kill compiler warnings // stemming from unused functions included from PPCGenCallingConv.inc. CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const { return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS; @@ -2933,8 +2954,9 @@ PPCTargetLowering::LowerFormalArguments_32SVR4( PPC::F8 }; unsigned NumFPArgRegs = array_lengthof(FPArgRegs); - if (DisablePPCFloatInVariadic) - NumFPArgRegs = 0; + + if (Subtarget.useSoftFloat()) + NumFPArgRegs = 0; FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs)); FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs)); @@ -3177,15 +3199,15 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( EVT ObjType = (ObjSize == 1 ? MVT::i8 : (ObjSize == 2 ? MVT::i16 : MVT::i32)); Store = DAG.getTruncStore(Val.getValue(1), dl, Val, Arg, - MachinePointerInfo(FuncArg), - ObjType, false, false, 0); + MachinePointerInfo(&*FuncArg), ObjType, + false, false, 0); } else { // For sizes that don't fit a truncating store (3, 5, 6, 7), // store the whole register as-is to the parameter save area // slot. 
- Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg), - false, false, 0); + Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(&*FuncArg), false, false, 0); } MemOps.push_back(Store); @@ -3212,9 +3234,9 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( SDValue Off = DAG.getConstant(j, dl, PtrVT); Addr = DAG.getNode(ISD::ADD, dl, Off.getValueType(), Addr, Off); } - SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, Addr, - MachinePointerInfo(FuncArg, j), - false, false, 0); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, Addr, + MachinePointerInfo(&*FuncArg, j), false, false, 0); MemOps.push_back(Store); ++GPR_idx; } @@ -3592,7 +3614,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin( SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16; SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg), + MachinePointerInfo(&*FuncArg), ObjType, false, false, 0); MemOps.push_back(Store); ++GPR_idx; @@ -3615,9 +3637,9 @@ PPCTargetLowering::LowerFormalArguments_Darwin( int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg, j), - false, false, 0); + SDValue Store = + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(&*FuncArg, j), false, false, 0); MemOps.push_back(Store); ++GPR_idx; ArgOffset += PtrByteSize; @@ -3880,7 +3902,6 @@ struct TailCallArgumentInfo { TailCallArgumentInfo() : FrameIdx(0) {} }; - } /// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot. @@ -3895,9 +3916,10 @@ StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG, SDValue FIN = TailCallArgs[i].FrameIdxOp; int FI = TailCallArgs[i].FrameIdx; // Store relative to framepointer. - MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + MemOpChains.push_back(DAG.getStore( + Chain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, 0)); } } @@ -3922,9 +3944,10 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, NewRetAddrLoc, true); EVT VT = isPPC64 ? MVT::i64 : MVT::i32; SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT); - Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx, - MachinePointerInfo::getFixedStack(NewRetAddr), - false, false, 0); + Chain = DAG.getStore( + Chain, dl, OldRetAddr, NewRetAddrFrIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewRetAddr), + false, false, 0); // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack // slot as the FP is never overwritten. 
@@ -3933,9 +3956,10 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc, true); SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT); - Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx, - MachinePointerInfo::getFixedStack(NewFPIdx), - false, false, 0); + Chain = DAG.getStore( + Chain, dl, OldFP, NewFramePtrIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewFPIdx), + false, false, 0); } } return Chain; @@ -4812,8 +4836,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, continue; break; case MVT::v4f32: - // When using QPX, this is handled like a FP register, otherwise, it - // is an Altivec register. + // When using QPX, this is handled like a FP register, otherwise, it + // is an Altivec register. if (Subtarget.hasQPX()) { if (++NumFPRsUsed <= NumFPRs) continue; @@ -5318,9 +5342,10 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl); SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff); - Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, - MachinePointerInfo::getStack(TOCSaveOffset), - false, false, 0); + Chain = DAG.getStore( + Val.getValue(1), dl, Val, AddPtr, + MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset), + false, false, 0); // In the ELFv2 ABI, R12 must contain the address of an indirect callee. // This does not mean the MTCTR instruction must use R12; it's easier // to model this as an extra parameter, so do that. @@ -5341,9 +5366,9 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee, PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp, FPOp, true, TailCallArguments); - return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, - hasNest, DAG, RegsToPass, InFlag, Chain, CallSeqStart, - Callee, SPDiff, NumBytes, Ins, InVals, CS); + return FinishCall(CallConv, dl, isTailCall, isVarArg, IsPatchPoint, hasNest, + DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee, + SPDiff, NumBytes, Ins, InVals, CS); } SDValue @@ -5798,6 +5823,22 @@ PPCTargetLowering::LowerReturn(SDValue Chain, return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps); } +SDValue PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET( + SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { + SDLoc dl(Op); + + // Get the corect type for integers. + EVT IntVT = Op.getValueType(); + + // Get the inputs. + SDValue Chain = Op.getOperand(0); + SDValue FPSIdx = getFramePointerFrameIndex(DAG); + // Build a DYNAREAOFFSET node. + SDValue Ops[2] = {Chain, FPSIdx}; + SDVTList VTs = DAG.getVTList(IntVT); + return DAG.getNode(PPCISD::DYNAREAOFFSET, dl, VTs, Ops); +} + SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const { // When we pop the dynamic allocation we need to restore the SP link. 
@@ -5828,10 +5869,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, false, false, 0); } - - -SDValue -PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const { +SDValue PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); bool isPPC64 = Subtarget.isPPC64(); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(MF.getDataLayout()); @@ -5983,6 +6021,10 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { if (!DAG.getTarget().Options.NoInfsFPMath || !DAG.getTarget().Options.NoNaNsFPMath) return Op; + // TODO: Propagate flags from the select rather than global settings. + SDNodeFlags Flags; + Flags.setNoInfs(true); + Flags.setNoNaNs(true); ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); @@ -6033,7 +6075,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { case ISD::SETNE: std::swap(TV, FV); case ISD::SETEQ: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); @@ -6043,25 +6085,25 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV); case ISD::SETULT: case ISD::SETLT: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOGE: case ISD::SETGE: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); case ISD::SETUGT: case ISD::SETGT: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV); case ISD::SETOLE: case ISD::SETLE: - Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS); + Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS, &Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp); return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV); @@ -6101,7 +6143,8 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI, (Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()); SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64); int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex(); - MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(FI); + MachinePointerInfo MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Emit a store to the stack slot. SDValue Chain; @@ -6291,11 +6334,11 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, // into 0 (false) and 1 (true), add 1 and then divide by 2 (multiply by 0.5). 
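Editor's note on the SELECT_CC hunks above: the lowering rewrites a floating-point select_cc into PPCISD::FSEL fed by an FSUB, relying on the fsel semantics (the result is the second operand when the first operand is greater than or equal to zero, otherwise the third). The following is only a scalar model of that rewrite, not the SelectionDAG code; fsel and select_ge are illustrative names.

    #include <cassert>

    // Scalar model of PPCISD::FSEL: fsel(a, b, c) == (a >= 0.0) ? b : c.
    static double fsel(double a, double b, double c) {
      return (a >= 0.0) ? b : c;
    }

    // select_cc(lhs, rhs, tv, fv, SETGE) modelled the way the lowering does it:
    // materialize the comparison as a subtraction and select on its sign.
    static double select_ge(double lhs, double rhs, double tv, double fv) {
      return fsel(lhs - rhs, tv, fv);
    }

    int main() {
      assert(select_ge(2.0, 1.0, 10.0, 20.0) == 10.0); // 2 >= 1 -> true value
      assert(select_ge(1.0, 2.0, 10.0, 20.0) == 20.0); // 1 >= 2 -> false value
      return 0;
    }

This is also why the code bails out unless no-infs and no-NaNs are in effect (and why the new SDNodeFlags carry those bits): LHS - RHS only stands in for the comparison when neither input can be NaN or infinite.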
// This can be done with an fma and the 0.5 constant: (V+1.0)*0.5 = 0.5*V+0.5 Value = DAG.getNode(PPCISD::QBFLT, dl, MVT::v4f64, Value); - + SDValue FPHalfs = DAG.getConstantFP(0.5, dl, MVT::f64); - FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, - FPHalfs, FPHalfs, FPHalfs, FPHalfs); - + FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, FPHalfs, FPHalfs, + FPHalfs, FPHalfs); + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); if (Op.getValueType() != MVT::v4f64) @@ -6421,17 +6464,18 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Store = - DAG.getStore(DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, - MachinePointerInfo::getFixedStack(FrameIdx), - false, false, 0); + SDValue Store = DAG.getStore( + DAG.getEntryNode(), dl, SINT.getOperand(0), FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, 0); assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; RLI.Chain = Store; - RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx); + RLI.MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = 4; MachineMemOperand *MMO = @@ -6472,16 +6516,18 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, int FrameIdx = FrameInfo->CreateStackObject(4, 4, false); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, - MachinePointerInfo::getFixedStack(FrameIdx), - false, false, 0); + SDValue Store = DAG.getStore( + DAG.getEntryNode(), dl, Op.getOperand(0), FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, 0); assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 && "Expected an i32 store"); RLI.Ptr = FIdx; RLI.Chain = Store; - RLI.MPI = MachinePointerInfo::getFixedStack(FrameIdx); + RLI.MPI = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); RLI.Alignment = 4; } @@ -6506,14 +6552,16 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op, Op.getOperand(0)); // STD the extended value into the stack slot. - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Ext64, FIdx, - MachinePointerInfo::getFixedStack(FrameIdx), - false, false, 0); + SDValue Store = DAG.getStore( + DAG.getEntryNode(), dl, Ext64, FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, 0); // Load the value as a double. - Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx, - MachinePointerInfo::getFixedStack(FrameIdx), - false, false, false, 0); + Ld = DAG.getLoad( + MVT::f64, dl, Store, FIdx, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx), + false, false, false, 0); } // FCFID it and return it. @@ -6735,7 +6783,6 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1, DAG.getConstant(IID, dl, MVT::i32), Op0, Op1, Op2); } - /// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified /// amount. The result has the specified value type. static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt, @@ -6768,7 +6815,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, // to a zero vector to get the boolean result. 
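Editor's note on the QPX hunks: the FMA noted above, (V+1.0)*0.5 = 0.5*V+0.5, is reused here and again in LowerEXTRACT_VECTOR_ELT and LowerVectorStore below to turn boolean lanes into 0.0/1.0 before the integer conversion. A scalar check of that identity, assuming (as the surrounding comment implies) that a boolean lane holds -1.0 for false and +1.0 for true; boolLaneToInt is an illustrative name, not an LLVM API.

    #include <cassert>
    #include <cmath>

    // Scalar model of the FMA node built above: 0.5 * v + 0.5, i.e. (v + 1.0) * 0.5.
    // Maps -1.0 (false) to 0.0 and +1.0 (true) to 1.0.
    static double boolLaneToInt(double v) {
      return std::fma(v, 0.5, 0.5);
    }

    int main() {
      assert(boolLaneToInt(1.0) == 1.0);  // true lane  -> 1
      assert(boolLaneToInt(-1.0) == 0.0); // false lane -> 0
      return 0;
    }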
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -6794,8 +6842,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, for (unsigned i = 0; i < 4; ++i) { if (BVN->getOperand(i).getOpcode() == ISD::UNDEF) CV[i] = UndefValue::get(Type::getFloatTy(*DAG.getContext())); - else if (cast<ConstantSDNode>(BVN->getOperand(i))-> - getConstantIntValue()->isZero()) + else if (isNullConstant(BVN->getOperand(i))) continue; else CV[i] = One; @@ -6814,9 +6861,9 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, ValueVTs.push_back(MVT::Other); // chain SDVTList VTs = DAG.getVTList(ValueVTs); - return DAG.getMemIntrinsicNode(PPCISD::QVLFSb, - dl, VTs, Ops, MVT::v4f32, - MachinePointerInfo::getConstantPool()); + return DAG.getMemIntrinsicNode( + PPCISD::QVLFSb, dl, VTs, Ops, MVT::v4f32, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } SmallVector<SDValue, 4> Stores; @@ -6915,7 +6962,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, if (SextVal >= -16 && SextVal <= 15) return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl); - // Two instruction sequences. // If this value is in the range [-32,30] and is even, use: @@ -7304,11 +7350,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, V1, V2, VPermMask); } -/// getAltivecCompareInfo - Given an intrinsic, return false if it is not an -/// altivec comparison. If it is, return true and fill in Opc/isDot with +/// getVectorCompareInfo - Given an intrinsic, return false if it is not a +/// vector comparison. If it is, return true and fill in Opc/isDot with /// information about the intrinsic. 
-static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, - bool &isDot, const PPCSubtarget &Subtarget) { +static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc, + bool &isDot, const PPCSubtarget &Subtarget) { unsigned IntrinsicID = cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue(); CompareOpc = -1; @@ -7321,12 +7367,11 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpequd_p: + case Intrinsic::ppc_altivec_vcmpequd_p: if (Subtarget.hasP8Altivec()) { - CompareOpc = 199; - isDot = 1; - } - else + CompareOpc = 199; + isDot = 1; + } else return false; break; @@ -7335,28 +7380,48 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtsd_p: + case Intrinsic::ppc_altivec_vcmpgtsd_p: if (Subtarget.hasP8Altivec()) { - CompareOpc = 967; - isDot = 1; - } - else + CompareOpc = 967; + isDot = 1; + } else return false; break; case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break; case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break; - case Intrinsic::ppc_altivec_vcmpgtud_p: + case Intrinsic::ppc_altivec_vcmpgtud_p: if (Subtarget.hasP8Altivec()) { - CompareOpc = 711; - isDot = 1; + CompareOpc = 711; + isDot = 1; + } else + return false; + + break; + // VSX predicate comparisons use the same infrastructure + case Intrinsic::ppc_vsx_xvcmpeqdp_p: + case Intrinsic::ppc_vsx_xvcmpgedp_p: + case Intrinsic::ppc_vsx_xvcmpgtdp_p: + case Intrinsic::ppc_vsx_xvcmpeqsp_p: + case Intrinsic::ppc_vsx_xvcmpgesp_p: + case Intrinsic::ppc_vsx_xvcmpgtsp_p: + if (Subtarget.hasVSX()) { + switch (IntrinsicID) { + case Intrinsic::ppc_vsx_xvcmpeqdp_p: CompareOpc = 99; break; + case Intrinsic::ppc_vsx_xvcmpgedp_p: CompareOpc = 115; break; + case Intrinsic::ppc_vsx_xvcmpgtdp_p: CompareOpc = 107; break; + case Intrinsic::ppc_vsx_xvcmpeqsp_p: CompareOpc = 67; break; + case Intrinsic::ppc_vsx_xvcmpgesp_p: CompareOpc = 83; break; + case Intrinsic::ppc_vsx_xvcmpgtsp_p: CompareOpc = 75; break; + } + isDot = 1; } - else + else return false; break; - + // Normal Comparisons. 
case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break; @@ -7365,10 +7430,9 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpequd: if (Subtarget.hasP8Altivec()) { - CompareOpc = 199; - isDot = 0; - } - else + CompareOpc = 199; + isDot = 0; + } else return false; break; @@ -7377,24 +7441,22 @@ static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc, case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtsd: + case Intrinsic::ppc_altivec_vcmpgtsd: if (Subtarget.hasP8Altivec()) { - CompareOpc = 967; - isDot = 0; - } - else + CompareOpc = 967; + isDot = 0; + } else return false; break; case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break; case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break; - case Intrinsic::ppc_altivec_vcmpgtud: + case Intrinsic::ppc_altivec_vcmpgtud: if (Subtarget.hasP8Altivec()) { - CompareOpc = 711; - isDot = 0; - } - else + CompareOpc = 711; + isDot = 0; + } else return false; break; @@ -7411,7 +7473,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc dl(Op); int CompareOpc; bool isDot; - if (!getAltivecCompareInfo(Op, CompareOpc, isDot, Subtarget)) + if (!getVectorCompareInfo(Op, CompareOpc, isDot, Subtarget)) return SDValue(); // Don't custom lower most intrinsics. // If this is a non-dot comparison, make the VCMP node and we are done. @@ -7536,7 +7598,7 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, FPHalfs, FPHalfs, FPHalfs, FPHalfs); - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); // Now convert to an integer and store. Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, @@ -7545,7 +7607,8 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -7752,7 +7815,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, FPHalfs = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f64, FPHalfs, FPHalfs, FPHalfs, FPHalfs); - Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); + Value = DAG.getNode(ISD::FMA, dl, MVT::v4f64, Value, FPHalfs, FPHalfs); // Now convert to an integer and store. 
Value = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f64, @@ -7761,7 +7824,8 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo(); int FrameIdx = FrameInfo->CreateStackObject(16, 16, false); - MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FrameIdx); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT); @@ -7798,11 +7862,10 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, SDValue Idx = DAG.getConstant(i, dl, BasePtr.getValueType()); Idx = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, Idx); - Stores.push_back(DAG.getTruncStore(StoreChain, dl, Loads[i], Idx, - SN->getPointerInfo().getWithOffset(i), - MVT::i8 /* memory type */, - SN->isNonTemporal(), SN->isVolatile(), - 1 /* alignment */, SN->getAAInfo())); + Stores.push_back(DAG.getTruncStore( + StoreChain, dl, Loads[i], Idx, SN->getPointerInfo().getWithOffset(i), + MVT::i8 /* memory type */, SN->isNonTemporal(), SN->isVolatile(), + 1 /* alignment */, SN->getAAInfo())); } StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores); @@ -7906,6 +7969,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); + case ISD::GET_DYNAMIC_AREA_OFFSET: return LowerGET_DYNAMIC_AREA_OFFSET(Op, DAG, Subtarget); case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); @@ -7971,7 +8035,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, N->getValueType(0)); SDVTList VTs = DAG.getVTList(SVT, MVT::Other); SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0), - N->getOperand(1)); + N->getOperand(1)); Results.push_back(NewInt); Results.push_back(NewInt.getValue(1)); @@ -8020,7 +8084,6 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, } } - //===----------------------------------------------------------------------===// // Other Lowering Code //===----------------------------------------------------------------------===// @@ -8089,8 +8152,7 @@ PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI->getOperand(0).getReg(); unsigned ptrA = MI->getOperand(1).getReg(); @@ -8160,8 +8222,7 @@ PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI, const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction *F = BB->getParent(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI->getOperand(0).getReg(); unsigned ptrA = MI->getOperand(1).getReg(); @@ -8283,8 +8344,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; + MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); @@ -8384,8 +8444,8 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, .addMBB(mainMBB); MIB = BuildMI(*thisMBB, MI, DL, 
TII->get(PPC::B)).addMBB(sinkMBB); - thisMBB->addSuccessor(mainMBB, /* weight */ 0); - thisMBB->addSuccessor(sinkMBB, /* weight */ 1); + thisMBB->addSuccessor(mainMBB, BranchProbability::getZero()); + thisMBB->addSuccessor(sinkMBB, BranchProbability::getOne()); // mainMBB: // mainDstReg = 0 @@ -8562,8 +8622,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // To "insert" these instructions we actually have to insert their // control-flow patterns. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); MachineFunction *F = BB->getParent(); @@ -8675,7 +8734,7 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // mfspr Rx,TBU # load from TBU // mfspr Ry,TB # load from TB // mfspr Rz,TBU # load from TBU - // cmpw crX,Rx,Rz # check if ‘old’=’new’ + // cmpw crX,Rx,Rz # check if 'old'='new' // bne readLoop # branch if they're not equal // ... @@ -9137,7 +9196,7 @@ SDValue PPCTargetLowering::getRecipEstimate(SDValue Operand, return SDValue(); } -bool PPCTargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { +unsigned PPCTargetLowering::combineRepeatedFPDivisors() const { // Note: This functionality is used only when unsafe-fp-math is enabled, and // on cores with reciprocal estimates (which are used when unsafe-fp-math is // enabled for division), this functionality is redundant with the default @@ -9150,12 +9209,26 @@ bool PPCTargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { // one FP pipeline) for three or more FDIVs (for generic OOO cores). switch (Subtarget.getDarwinDirective()) { default: - return NumUsers > 2; + return 3; case PPC::DIR_440: case PPC::DIR_A2: case PPC::DIR_E500mc: case PPC::DIR_E5500: - return NumUsers > 1; + return 2; + } +} + +// isConsecutiveLSLoc needs to work even if all adds have not yet been +// collapsed, and so we need to look through chains of them. +static void getBaseWithConstantOffset(SDValue Loc, SDValue &Base, + int64_t& Offset, SelectionDAG &DAG) { + if (DAG.isBaseWithConstantOffset(Loc)) { + Base = Loc.getOperand(0); + Offset += cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue(); + + // The base might itself be a base plus an offset, and if so, accumulate + // that as well. 
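Editor's note on the new getBaseWithConstantOffset helper above: it looks through chains of not-yet-collapsed adds and accumulates the total constant displacement, and isConsecutiveLSLoc (continued in the next hunk) then compares the resulting (base, offset) pairs. A self-contained sketch of that bookkeeping on a toy node type; the Expr struct and function names are purely illustrative, not the SelectionDAG API, and the walk is written iteratively where the real helper recurses.

    #include <cassert>
    #include <cstdint>

    // Toy stand-in for a DAG address node: either a plain base (Sub == nullptr)
    // or the expression Sub + Imm.
    struct Expr {
      const Expr *Sub;
      int64_t Imm;
    };

    // Mirrors getBaseWithConstantOffset: peel nested add-with-constant layers,
    // accumulating the immediates, until only the underlying base is left.
    static void baseWithConstantOffset(const Expr *E, const Expr *&Base,
                                       int64_t &Offset) {
      Base = E;
      while (Base->Sub) {
        Offset += Base->Imm;
        Base = Base->Sub;
      }
    }

    // Mirrors the rewritten isConsecutiveLSLoc test: same underlying base and
    // accumulated offsets exactly Dist * Bytes apart.
    static bool isConsecutive(const Expr *Loc, const Expr *BaseLoc, int64_t Dist,
                              int64_t Bytes) {
      const Expr *B1 = nullptr, *B2 = nullptr;
      int64_t O1 = 0, O2 = 0;
      baseWithConstantOffset(Loc, B1, O1);
      baseWithConstantOffset(BaseLoc, B2, O2);
      return B1 == B2 && O1 == O2 + Dist * Bytes;
    }

    int main() {
      Expr Base = {nullptr, 0};  // shared base pointer
      Expr A = {&Base, 16};      // base + 16
      Expr B = {&A, 16};         // (base + 16) + 16, i.e. base + 32
      assert(isConsecutive(&B, &A, /*Dist=*/1, /*Bytes=*/16));
      return 0;
    }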
+ getBaseWithConstantOffset(Loc.getOperand(0), Base, Offset, DAG); } } @@ -9178,16 +9251,18 @@ static bool isConsecutiveLSLoc(SDValue Loc, EVT VT, LSBaseSDNode *Base, return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes); } - // Handle X+C - if (DAG.isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc && - cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes) + SDValue Base1 = Loc, Base2 = BaseLoc; + int64_t Offset1 = 0, Offset2 = 0; + getBaseWithConstantOffset(Loc, Base1, Offset1, DAG); + getBaseWithConstantOffset(BaseLoc, Base2, Offset2, DAG); + if (Base1 == Base2 && Offset1 == (Offset2 + Dist * Bytes)) return true; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); const GlobalValue *GV1 = nullptr; const GlobalValue *GV2 = nullptr; - int64_t Offset1 = 0; - int64_t Offset2 = 0; + Offset1 = 0; + Offset2 = 0; bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1); bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2); if (isGA1 && isGA2 && GV1 == GV2) @@ -9343,7 +9418,7 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(), IE = LoadRoots.end(); I != IE; ++I) { Queue.push_back(*I); - + while (!Queue.empty()) { SDNode *LoadRoot = Queue.pop_back_val(); if (!Visited.insert(LoadRoot).second) @@ -9470,7 +9545,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, } // Visit all inputs, collect all binary operations (and, or, xor and - // select) that are all fed by extensions. + // select) that are all fed by extensions. while (!BinOps.empty()) { SDValue BinOp = BinOps.back(); BinOps.pop_back(); @@ -9492,7 +9567,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) && BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) || isa<ConstantSDNode>(BinOp.getOperand(i))) { - Inputs.push_back(BinOp.getOperand(i)); + Inputs.push_back(BinOp.getOperand(i)); } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || BinOp.getOperand(i).getOpcode() == ISD::OR || BinOp.getOperand(i).getOpcode() == ISD::XOR || @@ -9572,7 +9647,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, if (isa<ConstantSDNode>(Inputs[i])) continue; else - DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); + DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0)); } // Replace all operations (these are all the same, but have a different @@ -9682,7 +9757,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, SmallPtrSet<SDNode *, 16> Visited; // Visit all inputs, collect all binary operations (and, or, xor and - // select) that are all fed by truncations. + // select) that are all fed by truncations. 
while (!BinOps.empty()) { SDValue BinOp = BinOps.back(); BinOps.pop_back(); @@ -9701,7 +9776,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE || isa<ConstantSDNode>(BinOp.getOperand(i))) { - Inputs.push_back(BinOp.getOperand(i)); + Inputs.push_back(BinOp.getOperand(i)); } else if (BinOp.getOperand(i).getOpcode() == ISD::AND || BinOp.getOperand(i).getOpcode() == ISD::OR || BinOp.getOperand(i).getOpcode() == ISD::XOR || @@ -9915,10 +9990,11 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, "Invalid extension type"); EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0), DAG.getDataLayout()); SDValue ShiftCst = - DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy); - return DAG.getNode(ISD::SRA, dl, N->getValueType(0), - DAG.getNode(ISD::SHL, dl, N->getValueType(0), - N->getOperand(0), ShiftCst), ShiftCst); + DAG.getConstant(N->getValueSizeInBits(0) - PromBits, dl, ShiftAmountTy); + return DAG.getNode( + ISD::SRA, dl, N->getValueType(0), + DAG.getNode(ISD::SHL, dl, N->getValueType(0), N->getOperand(0), ShiftCst), + ShiftCst); } SDValue PPCTargetLowering::combineFPToIntToFP(SDNode *N, @@ -10102,16 +10178,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: break; case PPCISD::SHL: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { - if (C->isNullValue()) // 0 << V -> 0. + if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. return N->getOperand(0); - } break; case PPCISD::SRL: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { - if (C->isNullValue()) // 0 >>u V -> 0. + if (isNullConstant(N->getOperand(0))) // 0 >>u V -> 0. return N->getOperand(0); - } break; case PPCISD::SRA: if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) { @@ -10122,7 +10194,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, break; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: - case ISD::ANY_EXTEND: + case ISD::ANY_EXTEND: return DAGCombineExtBoolTrunc(N, DCI); case ISD::TRUNCATE: case ISD::SETCC: @@ -10277,7 +10349,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, // original unaligned load. MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *BaseMMO = - MF.getMachineMemOperand(LD->getMemOperand(), -MemVT.getStoreSize()+1, + MF.getMachineMemOperand(LD->getMemOperand(), + -(long)MemVT.getStoreSize()+1, 2*MemVT.getStoreSize()-1); // Create the new base load. 
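Editor's note on the end of DAGCombineExtBoolTrunc above: the promoted value is re-emitted through an SHL/SRA pair, which is the usual sign-extend-in-register idiom. A scalar model for the 64-bit case (the real code derives the width from N->getValueSizeInBits(0)); signExtendInReg is an illustrative name.

    #include <cassert>
    #include <cstdint>

    // Sign-extend the low PromBits bits of a 64-bit value by shifting the
    // field's sign bit up to bit 63 and arithmetic-shifting it back down.
    static int64_t signExtendInReg(uint64_t V, unsigned PromBits) {
      const unsigned Shift = 64 - PromBits;               // width - PromBits
      return static_cast<int64_t>(V << Shift) >> Shift;   // shl, then sra
    }

    int main() {
      assert(signExtendInReg(0xFF, 8) == -1);  // low byte 0xFF -> -1
      assert(signExtendInReg(0x7F, 8) == 127); // field sign bit clear
      return 0;
    }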
@@ -10527,7 +10600,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, case ISD::BRCOND: { SDValue Cond = N->getOperand(1); SDValue Target = N->getOperand(2); - + if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN && cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero) { @@ -10558,8 +10631,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() == Intrinsic::ppc_is_decremented_ctr_nonzero && isa<ConstantSDNode>(LHS.getOperand(1)) && - !cast<ConstantSDNode>(LHS.getOperand(1))->getConstantIntValue()-> - isZero()) + !isNullConstant(LHS.getOperand(1))) LHS = LHS.getOperand(0); if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN && @@ -10588,7 +10660,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN && isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) && - getAltivecCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { + getVectorCompareInfo(LHS, CompareOpc, isDot, Subtarget)) { assert(isDot && "Can't compare against a vector result!"); // If this is a comparison against something other than 0/1, then we know @@ -10739,8 +10811,11 @@ unsigned PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { // boundary so that the entire loop fits in one instruction-cache line. uint64_t LoopSize = 0; for (auto I = ML->block_begin(), IE = ML->block_end(); I != IE; ++I) - for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) + for (auto J = (*I)->begin(), JE = (*I)->end(); J != JE; ++J) { LoopSize += TII->GetInstSizeInBytes(J); + if (LoopSize > 32) + break; + } if (LoopSize > 16 && LoopSize <= 32) return 5; @@ -10868,17 +10943,19 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &PPC::QFRCRegClass); if (VT == MVT::v4f32 && Subtarget.hasQPX()) return std::make_pair(0U, &PPC::QSRCRegClass); - return std::make_pair(0U, &PPC::VRRCRegClass); + if (Subtarget.hasAltivec()) + return std::make_pair(0U, &PPC::VRRCRegClass); case 'y': // crrc return std::make_pair(0U, &PPC::CRRCRegClass); } - } else if (Constraint == "wc") { // an individual CR bit. + } else if (Constraint == "wc" && Subtarget.useCRBits()) { + // An individual CR bit. return std::make_pair(0U, &PPC::CRBITRCRegClass); - } else if (Constraint == "wa" || Constraint == "wd" || - Constraint == "wf") { + } else if ((Constraint == "wa" || Constraint == "wd" || + Constraint == "wf") && Subtarget.hasVSX()) { return std::make_pair(0U, &PPC::VSRCRegClass); - } else if (Constraint == "ws") { - if (VT == MVT::f32) + } else if (Constraint == "ws" && Subtarget.hasVSX()) { + if (VT == MVT::f32 && Subtarget.hasP8Vector()) return std::make_pair(0U, &PPC::VSSRCRegClass); else return std::make_pair(0U, &PPC::VSFRCRegClass); @@ -10908,7 +10985,6 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return R; } - /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. 
void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, @@ -11358,9 +11434,7 @@ bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); - if (BitSize == 0 || BitSize > 64) - return false; - return true; + return !(BitSize == 0 || BitSize > 64); } bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { @@ -11477,11 +11551,21 @@ PPCTargetLowering::getScratchRegisters(CallingConv::ID) const { return ScratchRegs; } +unsigned PPCTargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + return Subtarget.isPPC64() ? PPC::X3 : PPC::R3; +} + +unsigned PPCTargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + return Subtarget.isPPC64() ? PPC::X4 : PPC::R4; +} + bool PPCTargetLowering::shouldExpandBuildVectorWithShuffles( EVT VT , unsigned DefinedValues) const { if (VT == MVT::v2i64) - return false; + return Subtarget.hasDirectMove(); // Don't need stack ops with direct moves if (Subtarget.hasQPX()) { if (VT == MVT::v4f32 || VT == MVT::v4f64 || VT == MVT::v4i1) diff --git a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h index 6e13533..44bcb89 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -79,6 +79,11 @@ namespace llvm { /// compute an allocation on the stack. DYNALLOC, + /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to + /// compute an offset from native SP to the address of the most recent + /// dynamic alloca. + DYNAREAOFFSET, + /// GlobalBaseReg - On Darwin, this node represents the result of the mflr /// at function entry, used for PIC code. GlobalBaseReg, @@ -423,6 +428,8 @@ namespace llvm { /// DAG node. const char *getTargetNodeName(unsigned Opcode) const override; + bool useSoftFloat() const override; + MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { return MVT::i32; } @@ -655,8 +662,17 @@ namespace llvm { return Ty->isArrayTy(); } - private: + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. 
+ unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + private: struct ReuseLoadInfo { SDValue Ptr; SDValue Chain; @@ -719,6 +735,8 @@ namespace llvm { const PPCSubtarget &Subtarget) const; SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const; + SDValue LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG, + const PPCSubtarget &Subtarget) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG, const PPCSubtarget &Subtarget) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; @@ -853,7 +871,7 @@ namespace llvm { bool &UseOneConstNR) const override; SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, unsigned &RefinementSteps) const override; - bool combineRepeatedFPDivisors(unsigned NumUsers) const override; + unsigned combineRepeatedFPDivisors() const override; CCAssignFn *useFastISelCCs(unsigned Flag) const; }; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index d628330..075e093 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -369,6 +369,8 @@ let Defs = [X1], Uses = [X1] in def DYNALLOC8 : Pseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri:$fpsi),"#DYNALLOC8", [(set i64:$result, (PPCdynalloc i64:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET8 : Pseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8", + [(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>; let Defs = [LR8] in { def MTLR8 : XFXForm_7_ext<31, 467, 8, (outs), (ins g8rc:$rS), diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index d4e666c..c17603a 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -144,6 +144,9 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, int Latency = PPCGenInstrInfo::getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx); + if (!DefMI->getParent()) + return Latency; + const MachineOperand &DefMO = DefMI->getOperand(DefIdx); unsigned Reg = DefMO.getReg(); @@ -186,6 +189,60 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, return Latency; } +// This function does not list all associative and commutative operations, but +// only those worth feeding through the machine combiner in an attempt to +// reduce the critical path. Mostly, this means floating-point operations, +// because they have high latencies (compared to other operations, such and +// and/or, which are also associative and commutative, but have low latencies). 
+bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + // FP Add: + case PPC::FADD: + case PPC::FADDS: + // FP Multiply: + case PPC::FMUL: + case PPC::FMULS: + // Altivec Add: + case PPC::VADDFP: + // VSX Add: + case PPC::XSADDDP: + case PPC::XVADDDP: + case PPC::XVADDSP: + case PPC::XSADDSP: + // VSX Multiply: + case PPC::XSMULDP: + case PPC::XVMULDP: + case PPC::XVMULSP: + case PPC::XSMULSP: + // QPX Add: + case PPC::QVFADD: + case PPC::QVFADDS: + case PPC::QVFADDSs: + // QPX Multiply: + case PPC::QVFMUL: + case PPC::QVFMULS: + case PPC::QVFMULSs: + return true; + default: + return false; + } +} + +bool PPCInstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + // Using the machine combiner in this way is potentially expensive, so + // restrict to when aggressive optimizations are desired. + if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive) + return false; + + // FP reassociation is only legal when we don't need strict IEEE semantics. + if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath) + return false; + + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); +} + // Detect 32 -> 64-bit extensions where we may reuse the low sub-register. bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, @@ -259,16 +316,16 @@ unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI, return 0; } -// commuteInstruction - We can commute rlwimi instructions, but only if the -// rotate amt is zero. We also have to munge the immediates a bit. -MachineInstr * -PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +MachineInstr *PPCInstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { MachineFunction &MF = *MI->getParent()->getParent(); // Normal instructions can be commuted the obvious way. if (MI->getOpcode() != PPC::RLWIMI && MI->getOpcode() != PPC::RLWIMIo) - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); // Note that RLWIMI can be commuted as a 32-bit instruction, but not as a // 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because // changing the relative order of the mask operands might change what happens @@ -286,6 +343,8 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // Op0 = (Op2 & ~M) | (Op1 & M) // Swap op1/op2 + assert(((OpIdx1 == 1 && OpIdx2 == 2) || (OpIdx1 == 2 && OpIdx2 == 1)) && + "Only the operands 1 and 2 can be swapped in RLSIMI/RLWIMIo."); unsigned Reg0 = MI->getOperand(0).getReg(); unsigned Reg1 = MI->getOperand(1).getReg(); unsigned Reg2 = MI->getOperand(2).getReg(); @@ -353,9 +412,9 @@ bool PPCInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, if (AltOpc == -1) return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); - SrcOpIdx1 = 2; - SrcOpIdx2 = 3; - return true; + // The commutable operand indices are 2 and 3. Return them in SrcOpIdx1 + // and SrcOpIdx2. 
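Editor's note on the machine-combiner hooks above: isAssociativeAndCommutative whitelists the FP adds and multiplies whose reassociation can shorten a dependence chain, and getMachineCombinerPatterns guards the rewrite behind -O3 and unsafe-fp-math. A minimal illustration of the shape of that rewrite in plain C++ (not the MI-level transformation; sumSerial and sumReassociated are illustrative names).

    #include <cassert>

    // Serial reduction: each add depends on the previous one, so the critical
    // path is three FADD latencies.
    static double sumSerial(double a, double b, double c, double d) {
      return ((a + b) + c) + d;
    }

    // Reassociated form the combiner aims for: (a+b) and (c+d) are independent,
    // so the critical path is only two FADD latencies.
    static double sumReassociated(double a, double b, double c, double d) {
      return (a + b) + (c + d);
    }

    int main() {
      // Same value for these inputs, but FP addition is not associative in
      // general, which is why the rewrite requires unsafe-fp-math.
      assert(sumSerial(1.0, 2.0, 3.0, 4.0) ==
             sumReassociated(1.0, 2.0, 3.0, 4.0));
      return 0;
    }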
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3); } void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, @@ -996,11 +1055,10 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MBB.insert(MI, NewMIs[i]); const MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOStore, - MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); NewMIs.back()->addMemOperand(MF, MMO); } @@ -1109,11 +1167,10 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MBB.insert(MI, NewMIs[i]); const MachineFrameInfo &MFI = *MF.getFrameInfo(); - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FrameIdx), - MFI.getObjectAlignment(FrameIdx)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FrameIdx), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), + MFI.getObjectAlignment(FrameIdx)); NewMIs.back()->addMemOperand(MF, MMO); } @@ -1214,7 +1271,7 @@ bool PPCInstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT, unsigned ExtraT, MachineBasicBlock &FMBB, unsigned NumF, unsigned ExtraF, - const BranchProbability &Probability) const { + BranchProbability Probability) const { return !(MBBDefinesCTR(TMBB) && MBBDefinesCTR(FMBB)); } @@ -1691,13 +1748,13 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr, MI->setDesc(NewDesc); if (NewDesc.ImplicitDefs) - for (const uint16_t *ImpDefs = NewDesc.getImplicitDefs(); + for (const MCPhysReg *ImpDefs = NewDesc.getImplicitDefs(); *ImpDefs; ++ImpDefs) if (!MI->definesRegister(*ImpDefs)) MI->addOperand(*MI->getParent()->getParent(), MachineOperand::CreateReg(*ImpDefs, true, true)); if (NewDesc.ImplicitUses) - for (const uint16_t *ImpUses = NewDesc.getImplicitUses(); + for (const MCPhysReg *ImpUses = NewDesc.getImplicitUses(); *ImpUses; ++ImpUses) if (!MI->readsRegister(*ImpUses)) MI->addOperand(*MI->getParent()->getParent(), @@ -1737,3 +1794,35 @@ unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { } } +std::pair<unsigned, unsigned> +PPCInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + const unsigned Mask = PPCII::MO_ACCESS_MASK; + return std::make_pair(TF & Mask, TF & ~Mask); +} + +ArrayRef<std::pair<unsigned, const char *>> +PPCInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace PPCII; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_LO, "ppc-lo"}, + {MO_HA, "ppc-ha"}, + {MO_TPREL_LO, "ppc-tprel-lo"}, + {MO_TPREL_HA, "ppc-tprel-ha"}, + {MO_DTPREL_LO, "ppc-dtprel-lo"}, + {MO_TLSLD_LO, "ppc-tlsld-lo"}, + {MO_TOC_LO, "ppc-toc-lo"}, + {MO_TLS, "ppc-tls"}}; + return makeArrayRef(TargetFlags); +} + +ArrayRef<std::pair<unsigned, const char *>> +PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { + using namespace PPCII; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_PLT_OR_STUB, "ppc-plt-or-stub"}, + {MO_PIC_FLAG, "ppc-pic"}, + {MO_NLP_FLAG, "ppc-nlp"}, + {MO_NLP_HIDDEN_FLAG, "ppc-nlp-hidden"}}; + return makeArrayRef(TargetFlags); +} + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h 
b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 40badae..c3c3a48 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -79,6 +79,23 @@ class PPCInstrInfo : public PPCGenInstrInfo { SmallVectorImpl<MachineInstr*> &NewMIs, bool &NonRI, bool &SpillsVRS) const; virtual void anchor(); + +protected: + /// Commutes the operands in the given instruction. + /// The commutable operands are specified by their indices OpIdx1 and OpIdx2. + /// + /// Do not call this method for a non-commutable instruction or for + /// non-commutable pair of operand indices OpIdx1 and OpIdx2. + /// Even though the instruction is commutable, the method may still + /// fail to commute the operands, null pointer is returned in such cases. + /// + /// For example, we can commute rlwimi instructions, but only if the + /// rotate amt is zero. We also have to munge the immediates a bit. + MachineInstr *commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const override; + public: explicit PPCInstrInfo(PPCSubtarget &STI); @@ -119,6 +136,19 @@ public: return false; } + bool useMachineCombiner() const override { + return true; + } + + /// Return true when there is potentially a faster code sequence + /// for an instruction chain ending in <Root>. All potential patterns are + /// output in the <Pattern> array. + bool getMachineCombinerPatterns( + MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &P) const override; + + bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SubIdx) const override; @@ -127,10 +157,6 @@ public: unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const override; - // commuteInstruction - We can commute rlwimi instructions, but only if the - // rotate amt is zero. We also have to munge the immediates a bit. - MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override; - bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; @@ -183,7 +209,7 @@ public: // profitable to use the predicated branches. 
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override { + BranchProbability Probability) const override { return true; } @@ -191,12 +217,10 @@ public: unsigned NumT, unsigned ExtraT, MachineBasicBlock &FMBB, unsigned NumF, unsigned ExtraF, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; - bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, - unsigned NumCycles, - const BranchProbability - &Probability) const override { + bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, + BranchProbability Probability) const override { return true; } @@ -239,6 +263,15 @@ public: unsigned GetInstSizeInBytes(const MachineInstr *MI) const; void getNoopForMachoTarget(MCInst &NopInst) const override; + + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + + ArrayRef<std::pair<unsigned, const char *>> + getSerializableBitmaskMachineOperandTargetFlags() const override; }; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 24fd9bd..6c4364a 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -226,7 +226,9 @@ def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, // Instructions to support dynamic alloca. def SDTDynOp : SDTypeProfile<1, 2, []>; +def SDTDynAreaOp : SDTypeProfile<1, 1, []>; def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; +def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>; //===----------------------------------------------------------------------===// // PowerPC specific transformation functions and pattern fragments. @@ -1029,6 +1031,8 @@ let Defs = [R1], Uses = [R1] in def DYNALLOC : Pseudo<(outs gprc:$result), (ins gprc:$negsize, memri:$fpsi), "#DYNALLOC", [(set i32:$result, (PPCdynalloc i32:$negsize, iaddr:$fpsi))]>; +def DYNAREAOFFSET : Pseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET", + [(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>; // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after // instruction selection into a branch sequence. @@ -3883,8 +3887,11 @@ def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0, def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>; def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>; -def : InstAlias<"cntlz $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>; -def : InstAlias<"cntlz. $rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>; +def : InstAlias<"cntlzw $rA, $rS", (CNTLZW gprc:$rA, gprc:$rS)>; +def : InstAlias<"cntlzw. 
$rA, $rS", (CNTLZWo gprc:$rA, gprc:$rS)>; +// The POWER variant +def : MnemonicAlias<"cntlz", "cntlzw">; +def : MnemonicAlias<"cntlz.", "cntlzw.">; def EXTLDI : PPCAsmPseudo<"extldi $rA, $rS, $n, $b", (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td index 0a044c5..4312007 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrQPX.td @@ -839,31 +839,31 @@ def : Pat<(v4f64 (scalar_to_vector f64:$A)), def : Pat<(v4f32 (scalar_to_vector f32:$A)), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), $A, sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, 0)), +def : Pat<(f64 (extractelt v4f64:$S, 0)), (EXTRACT_SUBREG $S, sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, 0)), +def : Pat<(f32 (extractelt v4f32:$S, 0)), (EXTRACT_SUBREG $S, sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, 1)), +def : Pat<(f64 (extractelt v4f64:$S, 1)), (EXTRACT_SUBREG (QVESPLATI $S, 1), sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, 2)), +def : Pat<(f64 (extractelt v4f64:$S, 2)), (EXTRACT_SUBREG (QVESPLATI $S, 2), sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, 3)), +def : Pat<(f64 (extractelt v4f64:$S, 3)), (EXTRACT_SUBREG (QVESPLATI $S, 3), sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, 1)), +def : Pat<(f32 (extractelt v4f32:$S, 1)), (EXTRACT_SUBREG (QVESPLATIs $S, 1), sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, 2)), +def : Pat<(f32 (extractelt v4f32:$S, 2)), (EXTRACT_SUBREG (QVESPLATIs $S, 2), sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, 3)), +def : Pat<(f32 (extractelt v4f32:$S, 3)), (EXTRACT_SUBREG (QVESPLATIs $S, 3), sub_64)>; -def : Pat<(f64 (vector_extract v4f64:$S, i64:$F)), +def : Pat<(f64 (extractelt v4f64:$S, i64:$F)), (EXTRACT_SUBREG (QVFPERM $S, $S, (QVLPCLSXint (RLDICR $F, 2, /* 63-2 = */ 61))), sub_64)>; -def : Pat<(f32 (vector_extract v4f32:$S, i64:$F)), +def : Pat<(f32 (extractelt v4f32:$S, i64:$F)), (EXTRACT_SUBREG (QVFPERMs $S, $S, (QVLPCLSXint (RLDICR $F, 2, /* 63-2 = */ 61))), diff --git a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td index ce63c22..df1142c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/contrib/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -67,17 +67,19 @@ def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>; def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>; def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>; -multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL, - string asmbase, string asmstr, InstrItinClass itin, - list<dag> pattern> { +multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase, + string asmstr, InstrItinClass itin, Intrinsic Int, + ValueType OutTy, ValueType InTy> { let BaseName = asmbase in { - def NAME : XX3Form_Rc<opcode, xo, OOL, IOL, + def NAME : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), !strconcat(asmbase, !strconcat(" ", asmstr)), itin, - pattern>; + [(set OutTy:$XT, (Int InTy:$XA, InTy:$XB))]>; let Defs = [CR6] in - def o : XX3Form_Rc<opcode, xo, OOL, IOL, + def o : XX3Form_Rc<opcode, xo, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), !strconcat(asmbase, !strconcat(". 
", asmstr)), itin, - []>, isDOT; + [(set InTy:$XT, + (InTy (PPCvcmp_o InTy:$XA, InTy:$XB, xo)))]>, + isDOT; } } @@ -456,35 +458,23 @@ let Uses = [RM] in { "xscmpudp $crD, $XA, $XB", IIC_FPCompare, []>; defm XVCMPEQDP : XX3Form_Rcr<60, 99, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpeqdp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v2i64:$XT, - (int_ppc_vsx_xvcmpeqdp v2f64:$XA, v2f64:$XB))]>; + int_ppc_vsx_xvcmpeqdp, v2i64, v2f64>; defm XVCMPEQSP : XX3Form_Rcr<60, 67, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpeqsp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v4i32:$XT, - (int_ppc_vsx_xvcmpeqsp v4f32:$XA, v4f32:$XB))]>; + int_ppc_vsx_xvcmpeqsp, v4i32, v4f32>; defm XVCMPGEDP : XX3Form_Rcr<60, 115, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpgedp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v2i64:$XT, - (int_ppc_vsx_xvcmpgedp v2f64:$XA, v2f64:$XB))]>; + int_ppc_vsx_xvcmpgedp, v2i64, v2f64>; defm XVCMPGESP : XX3Form_Rcr<60, 83, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpgesp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v4i32:$XT, - (int_ppc_vsx_xvcmpgesp v4f32:$XA, v4f32:$XB))]>; + int_ppc_vsx_xvcmpgesp, v4i32, v4f32>; defm XVCMPGTDP : XX3Form_Rcr<60, 107, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpgtdp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v2i64:$XT, - (int_ppc_vsx_xvcmpgtdp v2f64:$XA, v2f64:$XB))]>; + int_ppc_vsx_xvcmpgtdp, v2i64, v2f64>; defm XVCMPGTSP : XX3Form_Rcr<60, 75, - (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), "xvcmpgtsp", "$XT, $XA, $XB", IIC_VecFPCompare, - [(set v4i32:$XT, - (int_ppc_vsx_xvcmpgtsp v4f32:$XA, v4f32:$XB))]>; + int_ppc_vsx_xvcmpgtsp, v4i32, v4f32>; // Move Instructions def XSABSDP : XX2Form<60, 345, @@ -845,9 +835,9 @@ let Predicates = [IsBigEndian] in { def : Pat<(v2f64 (scalar_to_vector f64:$A)), (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>; -def : Pat<(f64 (vector_extract v2f64:$S, 0)), +def : Pat<(f64 (extractelt v2f64:$S, 0)), (f64 (EXTRACT_SUBREG $S, sub_64))>; -def : Pat<(f64 (vector_extract v2f64:$S, 1)), +def : Pat<(f64 (extractelt v2f64:$S, 1)), (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>; } @@ -856,9 +846,9 @@ def : Pat<(v2f64 (scalar_to_vector f64:$A)), (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $A, sub_64), (SUBREG_TO_REG (i64 1), $A, sub_64), 0))>; -def : Pat<(f64 (vector_extract v2f64:$S, 0)), +def : Pat<(f64 (extractelt v2f64:$S, 0)), (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>; -def : Pat<(f64 (vector_extract v2f64:$S, 1)), +def : Pat<(f64 (extractelt v2f64:$S, 1)), (f64 (EXTRACT_SUBREG $S, sub_64))>; } @@ -1206,6 +1196,23 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns. 
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">, AltVSXFMARel; } + + // Single Precision Conversions (FP <-> INT) + def XSCVSXDSP : XX2Form<60, 312, + (outs vssrc:$XT), (ins vsfrc:$XB), + "xscvsxdsp $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfcfids f64:$XB))]>; + def XSCVUXDSP : XX2Form<60, 296, + (outs vssrc:$XT), (ins vsfrc:$XB), + "xscvuxdsp $XT, $XB", IIC_VecFP, + [(set f32:$XT, (PPCfcfidus f64:$XB))]>; + + // Conversions between vector and scalar single precision + def XSCVDPSPN : XX2Form<60, 267, (outs vsrc:$XT), (ins vssrc:$XB), + "xscvdpspn $XT, $XB", IIC_VecFP, []>; + def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB), + "xscvspdpn $XT, $XB", IIC_VecFP, []>; + } // AddedComplexity = 400 } // HasP8Vector @@ -1229,3 +1236,550 @@ let Predicates = [HasDirectMove, HasVSX] in { "mtvsrwz $XT, $rA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsrz i32:$rA))]>; } // HasDirectMove, HasVSX + +/* Direct moves of various widths from GPR's into VSR's. Each move lines + the value up into element 0 (both BE and LE). Namely, entities smaller than + a doubleword are shifted left and moved for BE. For LE, they're moved, then + swapped to go into the least significant element of the VSR. +*/ +def MovesToVSR { + dag BE_BYTE_0 = + (MTVSRD + (RLDICR + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 56, 7)); + dag BE_HALF_0 = + (MTVSRD + (RLDICR + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 48, 15)); + dag BE_WORD_0 = + (MTVSRD + (RLDICR + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32), 32, 31)); + dag BE_DWORD_0 = (MTVSRD $A); + + dag LE_MTVSRW = (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32)); + dag LE_WORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + LE_MTVSRW, sub_64)); + dag LE_WORD_0 = (XXPERMDI LE_WORD_1, LE_WORD_1, 2); + dag LE_DWORD_1 = (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), + BE_DWORD_0, sub_64)); + dag LE_DWORD_0 = (XXPERMDI LE_DWORD_1, LE_DWORD_1, 2); +} + +/* Patterns for extracting elements out of vectors. Integer elements are + extracted using direct move operations. Patterns for extracting elements + whose indices are not available at compile time are also provided with + various _VARIABLE_ patterns. + The numbering for the DAG's is for LE, but when used on BE, the correct + LE element can just be used (i.e. LE_BYTE_2 == BE_BYTE_13). 
+*/ +def VectorExtractions { + // Doubleword extraction + dag LE_DWORD_0 = + (MFVSRD + (EXTRACT_SUBREG + (XXPERMDI (COPY_TO_REGCLASS $S, VSRC), + (COPY_TO_REGCLASS $S, VSRC), 2), sub_64)); + dag LE_DWORD_1 = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64)); + + // Word extraction + dag LE_WORD_0 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 2), sub_64)); + dag LE_WORD_1 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 1), sub_64)); + dag LE_WORD_2 = (MFVSRWZ (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS $S, VSRC)), sub_64)); + dag LE_WORD_3 = (MFVSRWZ (EXTRACT_SUBREG (XXSLDWI $S, $S, 3), sub_64)); + + // Halfword extraction + dag LE_HALF_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 48), sub_32)); + dag LE_HALF_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 48), sub_32)); + dag LE_HALF_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 48), sub_32)); + dag LE_HALF_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 48), sub_32)); + dag LE_HALF_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 48), sub_32)); + dag LE_HALF_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 48), sub_32)); + dag LE_HALF_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 48), sub_32)); + dag LE_HALF_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 48), sub_32)); + + // Byte extraction + dag LE_BYTE_0 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 0, 56), sub_32)); + dag LE_BYTE_1 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 56, 56), sub_32)); + dag LE_BYTE_2 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 48, 56), sub_32)); + dag LE_BYTE_3 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 40, 56), sub_32)); + dag LE_BYTE_4 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 32, 56), sub_32)); + dag LE_BYTE_5 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 24, 56), sub_32)); + dag LE_BYTE_6 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 16, 56), sub_32)); + dag LE_BYTE_7 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_0, 8, 56), sub_32)); + dag LE_BYTE_8 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 0, 56), sub_32)); + dag LE_BYTE_9 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 56, 56), sub_32)); + dag LE_BYTE_10 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 48, 56), sub_32)); + dag LE_BYTE_11 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 40, 56), sub_32)); + dag LE_BYTE_12 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 32, 56), sub_32)); + dag LE_BYTE_13 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 24, 56), sub_32)); + dag LE_BYTE_14 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 16, 56), sub_32)); + dag LE_BYTE_15 = (i32 (EXTRACT_SUBREG (RLDICL LE_DWORD_1, 8, 56), sub_32)); + + /* Variable element number (BE and LE patterns must be specified separately) + This is a rather involved process. + + Conceptually, this is how the move is accomplished: + 1. Identify which doubleword contains the element + 2. Shift in the VMX register so that the correct doubleword is correctly + lined up for the MFVSRD + 3. Perform the move so that the element (along with some extra stuff) + is in the GPR + 4. Right shift within the GPR so that the element is right-justified + + Of course, the index is an element number which has a different meaning + on LE/BE so the patterns have to be specified separately. + + Note: The final result will be the element right-justified with high + order bits being arbitrarily defined (namely, whatever was in the + vector register to the left of the value originally). + */ + + /* LE variable byte + Number 1. 
above: + - For elements 0-7, we shift left by 8 bytes since they're on the right + - For elements 8-15, we need not shift (shift left by zero bytes) + This is accomplished by inverting the bits of the index and AND-ing + with 0x8 (i.e. clearing all bits of the index and inverting bit 60). + */ + dag LE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDC8 (LI8 8), $Idx)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VBYTE_PERMUTE = (VPERM $S, $S, LE_VBYTE_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + dag LE_MV_VBYTE = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VBYTE_PERMUTE, VSRC)), + sub_64)); + + /* Number 4. above: + - Truncate the element number to the range 0-7 (8-15 are symmetrical + and out of range values are truncated accordingly) + - Multiply by 8 as we need to shift right by the number of bits, not bytes + - Shift right in the GPR by the calculated value + */ + dag LE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 7), $Idx), 3, 60), + sub_32); + dag LE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD LE_MV_VBYTE, LE_VBYTE_SHIFT), + sub_32); + + /* LE variable halfword + Number 1. above: + - For elements 0-3, we shift left by 8 since they're on the right + - For elements 4-7, we need not shift (shift left by zero bytes) + Similarly to the byte pattern, we invert the bits of the index, but we + AND with 0x4 (i.e. clear all bits of the index and invert bit 61). + Of course, the shift is still by 8 bytes, so we must multiply by 2. + */ + dag LE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 4), $Idx), 1, 62)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VHALF_PERMUTE = (VPERM $S, $S, LE_VHALF_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + dag LE_MV_VHALF = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VHALF_PERMUTE, VSRC)), + sub_64)); + + /* Number 4. above: + - Truncate the element number to the range 0-3 (4-7 are symmetrical + and out of range values are truncated accordingly) + - Multiply by 16 as we need to shift right by the number of bits + - Shift right in the GPR by the calculated value + */ + dag LE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 3), $Idx), 4, 59), + sub_32); + dag LE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD LE_MV_VHALF, LE_VHALF_SHIFT), + sub_32); + + /* LE variable word + Number 1. above: + - For elements 0-1, we shift left by 8 since they're on the right + - For elements 2-3, we need not shift + */ + dag LE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 2), $Idx), 2, 61)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VWORD_PERMUTE = (VPERM $S, $S, LE_VWORD_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + dag LE_MV_VWORD = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VWORD_PERMUTE, VSRC)), + sub_64)); + + /* Number 4. above: + - Truncate the element number to the range 0-1 (2-3 are symmetrical + and out of range values are truncated accordingly) + - Multiply by 32 as we need to shift right by the number of bits + - Shift right in the GPR by the calculated value + */ + dag LE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (AND8 (LI8 1), $Idx), 5, 58), + sub_32); + dag LE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD LE_MV_VWORD, LE_VWORD_SHIFT), + sub_32); + + /* LE variable doubleword + Number 1. 
above: + - For element 0, we shift left by 8 since it's on the right + - For element 1, we need not shift + */ + dag LE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDC8 (LI8 1), $Idx), 3, 60)); + + // Number 2. above: + // - Now that we set up the shift amount, we shift in the VMX register + dag LE_VDWORD_PERMUTE = (VPERM $S, $S, LE_VDWORD_PERM_VEC); + + // Number 3. above: + // - The doubleword containing our element is moved to a GPR + // - Number 4. is not needed for the doubleword as the value is 64-bits + dag LE_VARIABLE_DWORD = + (MFVSRD (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS LE_VDWORD_PERMUTE, VSRC)), + sub_64)); + + /* LE variable float + - Shift the vector to line up the desired element to BE Word 0 + - Convert 32-bit float to a 64-bit single precision float + */ + dag LE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR (XOR8 (LI8 3), $Idx), 2, 61)); + dag LE_VFLOAT_PERMUTE = (VPERM $S, $S, LE_VFLOAT_PERM_VEC); + dag LE_VARIABLE_FLOAT = (XSCVSPDPN LE_VFLOAT_PERMUTE); + + /* LE variable double + Same as the LE doubleword except there is no move. + */ + dag LE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC), + (COPY_TO_REGCLASS $S, VRRC), + LE_VDWORD_PERM_VEC); + dag LE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS LE_VDOUBLE_PERMUTE, VSRC); + + /* BE variable byte + The algorithm here is the same as the LE variable byte except: + - The shift in the VMX register is by 0/8 for opposite element numbers so + we simply AND the element number with 0x8 + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-7 + */ + dag BE_VBYTE_PERM_VEC = (LVSL ZERO8, (ANDIo8 $Idx, 8)); + dag BE_VBYTE_PERMUTE = (VPERM $S, $S, BE_VBYTE_PERM_VEC); + dag BE_MV_VBYTE = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VBYTE_PERMUTE, VSRC)), + sub_64)); + dag BE_VBYTE_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 7), $Idx), 3, 60), + sub_32); + dag BE_VARIABLE_BYTE = (EXTRACT_SUBREG (SRD BE_MV_VBYTE, BE_VBYTE_SHIFT), + sub_32); + + /* BE variable halfword + The algorithm here is the same as the LE variable halfword except: + - The shift in the VMX register is by 0/8 for opposite element numbers so + we simply AND the element number with 0x4 and multiply by 2 + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-3 + */ + dag BE_VHALF_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 4), 1, 62)); + dag BE_VHALF_PERMUTE = (VPERM $S, $S, BE_VHALF_PERM_VEC); + dag BE_MV_VHALF = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VHALF_PERMUTE, VSRC)), + sub_64)); + dag BE_VHALF_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 3), $Idx), 4, 59), + sub_32); + dag BE_VARIABLE_HALF = (EXTRACT_SUBREG (SRD BE_MV_VHALF, BE_VHALF_SHIFT), + sub_32); + + /* BE variable word + The algorithm is the same as the LE variable word except: + - The shift in the VMX register happens for opposite element numbers + - The order of elements after the move to GPR is reversed, so we invert + the bits of the index prior to truncating to the range 0-1 + */ + dag BE_VWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 2), 2, 61)); + dag BE_VWORD_PERMUTE = (VPERM $S, $S, BE_VWORD_PERM_VEC); + dag BE_MV_VWORD = (MFVSRD + (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VWORD_PERMUTE, VSRC)), + sub_64)); + dag BE_VWORD_SHIFT = (EXTRACT_SUBREG (RLDICR (ANDC8 (LI8 1), $Idx), 5, 58), + sub_32); + dag BE_VARIABLE_WORD = (EXTRACT_SUBREG (SRD BE_MV_VWORD, BE_VWORD_SHIFT), + sub_32); + + /* BE variable doubleword + 
Same as the LE doubleword except we shift in the VMX register for opposite + element indices. + */ + dag BE_VDWORD_PERM_VEC = (LVSL ZERO8, (RLDICR (ANDIo8 $Idx, 1), 3, 60)); + dag BE_VDWORD_PERMUTE = (VPERM $S, $S, BE_VDWORD_PERM_VEC); + dag BE_VARIABLE_DWORD = + (MFVSRD (EXTRACT_SUBREG + (v2i64 (COPY_TO_REGCLASS BE_VDWORD_PERMUTE, VSRC)), + sub_64)); + + /* BE variable float + - Shift the vector to line up the desired element to BE Word 0 + - Convert 32-bit float to a 64-bit single precision float + */ + dag BE_VFLOAT_PERM_VEC = (LVSL ZERO8, (RLDICR $Idx, 2, 61)); + dag BE_VFLOAT_PERMUTE = (VPERM $S, $S, BE_VFLOAT_PERM_VEC); + dag BE_VARIABLE_FLOAT = (XSCVSPDPN BE_VFLOAT_PERMUTE); + + /* BE variable double + Same as the BE doubleword except there is no move. + */ + dag BE_VDOUBLE_PERMUTE = (VPERM (COPY_TO_REGCLASS $S, VRRC), + (COPY_TO_REGCLASS $S, VRRC), + BE_VDWORD_PERM_VEC); + dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC); +} + +// v4f32 scalar <-> vector conversions (BE) +let Predicates = [IsBigEndian, HasP8Vector] in { + def : Pat<(v4f32 (scalar_to_vector f32:$A)), + (v4f32 (XSCVDPSPN $A))>; + def : Pat<(f32 (vector_extract v4f32:$S, 0)), + (f32 (XSCVSPDPN $S))>; + def : Pat<(f32 (vector_extract v4f32:$S, 1)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 2)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 2)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 3)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>; + def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)), + (f32 VectorExtractions.BE_VARIABLE_FLOAT)>; +} // IsBigEndian, HasP8Vector + +// Variable index vector_extract for v2f64 does not require P8Vector +let Predicates = [IsBigEndian, HasVSX] in + def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)), + (f64 VectorExtractions.BE_VARIABLE_DOUBLE)>; + +let Predicates = [IsBigEndian, HasDirectMove] in { + // v16i8 scalar <-> vector conversions (BE) + def : Pat<(v16i8 (scalar_to_vector i32:$A)), + (v16i8 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64))>; + def : Pat<(v8i16 (scalar_to_vector i32:$A)), + (v8i16 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64))>; + def : Pat<(v4i32 (scalar_to_vector i32:$A)), + (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>; + def : Pat<(v2i64 (scalar_to_vector i64:$A)), + (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>; + def : Pat<(i32 (vector_extract v16i8:$S, 0)), + (i32 VectorExtractions.LE_BYTE_15)>; + def : Pat<(i32 (vector_extract v16i8:$S, 1)), + (i32 VectorExtractions.LE_BYTE_14)>; + def : Pat<(i32 (vector_extract v16i8:$S, 2)), + (i32 VectorExtractions.LE_BYTE_13)>; + def : Pat<(i32 (vector_extract v16i8:$S, 3)), + (i32 VectorExtractions.LE_BYTE_12)>; + def : Pat<(i32 (vector_extract v16i8:$S, 4)), + (i32 VectorExtractions.LE_BYTE_11)>; + def : Pat<(i32 (vector_extract v16i8:$S, 5)), + (i32 VectorExtractions.LE_BYTE_10)>; + def : Pat<(i32 (vector_extract v16i8:$S, 6)), + (i32 VectorExtractions.LE_BYTE_9)>; + def : Pat<(i32 (vector_extract v16i8:$S, 7)), + (i32 VectorExtractions.LE_BYTE_8)>; + def : Pat<(i32 (vector_extract v16i8:$S, 8)), + (i32 VectorExtractions.LE_BYTE_7)>; + def : Pat<(i32 (vector_extract v16i8:$S, 9)), + (i32 VectorExtractions.LE_BYTE_6)>; + def : Pat<(i32 (vector_extract v16i8:$S, 10)), + (i32 VectorExtractions.LE_BYTE_5)>; + def : Pat<(i32 (vector_extract v16i8:$S, 11)), + (i32 VectorExtractions.LE_BYTE_4)>; + def : Pat<(i32 (vector_extract v16i8:$S, 12)), + (i32 VectorExtractions.LE_BYTE_3)>; + def : Pat<(i32 (vector_extract v16i8:$S, 
13)), + (i32 VectorExtractions.LE_BYTE_2)>; + def : Pat<(i32 (vector_extract v16i8:$S, 14)), + (i32 VectorExtractions.LE_BYTE_1)>; + def : Pat<(i32 (vector_extract v16i8:$S, 15)), + (i32 VectorExtractions.LE_BYTE_0)>; + def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), + (i32 VectorExtractions.BE_VARIABLE_BYTE)>; + + // v8i16 scalar <-> vector conversions (BE) + def : Pat<(i32 (vector_extract v8i16:$S, 0)), + (i32 VectorExtractions.LE_HALF_7)>; + def : Pat<(i32 (vector_extract v8i16:$S, 1)), + (i32 VectorExtractions.LE_HALF_6)>; + def : Pat<(i32 (vector_extract v8i16:$S, 2)), + (i32 VectorExtractions.LE_HALF_5)>; + def : Pat<(i32 (vector_extract v8i16:$S, 3)), + (i32 VectorExtractions.LE_HALF_4)>; + def : Pat<(i32 (vector_extract v8i16:$S, 4)), + (i32 VectorExtractions.LE_HALF_3)>; + def : Pat<(i32 (vector_extract v8i16:$S, 5)), + (i32 VectorExtractions.LE_HALF_2)>; + def : Pat<(i32 (vector_extract v8i16:$S, 6)), + (i32 VectorExtractions.LE_HALF_1)>; + def : Pat<(i32 (vector_extract v8i16:$S, 7)), + (i32 VectorExtractions.LE_HALF_0)>; + def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), + (i32 VectorExtractions.BE_VARIABLE_HALF)>; + + // v4i32 scalar <-> vector conversions (BE) + def : Pat<(i32 (vector_extract v4i32:$S, 0)), + (i32 VectorExtractions.LE_WORD_3)>; + def : Pat<(i32 (vector_extract v4i32:$S, 1)), + (i32 VectorExtractions.LE_WORD_2)>; + def : Pat<(i32 (vector_extract v4i32:$S, 2)), + (i32 VectorExtractions.LE_WORD_1)>; + def : Pat<(i32 (vector_extract v4i32:$S, 3)), + (i32 VectorExtractions.LE_WORD_0)>; + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 VectorExtractions.BE_VARIABLE_WORD)>; + + // v2i64 scalar <-> vector conversions (BE) + def : Pat<(i64 (vector_extract v2i64:$S, 0)), + (i64 VectorExtractions.LE_DWORD_1)>; + def : Pat<(i64 (vector_extract v2i64:$S, 1)), + (i64 VectorExtractions.LE_DWORD_0)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.BE_VARIABLE_DWORD)>; +} // IsBigEndian, HasDirectMove + +// v4f32 scalar <-> vector conversions (LE) +let Predicates = [IsLittleEndian, HasP8Vector] in { + def : Pat<(v4f32 (scalar_to_vector f32:$A)), + (v4f32 (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1))>; + def : Pat<(f32 (vector_extract v4f32:$S, 0)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 1)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 2)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 2)), + (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>; + def : Pat<(f32 (vector_extract v4f32:$S, 3)), + (f32 (XSCVSPDPN $S))>; + def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)), + (f32 VectorExtractions.LE_VARIABLE_FLOAT)>; +} // IsLittleEndian, HasP8Vector + +// Variable index vector_extract for v2f64 does not require P8Vector +let Predicates = [IsLittleEndian, HasVSX] in + def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)), + (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>; + +let Predicates = [IsLittleEndian, HasDirectMove] in { + // v16i8 scalar <-> vector conversions (LE) + def : Pat<(v16i8 (scalar_to_vector i32:$A)), + (v16i8 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>; + def : Pat<(v8i16 (scalar_to_vector i32:$A)), + (v8i16 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>; + def : Pat<(v4i32 (scalar_to_vector i32:$A)), + (v4i32 MovesToVSR.LE_WORD_0)>; + def : Pat<(v2i64 (scalar_to_vector i64:$A)), + (v2i64 MovesToVSR.LE_DWORD_0)>; + def : Pat<(i32 (vector_extract v16i8:$S, 0)), + (i32 VectorExtractions.LE_BYTE_0)>; + def : Pat<(i32 (vector_extract v16i8:$S, 1)), + (i32 VectorExtractions.LE_BYTE_1)>; + def 
: Pat<(i32 (vector_extract v16i8:$S, 2)), + (i32 VectorExtractions.LE_BYTE_2)>; + def : Pat<(i32 (vector_extract v16i8:$S, 3)), + (i32 VectorExtractions.LE_BYTE_3)>; + def : Pat<(i32 (vector_extract v16i8:$S, 4)), + (i32 VectorExtractions.LE_BYTE_4)>; + def : Pat<(i32 (vector_extract v16i8:$S, 5)), + (i32 VectorExtractions.LE_BYTE_5)>; + def : Pat<(i32 (vector_extract v16i8:$S, 6)), + (i32 VectorExtractions.LE_BYTE_6)>; + def : Pat<(i32 (vector_extract v16i8:$S, 7)), + (i32 VectorExtractions.LE_BYTE_7)>; + def : Pat<(i32 (vector_extract v16i8:$S, 8)), + (i32 VectorExtractions.LE_BYTE_8)>; + def : Pat<(i32 (vector_extract v16i8:$S, 9)), + (i32 VectorExtractions.LE_BYTE_9)>; + def : Pat<(i32 (vector_extract v16i8:$S, 10)), + (i32 VectorExtractions.LE_BYTE_10)>; + def : Pat<(i32 (vector_extract v16i8:$S, 11)), + (i32 VectorExtractions.LE_BYTE_11)>; + def : Pat<(i32 (vector_extract v16i8:$S, 12)), + (i32 VectorExtractions.LE_BYTE_12)>; + def : Pat<(i32 (vector_extract v16i8:$S, 13)), + (i32 VectorExtractions.LE_BYTE_13)>; + def : Pat<(i32 (vector_extract v16i8:$S, 14)), + (i32 VectorExtractions.LE_BYTE_14)>; + def : Pat<(i32 (vector_extract v16i8:$S, 15)), + (i32 VectorExtractions.LE_BYTE_15)>; + def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)), + (i32 VectorExtractions.LE_VARIABLE_BYTE)>; + + // v8i16 scalar <-> vector conversions (LE) + def : Pat<(i32 (vector_extract v8i16:$S, 0)), + (i32 VectorExtractions.LE_HALF_0)>; + def : Pat<(i32 (vector_extract v8i16:$S, 1)), + (i32 VectorExtractions.LE_HALF_1)>; + def : Pat<(i32 (vector_extract v8i16:$S, 2)), + (i32 VectorExtractions.LE_HALF_2)>; + def : Pat<(i32 (vector_extract v8i16:$S, 3)), + (i32 VectorExtractions.LE_HALF_3)>; + def : Pat<(i32 (vector_extract v8i16:$S, 4)), + (i32 VectorExtractions.LE_HALF_4)>; + def : Pat<(i32 (vector_extract v8i16:$S, 5)), + (i32 VectorExtractions.LE_HALF_5)>; + def : Pat<(i32 (vector_extract v8i16:$S, 6)), + (i32 VectorExtractions.LE_HALF_6)>; + def : Pat<(i32 (vector_extract v8i16:$S, 7)), + (i32 VectorExtractions.LE_HALF_7)>; + def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)), + (i32 VectorExtractions.LE_VARIABLE_HALF)>; + + // v4i32 scalar <-> vector conversions (LE) + def : Pat<(i32 (vector_extract v4i32:$S, 0)), + (i32 VectorExtractions.LE_WORD_0)>; + def : Pat<(i32 (vector_extract v4i32:$S, 1)), + (i32 VectorExtractions.LE_WORD_1)>; + def : Pat<(i32 (vector_extract v4i32:$S, 2)), + (i32 VectorExtractions.LE_WORD_2)>; + def : Pat<(i32 (vector_extract v4i32:$S, 3)), + (i32 VectorExtractions.LE_WORD_3)>; + def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)), + (i32 VectorExtractions.LE_VARIABLE_WORD)>; + + // v2i64 scalar <-> vector conversions (LE) + def : Pat<(i64 (vector_extract v2i64:$S, 0)), + (i64 VectorExtractions.LE_DWORD_0)>; + def : Pat<(i64 (vector_extract v2i64:$S, 1)), + (i64 VectorExtractions.LE_DWORD_1)>; + def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)), + (i64 VectorExtractions.LE_VARIABLE_DWORD)>; +} // IsLittleEndian, HasDirectMove + +let Predicates = [HasDirectMove, HasVSX] in { +// bitconvert f32 -> i32 +// (convert to 32-bit fp single, shift right 1 word, move to GPR) +def : Pat<(i32 (bitconvert f32:$S)), + (i32 (MFVSRWZ (EXTRACT_SUBREG + (XXSLDWI (XSCVDPSPN $S),(XSCVDPSPN $S), 3), + sub_64)))>; +// bitconvert i32 -> f32 +// (move to FPR, shift left 1 word, convert to 64-bit fp single) +def : Pat<(f32 (bitconvert i32:$A)), + (f32 (XSCVSPDPN + (XXSLDWI MovesToVSR.LE_WORD_1, MovesToVSR.LE_WORD_1, 1)))>; + +// bitconvert f64 -> i64 +// (move to GPR, nothing else needed) +def : 
Pat<(i64 (bitconvert f64:$S)), + (i64 (MFVSRD $S))>; + +// bitconvert i64 -> f64 +// (move to FPR, nothing else needed) +def : Pat<(f64 (bitconvert i64:$S)), + (f64 (MTVSRD $S))>; +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp index b4e1c09..e3a35d5 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopDataPrefetch.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -71,10 +72,10 @@ namespace { AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); // FIXME: For some reason, preserving SE here breaks LSR (even if // this pass changes nothing). - // AU.addPreserved<ScalarEvolution>(); + // AU.addPreserved<ScalarEvolutionWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); } @@ -96,7 +97,7 @@ INITIALIZE_PASS_BEGIN(PPCLoopDataPrefetch, "ppc-loop-data-prefetch", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCLoopDataPrefetch, "ppc-loop-data-prefetch", "PPC Loop Data Prefetch", false, false) @@ -104,7 +105,7 @@ FunctionPass *llvm::createPPCLoopDataPrefetchPass() { return new PPCLoopDataPref bool PPCLoopDataPrefetch::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); DL = &F.getParent()->getDataLayout(); AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp index b6e7799..5e18826 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp @@ -73,7 +73,7 @@ namespace { AU.addPreserved<DominatorTreeWrapperPass>(); AU.addRequired<LoopInfoWrapperPass>(); AU.addPreserved<LoopInfoWrapperPass>(); - AU.addRequired<ScalarEvolution>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); } bool runOnFunction(Function &F) override; @@ -84,8 +84,10 @@ namespace { private: PPCTargetMachine *TM; + DominatorTree *DT; LoopInfo *LI; ScalarEvolution *SE; + bool PreserveLCSSA; }; } @@ -93,7 +95,7 @@ char PPCLoopPreIncPrep::ID = 0; static const char *name = "Prepare loop for pre-inc. 
addressing modes"; INITIALIZE_PASS_BEGIN(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { @@ -101,17 +103,20 @@ FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { } namespace { - struct SCEVLess : std::binary_function<const SCEV *, const SCEV *, bool> - { - SCEVLess(ScalarEvolution *SE) : SE(SE) {} + struct BucketElement { + BucketElement(const SCEVConstant *O, Instruction *I) : Offset(O), Instr(I) {} + BucketElement(Instruction *I) : Offset(nullptr), Instr(I) {} - bool operator() (const SCEV *X, const SCEV *Y) const { - const SCEV *Diff = SE->getMinusSCEV(X, Y); - return cast<SCEVConstant>(Diff)->getValue()->getSExtValue() < 0; - } + const SCEVConstant *Offset; + Instruction *Instr; + }; - protected: - ScalarEvolution *SE; + struct Bucket { + Bucket(const SCEV *B, Instruction *I) : BaseSCEV(B), + Elements(1, BucketElement(I)) {} + + const SCEV *BaseSCEV; + SmallVector<BucketElement, 16> Elements; }; } @@ -140,7 +145,10 @@ static Value *GetPointerOperand(Value *MemI) { bool PPCLoopPreIncPrep::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - SE = &getAnalysis<ScalarEvolution>(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); bool MadeChange = false; @@ -169,7 +177,6 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { std::distance(pred_begin(Header), pred_end(Header)); // Collect buckets of comparable addresses used by loads and stores. - typedef std::multimap<const SCEV *, Instruction *, SCEVLess> Bucket; SmallVector<Bucket, 16> Buckets; for (Loop::block_iterator I = L->block_begin(), IE = L->block_end(); I != IE; ++I) { @@ -212,25 +219,24 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { } bool FoundBucket = false; - for (unsigned i = 0, e = Buckets.size(); i != e; ++i) - for (Bucket::iterator K = Buckets[i].begin(), KE = Buckets[i].end(); - K != KE; ++K) { - const SCEV *Diff = SE->getMinusSCEV(K->first, LSCEV); - if (isa<SCEVConstant>(Diff)) { - Buckets[i].insert(std::make_pair(LSCEV, MemI)); - FoundBucket = true; - break; - } + for (auto &B : Buckets) { + const SCEV *Diff = SE->getMinusSCEV(LSCEV, B.BaseSCEV); + if (const auto *CDiff = dyn_cast<SCEVConstant>(Diff)) { + B.Elements.push_back(BucketElement(CDiff, MemI)); + FoundBucket = true; + break; } + } if (!FoundBucket) { - Buckets.push_back(Bucket(SCEVLess(SE))); - Buckets[Buckets.size()-1].insert(std::make_pair(LSCEV, MemI)); + if (Buckets.size() == MaxVars) + return MadeChange; + Buckets.push_back(Bucket(LSCEV, MemI)); } } } - if (Buckets.empty() || Buckets.size() > MaxVars) + if (Buckets.empty()) return MadeChange; BasicBlock *LoopPredecessor = L->getLoopPredecessor(); @@ -239,7 +245,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { // iteration space), insert a new preheader for the loop. 
if (!LoopPredecessor || !LoopPredecessor->getTerminator()->getType()->isVoidTy()) { - LoopPredecessor = InsertPreheaderForLoop(L, this); + LoopPredecessor = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA); if (LoopPredecessor) MadeChange = true; } @@ -253,8 +259,45 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { // The base address of each bucket is transformed into a phi and the others // are rewritten as offsets of that variable. + // We have a choice now of which instruction's memory operand we use as the + // base for the generated PHI. Always picking the first instruction in each + // bucket does not work well, specifically because that instruction might + // be a prefetch (and there are no pre-increment dcbt variants). Otherwise, + // the choice is somewhat arbitrary, because the backend will happily + // generate direct offsets from both the pre-incremented and + // post-incremented pointer values. Thus, we'll pick the first non-prefetch + // instruction in each bucket, and adjust the recurrence and other offsets + // accordingly. + for (int j = 0, je = Buckets[i].Elements.size(); j != je; ++j) { + if (auto *II = dyn_cast<IntrinsicInst>(Buckets[i].Elements[j].Instr)) + if (II->getIntrinsicID() == Intrinsic::prefetch) + continue; + + // If we'd otherwise pick the first element anyway, there's nothing to do. + if (j == 0) + break; + + // If our chosen element has no offset from the base pointer, there's + // nothing to do. + if (!Buckets[i].Elements[j].Offset || + Buckets[i].Elements[j].Offset->isZero()) + break; + + const SCEV *Offset = Buckets[i].Elements[j].Offset; + Buckets[i].BaseSCEV = SE->getAddExpr(Buckets[i].BaseSCEV, Offset); + for (auto &E : Buckets[i].Elements) { + if (E.Offset) + E.Offset = cast<SCEVConstant>(SE->getMinusSCEV(E.Offset, Offset)); + else + E.Offset = cast<SCEVConstant>(SE->getNegativeSCEV(Offset)); + } + + std::swap(Buckets[i].Elements[j], Buckets[i].Elements[0]); + break; + } + const SCEVAddRecExpr *BasePtrSCEV = - cast<SCEVAddRecExpr>(Buckets[i].begin()->first); + cast<SCEVAddRecExpr>(Buckets[i].BaseSCEV); if (!BasePtrSCEV->isAffine()) continue; @@ -262,7 +305,9 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { assert(BasePtrSCEV->getLoop() == L && "AddRec for the wrong loop?"); - Instruction *MemI = Buckets[i].begin()->second; + // The instruction corresponding to the Bucket's BaseSCEV must be the first + // in the vector of elements. + Instruction *MemI = Buckets[i].Elements.begin()->Instr; Value *BasePtr = GetPointerOperand(MemI); assert(BasePtr && "No pointer operand"); @@ -302,7 +347,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { NewPHI->addIncoming(BasePtrStart, LoopPredecessor); } - Instruction *InsPoint = Header->getFirstInsertionPt(); + Instruction *InsPoint = &*Header->getFirstInsertionPt(); GetElementPtrInst *PtrInc = GetElementPtrInst::Create( I8Ty, NewPHI, BasePtrIncSCEV->getValue(), MemI->hasName() ? MemI->getName() + ".inc" : "", InsPoint); @@ -327,18 +372,20 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { BasePtr->replaceAllUsesWith(NewBasePtr); RecursivelyDeleteTriviallyDeadInstructions(BasePtr); - Value *LastNewPtr = NewBasePtr; - for (Bucket::iterator I = std::next(Buckets[i].begin()), - IE = Buckets[i].end(); I != IE; ++I) { - Value *Ptr = GetPointerOperand(I->second); + // Keep track of the replacement pointer values we've inserted so that we + // don't generate more pointer values than necessary. 
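  // For example, an element whose pointer operand has already been rewritten
  // to one of the values in NewPtrs (NewBasePtr, or a RealNewPtr created
  // below) is simply skipped rather than being given a redundant GEP of its
  // own.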
+ SmallPtrSet<Value *, 16> NewPtrs; + NewPtrs.insert( NewBasePtr); + + for (auto I = std::next(Buckets[i].Elements.begin()), + IE = Buckets[i].Elements.end(); I != IE; ++I) { + Value *Ptr = GetPointerOperand(I->Instr); assert(Ptr && "No pointer operand"); - if (Ptr == LastNewPtr) + if (NewPtrs.count(Ptr)) continue; Instruction *RealNewPtr; - const SCEVConstant *Diff = - cast<SCEVConstant>(SE->getMinusSCEV(I->first, BasePtrSCEV)); - if (Diff->isZero()) { + if (!I->Offset || I->Offset->getValue()->isZero()) { RealNewPtr = NewBasePtr; } else { Instruction *PtrIP = dyn_cast<Instruction>(Ptr); @@ -346,13 +393,13 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { cast<Instruction>(NewBasePtr)->getParent() == PtrIP->getParent()) PtrIP = 0; else if (isa<PHINode>(PtrIP)) - PtrIP = PtrIP->getParent()->getFirstInsertionPt(); + PtrIP = &*PtrIP->getParent()->getFirstInsertionPt(); else if (!PtrIP) - PtrIP = I->second; + PtrIP = I->Instr; GetElementPtrInst *NewPtr = GetElementPtrInst::Create( - I8Ty, PtrInc, Diff->getValue(), - I->second->hasName() ? I->second->getName() + ".off" : "", PtrIP); + I8Ty, PtrInc, I->Offset->getValue(), + I->Instr->hasName() ? I->Instr->getName() + ".off" : "", PtrIP); if (!PtrIP) NewPtr->insertAfter(cast<Instruction>(PtrInc)); NewPtr->setIsInBounds(IsPtrInBounds(Ptr)); @@ -373,7 +420,7 @@ bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { Ptr->replaceAllUsesWith(ReplNewPtr); RecursivelyDeleteTriviallyDeadInstructions(Ptr); - LastNewPtr = RealNewPtr; + NewPtrs.insert(RealNewPtr); } MadeChange = true; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp index 76837ec..44a692d 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -38,7 +38,7 @@ static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) { static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){ const TargetMachine &TM = AP.TM; Mangler *Mang = AP.Mang; - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = AP.getDataLayout(); MCContext &Ctx = AP.OutContext; bool isDarwin = TM.getTargetTriple().isOSDarwin(); @@ -51,13 +51,13 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){ Suffix = "$non_lazy_ptr"; if (!Suffix.empty()) - Name += DL->getPrivateGlobalPrefix(); + Name += DL.getPrivateGlobalPrefix(); unsigned PrefixLen = Name.size(); if (!MO.isGlobal()) { assert(MO.isSymbol() && "Isn't a symbol reference"); - Mangler::getNameWithPrefix(Name, MO.getSymbolName(), *DL); + Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL); } else { const GlobalValue *GV = MO.getGlobal(); TM.getNameWithPrefix(Name, GV, *Mang); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp new file mode 100644 index 0000000..fe339d7 --- /dev/null +++ b/contrib/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -0,0 +1,230 @@ +//===-------------- PPCMIPeephole.cpp - MI Peephole Cleanups -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// This pass performs peephole optimizations to clean up ugly code +// sequences at the MachineInstruction layer. It runs at the end of +// the SSA phases, following VSX swap removal. 
A pass of dead code +// elimination follows this one for quick clean-up of any dead +// instructions introduced here. Although we could do this as callbacks +// from the generic peephole pass, this would have a couple of bad +// effects: it might remove optimization opportunities for VSX swap +// removal, and it would miss cleanups made possible following VSX +// swap removal. +// +//===---------------------------------------------------------------------===// + +#include "PPCInstrInfo.h" +#include "PPC.h" +#include "PPCInstrBuilder.h" +#include "PPCTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "ppc-mi-peepholes" + +namespace llvm { + void initializePPCMIPeepholePass(PassRegistry&); +} + +namespace { + +struct PPCMIPeephole : public MachineFunctionPass { + + static char ID; + const PPCInstrInfo *TII; + MachineFunction *MF; + MachineRegisterInfo *MRI; + + PPCMIPeephole() : MachineFunctionPass(ID) { + initializePPCMIPeepholePass(*PassRegistry::getPassRegistry()); + } + +private: + // Initialize class variables. + void initialize(MachineFunction &MFParm); + + // Perform peepholes. + bool simplifyCode(void); + + // Find the "true" register represented by SrcReg (following chains + // of copies and subreg_to_reg operations). + unsigned lookThruCopyLike(unsigned SrcReg); + +public: + // Main entry point for this pass. + bool runOnMachineFunction(MachineFunction &MF) override { + initialize(MF); + return simplifyCode(); + } +}; + +// Initialize class variables. +void PPCMIPeephole::initialize(MachineFunction &MFParm) { + MF = &MFParm; + MRI = &MF->getRegInfo(); + TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo(); + DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n"); + DEBUG(MF->dump()); +} + +// Perform peephole optimizations. +bool PPCMIPeephole::simplifyCode(void) { + bool Simplified = false; + MachineInstr* ToErase = nullptr; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + + // If the previous instruction was marked for elimination, + // remove it now. + if (ToErase) { + ToErase->eraseFromParent(); + ToErase = nullptr; + } + + // Ignore debug instructions. + if (MI.isDebugValue()) + continue; + + // Per-opcode peepholes. + switch (MI.getOpcode()) { + + default: + break; + + case PPC::XXPERMDI: { + // Perform simplifications of 2x64 vector swaps and splats. + // A swap is identified by an immediate value of 2, and a splat + // is identified by an immediate value of 0 or 3. + int Immed = MI.getOperand(3).getImm(); + + if (Immed != 1) { + + // For each of these simplifications, we need the two source + // regs to match. Unfortunately, MachineCSE ignores COPY and + // SUBREG_TO_REG, so for example we can see + // XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed. + // We have to look through chains of COPY and SUBREG_TO_REG + // to find the real source values for comparison. + unsigned TrueReg1 = lookThruCopyLike(MI.getOperand(1).getReg()); + unsigned TrueReg2 = lookThruCopyLike(MI.getOperand(2).getReg()); + + if (TrueReg1 == TrueReg2 + && TargetRegisterInfo::isVirtualRegister(TrueReg1)) { + MachineInstr *DefMI = MRI->getVRegDef(TrueReg1); + + // If this is a splat or a swap fed by another splat, we + // can replace it with a copy. 
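          // For example (register names invented for illustration): if %t is
          // defined by XXPERMDI %s, %s, 2 and %s is itself XXPERMDI %u, %u, 0,
          // then %s already holds the same doubleword in both halves, so the
          // swap changes nothing and %t can be rewritten as a COPY of %s.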
+ if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) { + unsigned FeedImmed = DefMI->getOperand(3).getImm(); + unsigned FeedReg1 + = lookThruCopyLike(DefMI->getOperand(1).getReg()); + unsigned FeedReg2 + = lookThruCopyLike(DefMI->getOperand(2).getReg()); + + if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) { + DEBUG(dbgs() + << "Optimizing splat/swap or splat/splat " + "to splat/copy: "); + DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), + TII->get(PPC::COPY), MI.getOperand(0).getReg()) + .addOperand(MI.getOperand(1)); + ToErase = &MI; + Simplified = true; + } + + // If this is a splat fed by a swap, we can simplify modify + // the splat to splat the other value from the swap's input + // parameter. + else if ((Immed == 0 || Immed == 3) + && FeedImmed == 2 && FeedReg1 == FeedReg2) { + DEBUG(dbgs() << "Optimizing swap/splat => splat: "); + DEBUG(MI.dump()); + MI.getOperand(1).setReg(DefMI->getOperand(1).getReg()); + MI.getOperand(2).setReg(DefMI->getOperand(2).getReg()); + MI.getOperand(3).setImm(3 - Immed); + Simplified = true; + } + + // If this is a swap fed by a swap, we can replace it + // with a copy from the first swap's input. + else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) { + DEBUG(dbgs() << "Optimizing swap/swap => copy: "); + DEBUG(MI.dump()); + BuildMI(MBB, &MI, MI.getDebugLoc(), + TII->get(PPC::COPY), MI.getOperand(0).getReg()) + .addOperand(DefMI->getOperand(1)); + ToErase = &MI; + Simplified = true; + } + } + } + } + break; + } + } + } + + // If the last instruction was marked for elimination, + // remove it now. + if (ToErase) { + ToErase->eraseFromParent(); + ToErase = nullptr; + } + } + + return Simplified; +} + +// This is used to find the "true" source register for an +// XXPERMDI instruction, since MachineCSE does not handle the +// "copy-like" operations (Copy and SubregToReg). Returns +// the original SrcReg unless it is the target of a copy-like +// operation, in which case we chain backwards through all +// such operations to the ultimate source register. If a +// physical register is encountered, we stop the search. 
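// Illustrative example (virtual register names are invented):
//   %a = <non-copy-like def>
//   %b = SUBREG_TO_REG 1, %a, sub_64
//   %c = COPY %b
// lookThruCopyLike(%c) follows the COPY to %b and the SUBREG_TO_REG to %a,
// and returns %a because %a's definition is not copy-like; had the chain
// reached a physical register instead, that physical register would be
// returned and the search would stop there.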
+unsigned PPCMIPeephole::lookThruCopyLike(unsigned SrcReg) { + + while (true) { + + MachineInstr *MI = MRI->getVRegDef(SrcReg); + if (!MI->isCopyLike()) + return SrcReg; + + unsigned CopySrcReg; + if (MI->isCopy()) + CopySrcReg = MI->getOperand(1).getReg(); + else { + assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike"); + CopySrcReg = MI->getOperand(2).getReg(); + } + + if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) + return CopySrcReg; + + SrcReg = CopySrcReg; + } +} + +} // end default namespace + +INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE, + "PowerPC MI Peephole Optimization", false, false) +INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE, + "PowerPC MI Peephole Optimization", false, false) + +char PPCMIPeephole::ID = 0; +FunctionPass* +llvm::createPPCMIPeepholePass() { return new PPCMIPeephole(); } + diff --git a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp index ec4e0a5..95f1631 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp @@ -18,8 +18,8 @@ using namespace llvm; void PPCFunctionInfo::anchor() { } MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const { - const DataLayout *DL = MF.getTarget().getDataLayout(); - return MF.getContext().getOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix()) + + const DataLayout &DL = MF.getDataLayout(); + return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) + Twine(MF.getFunctionNumber()) + "$poff"); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 2b09b2f..934bdf6 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -200,7 +200,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(PPC::R2); // System-reserved register Reserved.set(PPC::R13); // Small Data Area pointer register } - + // On PPC64, r13 is the thread pointer. Never allocate this register. if (TM.isPPC64()) { Reserved.set(PPC::R13); @@ -262,7 +262,7 @@ unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, default: return 0; case PPC::G8RC_NOX0RegClassID: - case PPC::GPRC_NOR0RegClassID: + case PPC::GPRC_NOR0RegClassID: case PPC::G8RCRegClassID: case PPC::GPRCRegClassID: { unsigned FP = TFI->hasFP(MF) ? 1 : 0; @@ -311,7 +311,7 @@ PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, //===----------------------------------------------------------------------===// /// lowerDynamicAlloc - Generate the code for allocating an object in the -/// current frame. The sequence of code with be in the general form +/// current frame. The sequence of code will be in the general form /// /// addi R0, SP, \#frameSize ; get the address of the previous frame /// stwxu R0, SP, Rnegsize ; add and update the SP with the negated size @@ -337,7 +337,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); // Get the total frame size. unsigned FrameSize = MFI->getStackSize(); - + // Get stack alignments. const PPCFrameLowering *TFI = getFrameLowering(MF); unsigned TargetAlign = TFI->getStackAlignment(); @@ -347,14 +347,14 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { // Determine the previous frame's address. 
If FrameSize can't be // represented as 16 bits or we need special alignment, then we load the - // previous frame's address from 0(SP). Why not do an addis of the hi? - // Because R0 is our only safe tmp register and addi/addis treat R0 as zero. - // Constructing the constant and adding would take 3 instructions. + // previous frame's address from 0(SP). Why not do an addis of the hi? + // Because R0 is our only safe tmp register and addi/addis treat R0 as zero. + // Constructing the constant and adding would take 3 instructions. // Fortunately, a frame greater than 32K is rare. const TargetRegisterClass *G8RC = &PPC::G8RCRegClass; const TargetRegisterClass *GPRC = &PPC::GPRCRegClass; unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC); - + if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) { BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg) .addReg(PPC::R31) @@ -425,11 +425,32 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { .addReg(PPC::R1) .addImm(maxCallFrameSize); } - + // Discard the DYNALLOC instruction. MBB.erase(II); } +void PPCRegisterInfo::lowerDynamicAreaOffset( + MachineBasicBlock::iterator II) const { + // Get the instruction. + MachineInstr &MI = *II; + // Get the instruction's basic block. + MachineBasicBlock &MBB = *MI.getParent(); + // Get the basic block's function. + MachineFunction &MF = *MBB.getParent(); + // Get the frame info. + MachineFrameInfo *MFI = MF.getFrameInfo(); + const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); + // Get the instruction info. + const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); + + unsigned maxCallFrameSize = MFI->getMaxCallFrameSize(); + DebugLoc dl = MI.getDebugLoc(); + BuildMI(MBB, II, dl, TII.get(PPC::LI), MI.getOperand(0).getReg()) + .addImm(maxCallFrameSize); + MBB.erase(II); +} + /// lowerCRSpilling - Generate the code for spilling a CR register. Instead of /// reserving a whole register (R0), we scrounge for one here. This generates /// code like this: @@ -459,8 +480,8 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II, // We need to store the CR in the low 4-bits of the saved value. First, issue // an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg. BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg) - .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); - + .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); + // If the saved register wasn't CR0, shift the bits left so that they are in // CR0's slot. if (SrcReg != PPC::CR0) { @@ -549,8 +570,8 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II, .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg) - .addReg(getCRFromCRBit(SrcReg)); - + .addReg(getCRFromCRBit(SrcReg)); + // If the saved register wasn't CR0LT, shift the bits left so that the bit to // store is the first one. Mask all but that bit. unsigned Reg1 = Reg; @@ -602,17 +623,19 @@ void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II, unsigned ShiftBits = getEncodingValue(DestReg); // rlwimi r11, r10, 32-ShiftBits, ..., ... BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWIMI8 : PPC::RLWIMI), RegO) - .addReg(RegO, RegState::Kill).addReg(Reg, RegState::Kill) - .addImm(ShiftBits ? 32-ShiftBits : 0) - .addImm(ShiftBits).addImm(ShiftBits); - + .addReg(RegO, RegState::Kill) + .addReg(Reg, RegState::Kill) + .addImm(ShiftBits ? 
32 - ShiftBits : 0) + .addImm(ShiftBits) + .addImm(ShiftBits); + BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF), getCRFromCRBit(DestReg)) - .addReg(RegO, RegState::Kill) - // Make sure we have a use dependency all the way through this - // sequence of instructions. We can't have the other bits in the CR - // modified in between the mfocrf and the mtocrf. - .addReg(getCRFromCRBit(DestReg), RegState::Implicit); + .addReg(RegO, RegState::Kill) + // Make sure we have a use dependency all the way through this + // sequence of instructions. We can't have the other bits in the CR + // modified in between the mfocrf and the mtocrf. + .addReg(getCRFromCRBit(DestReg), RegState::Implicit); // Discard the pseudo instruction. MBB.erase(II); @@ -634,11 +657,11 @@ void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II, unsigned SrcReg = MI.getOperand(0).getReg(); BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg) - .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); - - addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::STW)) - .addReg(Reg, RegState::Kill), - FrameIndex); + .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill())); + + addFrameReference( + BuildMI(MBB, II, dl, TII.get(PPC::STW)).addReg(Reg, RegState::Kill), + FrameIndex); // Discard the pseudo instruction. MBB.erase(II); @@ -671,9 +694,8 @@ void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II, MBB.erase(II); } -bool -PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, - unsigned Reg, int &FrameIdx) const { +bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, + unsigned Reg, int &FrameIdx) const { const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>(); // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4 // ABI, return true to prevent allocating an additional frame slot. @@ -752,7 +774,12 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int FPSI = FI->getFramePointerSaveIndex(); // Get the instruction opcode. unsigned OpC = MI.getOpcode(); - + + if ((OpC == PPC::DYNAREAOFFSET || OpC == PPC::DYNAREAOFFSET8)) { + lowerDynamicAreaOffset(II); + return; + } + // Special case for dynamic alloca. if (FPSI && FrameIndex == FPSI && (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) { @@ -800,8 +827,9 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we're not using a Frame Pointer that has been set to the value of the // SP before having the stack size subtracted from it, then add the stack size // to Offset to get the correct offset. - // Naked functions have stack size 0, although getStackSize may not reflect that - // because we didn't call all the pieces that compute it for naked functions. + // Naked functions have stack size 0, although getStackSize may not reflect + // that because we didn't call all the pieces that compute it for naked + // functions. 
if (!MF.getFunction()->hasFnAttribute(Attribute::Naked)) { if (!(hasBasePointer(MF) && FrameIndex < 0)) Offset += MFI->getStackSize(); @@ -840,7 +868,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, .addImm(Offset); // Convert into indexed form of the instruction: - // + // // sth 0:rA, 1:imm 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0 // addi 0:rA 1:rB, 2, imm ==> add 0:rA, 1:rB, 2:r0 unsigned OperandBase; @@ -898,24 +926,6 @@ bool PPCRegisterInfo::hasBasePointer(const MachineFunction &MF) const { return needsStackRealignment(MF); } -bool PPCRegisterInfo::canRealignStack(const MachineFunction &MF) const { - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) - return false; - - return true; -} - -bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const PPCFrameLowering *TFI = getFrameLowering(MF); - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || - F->hasFnAttribute(Attribute::StackAlignment)); - - return requiresRealignment && canRealignStack(MF); -} - /// Returns true if the instruction's frame index /// reference would be better served by a base register other than FP /// or SP. Used by LocalStackFrameAllocation to determine which frame index diff --git a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index d304e1d..b15fde8 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -54,13 +54,13 @@ inline static unsigned getCRFromCRBit(unsigned SrcReg) { return Reg; } - class PPCRegisterInfo : public PPCGenRegisterInfo { DenseMap<unsigned, unsigned> ImmToIdxMap; const PPCTargetMachine &TM; + public: PPCRegisterInfo(const PPCTargetMachine &TM); - + /// getPointerRegClass - Return the register class to use to hold pointers. /// This is used for addressing modes. const TargetRegisterClass * @@ -77,7 +77,7 @@ public: const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const override; - const uint32_t *getNoPreservedMask() const; + const uint32_t *getNoPreservedMask() const override; void adjustStackMapLiveOutMask(uint32_t *Mask) const override; @@ -101,6 +101,7 @@ public: } void lowerDynamicAlloc(MachineBasicBlock::iterator II) const; + void lowerDynamicAreaOffset(MachineBasicBlock::iterator II) const; void lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex) const; void lowerCRRestore(MachineBasicBlock::iterator II, @@ -115,9 +116,9 @@ public: unsigned FrameIndex) const; bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, - int &FrameIdx) const override; - void eliminateFrameIndex(MachineBasicBlock::iterator II, - int SPAdj, unsigned FIOperandNum, + int &FrameIdx) const override; + void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, + unsigned FIOperandNum, RegScavenger *RS = nullptr) const override; // Support for virtual base registers. @@ -136,8 +137,6 @@ public: // Base pointer (stack realignment) support. 
unsigned getBaseRegister(const MachineFunction &MF) const; bool hasBasePointer(const MachineFunction &MF) const; - bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index 58dacca..c0fcb6c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -62,6 +62,7 @@ void PPCSubtarget::initializeEnvironment() { Has64BitSupport = false; Use64BitRegs = false; UseCRBits = false; + UseSoftFloat = false; HasAltivec = false; HasSPE = false; HasQPX = false; @@ -100,6 +101,8 @@ void PPCSubtarget::initializeEnvironment() { HasDirectMove = false; IsQPXStackUnaligned = false; HasHTM = false; + HasFusion = false; + HasFloat128 = false; } void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { @@ -210,5 +213,33 @@ bool PPCSubtarget::enableSubRegLiveness() const { return UseSubRegLiveness; } +unsigned char PPCSubtarget::classifyGlobalReference( + const GlobalValue *GV) const { + // Note that currently we don't generate non-pic references. + // If a caller wants that, this will have to be updated. + + // Large code model always uses the TOC even for local symbols. + if (TM.getCodeModel() == CodeModel::Large) + return PPCII::MO_PIC_FLAG | PPCII::MO_NLP_FLAG; + + unsigned char flags = PPCII::MO_PIC_FLAG; + + // Only if the relocation mode is PIC do we have to worry about + // interposition. In all other cases we can use a slightly looser standard to + // decide how to access the symbol. + if (TM.getRelocationModel() == Reloc::PIC_) { + // If it's local, or it's non-default, it can't be interposed. + if (!GV->hasLocalLinkage() && + GV->hasDefaultVisibility()) { + flags |= PPCII::MO_NLP_FLAG; + } + return flags; + } + + if (GV->isStrongDefinitionForLinker()) + return flags; + return flags | PPCII::MO_NLP_FLAG; +} + bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); } bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h index 0616c1f..4f5c95c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -83,6 +83,7 @@ protected: bool Has64BitSupport; bool Use64BitRegs; bool UseCRBits; + bool UseSoftFloat; bool IsPPC64; bool HasAltivec; bool HasSPE; @@ -119,6 +120,8 @@ protected: bool HasPartwordAtomics; bool HasDirectMove; bool HasHTM; + bool HasFusion; + bool HasFloat128; /// When targeting QPX running a stock PPC64 Linux kernel where the stack /// alignment has not been changed, we need to keep the 16-byte alignment @@ -188,6 +191,8 @@ public: /// has64BitSupport - Return true if the selected CPU supports 64-bit /// instructions, regardless of whether we are in 32-bit or 64-bit mode. bool has64BitSupport() const { return Has64BitSupport; } + // useSoftFloat - Return true if soft-float option is turned on. + bool useSoftFloat() const { return UseSoftFloat; } /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit /// registers in 32-bit mode when possible. 
This can only true if @@ -254,6 +259,8 @@ public: return 16; } bool hasHTM() const { return HasHTM; } + bool hasFusion() const { return HasFusion; } + bool hasFloat128() const { return HasFloat128; } const Triple &getTargetTriple() const { return TargetTriple; } @@ -285,6 +292,10 @@ public: bool useAA() const override; bool enableSubRegLiveness() const override; + + /// classifyGlobalReference - Classify a global variable reference for the + /// current subtarget accourding to how we should reference it. + unsigned char classifyGlobalReference(const GlobalValue *GV) const; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 1daf244..d24b590 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -42,6 +42,10 @@ static cl:: opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden, cl::desc("Disable VSX Swap Removal for PPC")); +static cl:: +opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden, + cl::desc("Disable machine peepholes for PPC")); + static cl::opt<bool> EnableGEPOpt("ppc-gep-opt", cl::Hidden, cl::desc("Enable optimizations on complex GEPs"), @@ -57,11 +61,19 @@ EnableExtraTOCRegDeps("enable-ppc-extra-toc-reg-deps", cl::desc("Add extra TOC register dependencies"), cl::init(true), cl::Hidden); +static cl::opt<bool> +EnableMachineCombinerPass("ppc-machine-combiner", + cl::desc("Enable the machine combiner pass"), + cl::init(true), cl::Hidden); + extern "C" void LLVMInitializePowerPCTarget() { // Register the targets RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target); RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target); RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget); + + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializePPCBoolRetToIntPass(PR); } /// Return the datalayout string of a subtarget. @@ -118,7 +130,7 @@ static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL, } if (OL != CodeGenOpt::None) { - if (!FullFS.empty()) + if (!FullFS.empty()) FullFS = "+invariant-function-descriptors," + FullFS; else FullFS = "+invariant-function-descriptors"; @@ -144,7 +156,7 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, return PPCTargetMachine::PPC_ABI_ELFv2; assert(Options.MCOptions.getABIName().empty() && - "Unknown target-abi option!"); + "Unknown target-abi option!"); if (!TT.isMacOSX()) { switch (TT.getArch()) { @@ -160,9 +172,9 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT, return PPCTargetMachine::PPC_ABI_UNKNOWN; } -// The FeatureString here is a little subtle. We are modifying the feature string -// with what are (currently) non-function specific overrides as it goes into the -// LLVMTargetMachine constructor and then using the stored value in the +// The FeatureString here is a little subtle. We are modifying the feature +// string with what are (currently) non-function specific overrides as it goes +// into the LLVMTargetMachine constructor and then using the stored value in the // Subtarget constructor below it. PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -227,6 +239,19 @@ PPCTargetMachine::getSubtargetImpl(const Function &F) const { ? 
FSAttr.getValueAsString().str() : TargetFS; + // FIXME: This is related to the code below to reset the target options, + // we need to know whether or not the soft float flag is set on the + // function before we can generate a subtarget. We also need to use + // it as a key for the subtarget since that can be the only difference + // between two functions. + bool SoftFloat = + F.hasFnAttribute("use-soft-float") && + F.getFnAttribute("use-soft-float").getValueAsString() == "true"; + // If the soft float attribute is set on the function turn on the soft float + // subtarget feature. + if (SoftFloat) + FS += FS.empty() ? "+soft-float" : ",+soft-float"; + auto &I = SubtargetMap[CPU + FS]; if (!I) { // This needs to be done before we create a new subtarget since any @@ -277,6 +302,8 @@ TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) { } void PPCPassConfig::addIRPasses() { + if (TM->getOptLevel() != CodeGenOpt::None) + addPass(createPPCBoolRetToIntPass()); addPass(createAtomicExpandPass(&getPPCTargetMachine())); // For the BG/Q (or if explicitly requested), add explicit data prefetch @@ -316,6 +343,10 @@ bool PPCPassConfig::addPreISel() { bool PPCPassConfig::addILPOpts() { addPass(&EarlyIfConverterID); + + if (EnableMachineCombinerPass) + addPass(&MachineCombinerID); + return true; } @@ -339,6 +370,12 @@ void PPCPassConfig::addMachineSSAOptimization() { if (TM->getTargetTriple().getArch() == Triple::ppc64le && !DisableVSXSwapRemoval) addPass(createPPCVSXSwapRemovalPass()); + // Target-specific peephole cleanups performed after instruction + // selection. + if (!DisableMIPeephole) { + addPass(createPPCMIPeepholePass()); + addPass(&DeadMachineInstructionElimID); + } } void PPCPassConfig::addPreRegAlloc() { @@ -364,6 +401,7 @@ void PPCPassConfig::addPreEmitPass() { } TargetIRAnalysis PPCTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &F) { return TargetTransformInfo(PPCTTIImpl(this, F)); }); + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(PPCTTIImpl(this, F)); + }); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp index 9ee5db9..798bb9d 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp @@ -42,9 +42,7 @@ MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal( if (Kind.isReadOnly()) { const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV); - if (GVar && GVar->isConstant() && - (GVar->getInitializer()->getRelocationInfo() == - Constant::GlobalRelocations)) + if (GVar && GVar->isConstant() && GVar->getInitializer()->needsRelocation()) Kind = SectionKind::getReadOnlyWithRel(); } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index e21c2b7..cd86dab 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -35,7 +35,7 @@ PPCTTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } -unsigned PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { if (DisablePPCConstHoist) return BaseT::getIntImmCost(Imm, Ty); @@ -64,8 +64,8 @@ unsigned PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { return 4 * TTI::TCC_Basic; } -unsigned PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt 
&Imm, Type *Ty) { +int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) { if (DisablePPCConstHoist) return BaseT::getIntImmCost(IID, Idx, Imm, Ty); @@ -98,8 +98,8 @@ unsigned PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, return PPCTTIImpl::getIntImmCost(Imm, Ty); } -unsigned PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) { if (DisablePPCConstHoist) return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty); @@ -197,9 +197,20 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, } bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) { + // On the A2, always unroll aggressively. For QPX unaligned loads, we depend + // on combining the loads generated for consecutive accesses, and failure to + // do so is particularly expensive. This makes it much more likely (compared + // to only using concatenation unrolling). + if (ST->getDarwinDirective() == PPC::DIR_A2) + return true; + return LoopHasReductions; } +bool PPCTTIImpl::enableInterleavedAccessVectorization() { + return true; +} + unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) { if (Vector && !ST->hasAltivec() && !ST->hasQPX()) return 0; @@ -246,7 +257,7 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { return 2; } -unsigned PPCTTIImpl::getArithmeticInstrCost( +int PPCTTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { @@ -257,24 +268,30 @@ unsigned PPCTTIImpl::getArithmeticInstrCost( Opd1PropInfo, Opd2PropInfo); } -unsigned PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + + // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations + // (at least in the sense that there need only be one non-loop-invariant + // instruction). We need one such shuffle instruction for each actual + // register (this is not true for arbitrary shuffles, but is true for the + // structured types of shuffles covered by TTI::ShuffleKind). 
+ return LT.first; } -unsigned PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode"); return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { +int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); int ISD = TLI->InstructionOpcodeToISD(Opcode); @@ -313,41 +330,83 @@ unsigned PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return BaseT::getVectorInstrCost(Opcode, Val, Index); } -unsigned PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { +int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); - unsigned Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); - - // VSX loads/stores support unaligned access. - if (ST->hasVSX()) { - if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64) - return Cost; - } + int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); - bool UnalignedAltivec = - Src->isVectorTy() && - Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() && - LT.second.getSizeInBits() == 128 && - Opcode == Instruction::Load; + // Aligned loads and stores are easy. + unsigned SrcBytes = LT.second.getStoreSize(); + if (!SrcBytes || !Alignment || Alignment >= SrcBytes) + return Cost; + + bool IsAltivecType = ST->hasAltivec() && + (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 || + LT.second == MVT::v4i32 || LT.second == MVT::v4f32); + bool IsVSXType = ST->hasVSX() && + (LT.second == MVT::v2f64 || LT.second == MVT::v2i64); + bool IsQPXType = ST->hasQPX() && + (LT.second == MVT::v4f64 || LT.second == MVT::v4f32); + + // If we can use the permutation-based load sequence, then this is also + // relatively cheap (not counting loop-invariant instructions): one load plus + // one permute (the last load in a series has extra cost, but we're + // neglecting that here). Note that on the P7, we should do unaligned loads + // for Altivec types using the VSX instructions, but that's more expensive + // than using the permutation-based load sequence. On the P8, that's no + // longer true. + if (Opcode == Instruction::Load && + ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) && + Alignment >= LT.second.getScalarType().getStoreSize()) + return Cost + LT.first; // Add the cost of the permutations. + + // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the + // P7, unaligned vector loads are more expensive than the permutation-based + // load sequence, so that might be used instead, but regardless, the net cost + // is about the same (not counting loop-invariant instructions). + if (IsVSXType || (ST->hasVSX() && IsAltivecType)) + return Cost; // PPC in general does not support unaligned loads and stores. 
They'll need // to be decomposed based on the alignment factor. - unsigned SrcBytes = LT.second.getStoreSize(); - if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) { - Cost += LT.first*(SrcBytes/Alignment-1); - - // For a vector type, there is also scalarization overhead (only for - // stores, loads are expanded using the vector-load + permutation sequence, - // which is much less expensive). - if (Src->isVectorTy() && Opcode == Instruction::Store) - for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i) - Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i); - } + + // Add the cost of each scalar load or store. + Cost += LT.first*(SrcBytes/Alignment-1); + + // For a vector type, there is also scalarization overhead (only for + // stores, loads are expanded using the vector-load + permutation sequence, + // which is much less expensive). + if (Src->isVectorTy() && Opcode == Instruction::Store) + for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i) + Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i); + + return Cost; +} + +int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace) { + assert(isa<VectorType>(VecTy) && + "Expect a vector type for interleaved memory op"); + + // Legalize the type. + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy); + + // Firstly, the cost of load/store operation. + int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace); + + // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations + // (at least in the sense that there need only be one non-loop-invariant + // instruction). For each result vector, we need one shuffle per incoming + // vector (except that the first shuffle can take two incoming vectors + // because it does not need to take itself). 
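As a back-of-the-envelope reading of the unaligned costing above (a simplified standalone mirror with assumed parameter names, not the LLVM code): an access that is not naturally aligned is charged the base cost plus LT.first*(SrcBytes/Alignment - 1) for the extra pieces, and an unaligned vector store additionally pays one extract per element; the interleaved case just below then layers Factor*(LT.first - 1) shuffles on top of the plain memory-op cost.

    // Illustrative mirror of the unaligned memory-op cost rule; numLegalizedParts
    // stands in for LT.first and perExtractCost for the ExtractElement cost.
    int unalignedMemOpCost(int baseCost, int numLegalizedParts,
                           unsigned srcBytes, unsigned alignment,
                           bool isVectorStore, unsigned numElements,
                           int perExtractCost) {
      if (!srcBytes || !alignment || alignment >= srcBytes)
        return baseCost;                                  // aligned: no penalty
      int cost = baseCost +
                 numLegalizedParts * static_cast<int>(srcBytes / alignment - 1);
      if (isVectorStore)                                  // scalarization overhead
        cost += static_cast<int>(numElements) * perExtractCost;
      return cost;
    }
    // e.g. a v4i32 store (16 bytes) at 4-byte alignment with one legal part:
    // baseCost + 1*(16/4 - 1) = baseCost + 3, plus 4 element extracts.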
+ Cost += Factor*(LT.first-1); return Cost; } diff --git a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h index 368bef9..04c1b02 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -37,7 +37,7 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> { const PPCTargetLowering *getTLI() const { return TLI; } public: - explicit PPCTTIImpl(const PPCTargetMachine *TM, Function &F) + explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -52,12 +52,11 @@ public: /// @{ using BaseT::getIntImmCost; - unsigned getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(const APInt &Imm, Type *Ty); - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); @@ -68,22 +67,27 @@ public: /// @{ bool enableAggressiveInterleaving(bool LoopHasReductions); + bool enableInterleavedAccessVectorization(); unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef<unsigned> Indices, + unsigned Alignment, + unsigned AddressSpace); /// @} }; diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp index 5e3ae2a..782583c 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXCopy.cpp @@ -77,6 +77,14 @@ namespace { return IsRegInClass(Reg, &PPC::F8RCRegClass, MRI); } + bool IsVSFReg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::VSFRCRegClass, MRI); + } + + bool IsVSSReg(unsigned Reg, MachineRegisterInfo &MRI) { + return IsRegInClass(Reg, &PPC::VSSRCRegClass, MRI); + } + protected: bool processBlock(MachineBasicBlock &MBB) { bool Changed = false; @@ -100,7 +108,9 @@ protected: 
IsVRReg(SrcMO.getReg(), MRI) ? &PPC::VSHRCRegClass : &PPC::VSLRCRegClass; assert((IsF8Reg(SrcMO.getReg(), MRI) || - IsVRReg(SrcMO.getReg(), MRI)) && + IsVRReg(SrcMO.getReg(), MRI) || + IsVSSReg(SrcMO.getReg(), MRI) || + IsVSFReg(SrcMO.getReg(), MRI)) && "Unknown source for a VSX copy"); unsigned NewVReg = MRI.createVirtualRegister(SrcRC); @@ -123,6 +133,8 @@ protected: IsVRReg(DstMO.getReg(), MRI) ? &PPC::VSHRCRegClass : &PPC::VSLRCRegClass; assert((IsF8Reg(DstMO.getReg(), MRI) || + IsVSFReg(DstMO.getReg(), MRI) || + IsVSSReg(DstMO.getReg(), MRI) || IsVRReg(DstMO.getReg(), MRI)) && "Unknown destination for a VSX copy"); diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp index 46b8d13..6b19a2f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp @@ -103,10 +103,10 @@ protected: VNInfo *AddendValNo = LIS->getInterval(MI->getOperand(1).getReg()).Query(FMAIdx).valueIn(); - if (!AddendValNo) { - // This can be null if the register is undef. + + // This can be null if the register is undef. + if (!AddendValNo) continue; - } MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def); @@ -186,18 +186,17 @@ protected: if (!KilledProdOp) continue; - // If the addend copy is used only by this MI, then the addend source - // register is likely not live here. This could be fixed (based on the - // legality checks above, the live range for the addend source register - // could be extended), but it seems likely that such a trivial copy can - // be coalesced away later, and thus is not worth the effort. - if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) && + // If the addend copy is used only by this MI, then the addend source + // register is likely not live here. This could be fixed (based on the + // legality checks above, the live range for the addend source register + // could be extended), but it seems likely that such a trivial copy can + // be coalesced away later, and thus is not worth the effort. + if (TargetRegisterInfo::isVirtualRegister(AddendSrcReg) && !LIS->getInterval(AddendSrcReg).liveAt(FMAIdx)) continue; // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3. - unsigned AddReg = AddendMI->getOperand(1).getReg(); unsigned KilledProdReg = MI->getOperand(KilledProdOp).getReg(); unsigned OtherProdReg = MI->getOperand(OtherProdOp).getReg(); @@ -221,6 +220,14 @@ protected: if (OldFMAReg == KilledProdReg) continue; + // If there isn't a class that fits, we can't perform the transform. + // This is needed for correctness with a mixture of VSX and Altivec + // instructions to make sure that a low VSX register is not assigned to + // the Altivec instruction. 
+ if (!MRI.constrainRegClass(KilledProdReg, + MRI.getRegClass(OldFMAReg))) + continue; + assert(OldFMAReg == AddendMI->getOperand(0).getReg() && "Addend copy not tied to old FMA output!"); @@ -228,7 +235,7 @@ protected: MI->getOperand(0).setReg(KilledProdReg); MI->getOperand(1).setReg(KilledProdReg); - MI->getOperand(3).setReg(AddReg); + MI->getOperand(3).setReg(AddendSrcReg); MI->getOperand(2).setReg(OtherProdReg); MI->getOperand(0).setSubReg(KilledProdSubReg); @@ -263,8 +270,7 @@ protected: if (UseMI == AddendMI) continue; - UseMO.setReg(KilledProdReg); - UseMO.setSubReg(KilledProdSubReg); + UseMO.substVirtReg(KilledProdReg, KilledProdSubReg, *TRI); } // Extend the live intervals of the killed product operand to hold the @@ -286,6 +292,20 @@ protected: } DEBUG(dbgs() << " extended: " << NewFMAInt << '\n'); + // Extend the live interval of the addend source (it might end at the + // copy to be removed, or somewhere in between there and here). This + // is necessary only if it is a physical register. + if (!TargetRegisterInfo::isVirtualRegister(AddendSrcReg)) + for (MCRegUnitIterator Units(AddendSrcReg, TRI); Units.isValid(); + ++Units) { + unsigned Unit = *Units; + + LiveRange &AddendSrcRange = LIS->getRegUnit(Unit); + AddendSrcRange.extendInBlock(LIS->getMBBStartIdx(&MBB), + FMAIdx.getRegSlot()); + DEBUG(dbgs() << " extended: " << AddendSrcRange << '\n'); + } + FMAInt.removeValNo(FMAValNo); DEBUG(dbgs() << " trimmed: " << FMAInt << '\n'); @@ -347,7 +367,6 @@ INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE, char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID; char PPCVSXFMAMutate::ID = 0; -FunctionPass* -llvm::createPPCVSXFMAMutatePass() { return new PPCVSXFMAMutate(); } - - +FunctionPass *llvm::createPPCVSXFMAMutatePass() { + return new PPCVSXFMAMutate(); +} diff --git a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index d7132d5..27c540f 100644 --- a/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/contrib/llvm/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -94,7 +94,7 @@ enum SHValues { SH_NOSWAP_ST, SH_SPLAT, SH_XXPERMDI, - SH_COPYSCALAR + SH_COPYWIDEN }; struct PPCVSXSwapRemoval : public MachineFunctionPass { @@ -149,6 +149,11 @@ private: // handling. Return true iff any changes are made. bool removeSwaps(); + // Insert a swap instruction from SrcReg to DstReg at the given + // InsertPoint. + void insertSwap(MachineInstr *MI, MachineBasicBlock::iterator InsertPoint, + unsigned DstReg, unsigned SrcReg); + // Update instructions requiring special handling. void handleSpecialSwappables(int EntryIdx); @@ -159,9 +164,7 @@ private: bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) { if (TargetRegisterInfo::isVirtualRegister(Reg)) return RC->hasSubClassEq(MRI->getRegClass(Reg)); - if (RC->contains(Reg)) - return true; - return false; + return RC->contains(Reg); } // Return true iff the given register is a full vector register. @@ -215,7 +218,7 @@ public: void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) { MF = &MFParm; MRI = &MF->getRegInfo(); - TII = static_cast<const PPCInstrInfo*>(MF->getSubtarget().getInstrInfo()); + TII = MF->getSubtarget<PPCSubtarget>().getInstrInfo(); // An initial vector size of 256 appears to work well in practice. 
// Small/medium functions with vector content tend not to incur a @@ -343,6 +346,15 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { SwapVector[VecIdx].IsLoad = 1; SwapVector[VecIdx].IsSwap = 1; break; + case PPC::LXSDX: + case PPC::LXSSPX: + // A load of a floating-point value into the high-order half of + // a vector register is safe, provided that we introduce a swap + // following the load, which will be done by the SUBREG_TO_REG + // support. So just mark these as safe. + SwapVector[VecIdx].IsLoad = 1; + SwapVector[VecIdx].IsSwappable = 1; + break; case PPC::STVX: // Non-permuting stores are currently unsafe. We can use special // handling for this in the future. By not marking these as @@ -385,7 +397,7 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { else if (isVecReg(MI.getOperand(0).getReg()) && isScalarVecReg(MI.getOperand(2).getReg())) { SwapVector[VecIdx].IsSwappable = 1; - SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYSCALAR; + SwapVector[VecIdx].SpecialHandling = SHValues::SH_COPYWIDEN; } break; } @@ -420,7 +432,14 @@ bool PPCVSXSwapRemoval::gatherVectorInstructions() { case PPC::STVEHX: case PPC::STVEWX: case PPC::STVXL: + // We can handle STXSDX and STXSSPX similarly to LXSDX and LXSSPX, + // by adding special handling for narrowing copies as well as + // widening ones. However, I've experimented with this, and in + // practice we currently do not appear to use STXSDX fed by + // a narrowing copy from a full vector register. Since I can't + // generate any useful test cases, I've left this alone for now. case PPC::STXSDX: + case PPC::STXSSPX: case PPC::VCIPHER: case PPC::VCIPHERLAST: case PPC::VMRGHB: @@ -543,7 +562,8 @@ unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg, } if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) { - SwapVector[VecIdx].MentionsPhysVR = 1; + if (!isScalarVecReg(CopySrcReg)) + SwapVector[VecIdx].MentionsPhysVR = 1; return CopySrcReg; } @@ -629,8 +649,8 @@ void PPCVSXSwapRemoval::recordUnoptimizableWebs() { SwapVector[Repr].WebRejected = 1; DEBUG(dbgs() << - format("Web %d rejected for physreg, partial reg, or not swap[pable]\n", - Repr)); + format("Web %d rejected for physreg, partial reg, or not " + "swap[pable]\n", Repr)); DEBUG(dbgs() << " in " << EntryIdx << ": "); DEBUG(SwapVector[EntryIdx].VSEMI->dump()); DEBUG(dbgs() << "\n"); @@ -743,6 +763,21 @@ void PPCVSXSwapRemoval::markSwapsForRemoval() { } } +// Create an xxswapd instruction and insert it prior to the given point. +// MI is used to determine basic block and debug loc information. +// FIXME: When inserting a swap, we should check whether SrcReg is +// defined by another swap: SrcReg = XXPERMDI Reg, Reg, 2; If so, +// then instead we should generate a copy from Reg to DstReg. +void PPCVSXSwapRemoval::insertSwap(MachineInstr *MI, + MachineBasicBlock::iterator InsertPoint, + unsigned DstReg, unsigned SrcReg) { + BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), + TII->get(PPC::XXPERMDI), DstReg) + .addReg(SrcReg) + .addReg(SrcReg) + .addImm(2); +} + // The identified swap entry requires special handling to allow its // containing computation to be optimized. Perform that handling // here. 
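The insertSwap helper above emits an xxswapd as an xxpermdi of a register with itself using selector 2; in terms of data movement that simply exchanges the two 64-bit halves of the 128-bit VSX register. A minimal standalone model of that effect (struct and function names are illustrative):

    #include <cstdint>
    #include <utility>

    // A 128-bit VSX register modelled as two doublewords.
    struct V128 { uint64_t dw[2]; };

    // xxpermdi vD, vS, vS, 2 behaves as xxswapd vD, vS: the doubleword
    // halves trade places, which is what the inserted swap relies on.
    V128 xxswapd(V128 v) {
      std::swap(v.dw[0], v.dw[1]);
      return v;
    }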
@@ -752,8 +787,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { switch (SwapVector[EntryIdx].SpecialHandling) { default: - assert(false && "Unexpected special handling type"); - break; + llvm_unreachable("Unexpected special handling type"); // For splats based on an index into a vector, add N/2 modulo N // to the index, where N is the number of vector elements. @@ -766,7 +800,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { switch (MI->getOpcode()) { default: - assert(false && "Unexpected splat opcode"); + llvm_unreachable("Unexpected splat opcode"); case PPC::VSPLTB: NElts = 16; break; case PPC::VSPLTH: NElts = 8; break; case PPC::VSPLTW: NElts = 4; break; @@ -811,7 +845,7 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { // For a copy from a scalar floating-point register to a vector // register, removing swaps will leave the copied value in the // wrong lane. Insert a swap following the copy to fix this. - case SHValues::SH_COPYSCALAR: { + case SHValues::SH_COPYWIDEN: { MachineInstr *MI = SwapVector[EntryIdx].VSEMI; DEBUG(dbgs() << "Changing SUBREG_TO_REG: "); @@ -825,14 +859,13 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { DEBUG(dbgs() << " Into: "); DEBUG(MI->dump()); - MachineBasicBlock::iterator InsertPoint = MI->getNextNode(); + auto InsertPoint = ++MachineBasicBlock::iterator(MI); // Note that an XXPERMDI requires a VSRC, so if the SUBREG_TO_REG // is copying to a VRRC, we need to be careful to avoid a register // assignment problem. In this case we must copy from VRRC to VSRC // prior to the swap, and from VSRC to VRRC following the swap. // Coalescing will usually remove all this mess. - if (DstRC == &PPC::VRRCRegClass) { unsigned VSRCTmp1 = MRI->createVirtualRegister(&PPC::VSRCRegClass); unsigned VSRCTmp2 = MRI->createVirtualRegister(&PPC::VSRCRegClass); @@ -840,29 +873,19 @@ void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) { BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), TII->get(PPC::COPY), VSRCTmp1) .addReg(NewVReg); - DEBUG(MI->getNextNode()->dump()); + DEBUG(std::prev(InsertPoint)->dump()); - BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), - TII->get(PPC::XXPERMDI), VSRCTmp2) - .addReg(VSRCTmp1) - .addReg(VSRCTmp1) - .addImm(2); - DEBUG(MI->getNextNode()->getNextNode()->dump()); + insertSwap(MI, InsertPoint, VSRCTmp2, VSRCTmp1); + DEBUG(std::prev(InsertPoint)->dump()); BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), TII->get(PPC::COPY), DstReg) .addReg(VSRCTmp2); - DEBUG(MI->getNextNode()->getNextNode()->getNextNode()->dump()); + DEBUG(std::prev(InsertPoint)->dump()); } else { - - BuildMI(*MI->getParent(), InsertPoint, MI->getDebugLoc(), - TII->get(PPC::XXPERMDI), DstReg) - .addReg(NewVReg) - .addReg(NewVReg) - .addImm(2); - - DEBUG(MI->getNextNode()->dump()); + insertSwap(MI, InsertPoint, DstReg, NewVReg); + DEBUG(std::prev(InsertPoint)->dump()); } break; } @@ -947,8 +970,8 @@ void PPCVSXSwapRemoval::dumpSwapVector() { case SH_XXPERMDI: DEBUG(dbgs() << "special:xxpermdi "); break; - case SH_COPYSCALAR: - DEBUG(dbgs() << "special:copyscalar "); + case SH_COPYWIDEN: + DEBUG(dbgs() << "special:copywiden "); break; } } diff --git a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 1c4e486..a552747 100644 --- a/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/contrib/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -14,6 +14,7 @@ #include "llvm/MC/MCInst.h" 
#include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" @@ -34,7 +35,6 @@ namespace { class SparcOperand; class SparcAsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; MCAsmParser &Parser; /// @name Auto-generated Match Functions @@ -69,6 +69,10 @@ class SparcAsmParser : public MCTargetAsmParser { OperandMatchResultTy parseBranchModifiers(OperandVector &Operands); + // Helper function for dealing with %lo / %hi in PIC mode. + const SparcMCExpr *adjustPICRelocation(SparcMCExpr::VariantKind VK, + const MCExpr *subExpr); + // returns true if Tok is matched to a register and returns register in RegNo. bool matchRegisterName(const AsmToken &Tok, unsigned &RegNo, unsigned &RegKind); @@ -77,24 +81,24 @@ class SparcAsmParser : public MCTargetAsmParser { bool parseDirectiveWord(unsigned Size, SMLoc L); bool is64Bit() const { - return STI.getTargetTriple().getArch() == Triple::sparcv9; + return getSTI().getTargetTriple().getArch() == Triple::sparcv9; } void expandSET(MCInst &Inst, SMLoc IDLoc, SmallVectorImpl<MCInst> &Instructions); public: - SparcAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, + SparcAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), Parser(parser) { + : MCTargetAsmParser(Options, sti), Parser(parser) { // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } }; - static unsigned IntRegs[32] = { + static const MCPhysReg IntRegs[32] = { Sparc::G0, Sparc::G1, Sparc::G2, Sparc::G3, Sparc::G4, Sparc::G5, Sparc::G6, Sparc::G7, Sparc::O0, Sparc::O1, Sparc::O2, Sparc::O3, @@ -104,7 +108,7 @@ public: Sparc::I0, Sparc::I1, Sparc::I2, Sparc::I3, Sparc::I4, Sparc::I5, Sparc::I6, Sparc::I7 }; - static unsigned FloatRegs[32] = { + static const MCPhysReg FloatRegs[32] = { Sparc::F0, Sparc::F1, Sparc::F2, Sparc::F3, Sparc::F4, Sparc::F5, Sparc::F6, Sparc::F7, Sparc::F8, Sparc::F9, Sparc::F10, Sparc::F11, @@ -114,7 +118,7 @@ public: Sparc::F24, Sparc::F25, Sparc::F26, Sparc::F27, Sparc::F28, Sparc::F29, Sparc::F30, Sparc::F31 }; - static unsigned DoubleRegs[32] = { + static const MCPhysReg DoubleRegs[32] = { Sparc::D0, Sparc::D1, Sparc::D2, Sparc::D3, Sparc::D4, Sparc::D5, Sparc::D6, Sparc::D7, Sparc::D8, Sparc::D7, Sparc::D8, Sparc::D9, @@ -124,13 +128,13 @@ public: Sparc::D24, Sparc::D25, Sparc::D26, Sparc::D27, Sparc::D28, Sparc::D29, Sparc::D30, Sparc::D31 }; - static unsigned QuadFPRegs[32] = { + static const MCPhysReg QuadFPRegs[32] = { Sparc::Q0, Sparc::Q1, Sparc::Q2, Sparc::Q3, Sparc::Q4, Sparc::Q5, Sparc::Q6, Sparc::Q7, Sparc::Q8, Sparc::Q9, Sparc::Q10, Sparc::Q11, Sparc::Q12, Sparc::Q13, Sparc::Q14, Sparc::Q15 }; - static unsigned ASRRegs[32] = { + static const MCPhysReg ASRRegs[32] = { SP::Y, SP::ASR1, SP::ASR2, SP::ASR3, SP::ASR4, SP::ASR5, SP::ASR6, SP::ASR7, SP::ASR8, SP::ASR9, SP::ASR10, SP::ASR11, @@ -140,6 +144,12 @@ public: SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27, SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31}; + static const MCPhysReg IntPairRegs[] = { + Sparc::G0_G1, Sparc::G2_G3, Sparc::G4_G5, Sparc::G6_G7, + Sparc::O0_O1, Sparc::O2_O3, Sparc::O4_O5, Sparc::O6_O7, + Sparc::L0_L1, Sparc::L2_L3, Sparc::L4_L5, Sparc::L6_L7, + Sparc::I0_I1, Sparc::I2_I3, 
Sparc::I4_I5, Sparc::I6_I7}; + /// SparcOperand - Instances of this class represent a parsed Sparc machine /// instruction. class SparcOperand : public MCParsedAsmOperand { @@ -147,6 +157,7 @@ public: enum RegisterKind { rk_None, rk_IntReg, + rk_IntPairReg, rk_FloatReg, rk_DoubleReg, rk_QuadReg, @@ -200,6 +211,10 @@ public: bool isMEMrr() const { return Kind == k_MemoryReg; } bool isMEMri() const { return Kind == k_MemoryImm; } + bool isIntReg() const { + return (Kind == k_Register && Reg.Kind == rk_IntReg); + } + bool isFloatReg() const { return (Kind == k_Register && Reg.Kind == rk_FloatReg); } @@ -330,6 +345,25 @@ public: return Op; } + static bool MorphToIntPairReg(SparcOperand &Op) { + unsigned Reg = Op.getReg(); + assert(Op.Reg.Kind == rk_IntReg); + unsigned regIdx = 32; + if (Reg >= Sparc::G0 && Reg <= Sparc::G7) + regIdx = Reg - Sparc::G0; + else if (Reg >= Sparc::O0 && Reg <= Sparc::O7) + regIdx = Reg - Sparc::O0 + 8; + else if (Reg >= Sparc::L0 && Reg <= Sparc::L7) + regIdx = Reg - Sparc::L0 + 16; + else if (Reg >= Sparc::I0 && Reg <= Sparc::I7) + regIdx = Reg - Sparc::I0 + 24; + if (regIdx % 2 || regIdx > 31) + return false; + Op.Reg.RegNum = IntPairRegs[regIdx / 2]; + Op.Reg.Kind = rk_IntPairReg; + return true; + } + static bool MorphToDoubleReg(SparcOperand &Op) { unsigned Reg = Op.getReg(); assert(Op.Reg.Kind == rk_FloatReg); @@ -407,7 +441,22 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc, // the imm operand can be either an expression or an immediate. bool IsImm = Inst.getOperand(1).isImm(); - uint64_t ImmValue = IsImm ? MCValOp.getImm() : 0; + int64_t RawImmValue = IsImm ? MCValOp.getImm() : 0; + + // Allow either a signed or unsigned 32-bit immediate. + if (RawImmValue < -2147483648LL || RawImmValue > 4294967295LL) { + Error(IDLoc, "set: argument must be between -2147483648 and 4294967295"); + return; + } + + // If the value was expressed as a large unsigned number, that's ok. + // We want to see if it "looks like" a small signed number. + int32_t ImmValue = RawImmValue; + // For 'set' you can't use 'or' with a negative operand on V9 because + // that would splat the sign bit across the upper half of the destination + // register, whereas 'set' is defined to zero the high 32 bits. + bool IsEffectivelyImm13 = + IsImm && ((is64Bit() ? 0 : -4096) <= ImmValue && ImmValue < 4096); const MCExpr *ValExpr; if (IsImm) ValExpr = MCConstantExpr::create(ImmValue, getContext()); @@ -416,10 +465,12 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc, MCOperand PrevReg = MCOperand::createReg(Sparc::G0); - if (!IsImm || (ImmValue & ~0x1fff)) { + // If not just a signed imm13 value, then either we use a 'sethi' with a + // following 'or', or a 'sethi' by itself if there are no more 1 bits. + // In either case, start with the 'sethi'. + if (!IsEffectivelyImm13) { MCInst TmpInst; - const MCExpr *Expr = - SparcMCExpr::create(SparcMCExpr::VK_Sparc_HI, ValExpr, getContext()); + const MCExpr *Expr = adjustPICRelocation(SparcMCExpr::VK_Sparc_HI, ValExpr); TmpInst.setLoc(IDLoc); TmpInst.setOpcode(SP::SETHIi); TmpInst.addOperand(MCRegOp); @@ -428,10 +479,23 @@ void SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc, PrevReg = MCRegOp; } - if (!IsImm || ((ImmValue & 0x1fff) != 0 || ImmValue == 0)) { + // The low bits require touching in 3 cases: + // * A non-immediate value will always require both instructions. + // * An effectively imm13 value needs only an 'or' instruction. 
+ // * Otherwise, an immediate that is not effectively imm13 requires the + // 'or' only if bits remain after clearing the 22 bits that 'sethi' set. + // If the low bits are known zeros, there's nothing to do. + // In the second case, and only in that case, must we NOT clear + // bits of the immediate value via the %lo() assembler function. + // Note also, the 'or' instruction doesn't mind a large value in the case + // where the operand to 'set' was 0xFFFFFzzz - it does exactly what you mean. + if (!IsImm || IsEffectivelyImm13 || (ImmValue & 0x3ff)) { MCInst TmpInst; - const MCExpr *Expr = - SparcMCExpr::create(SparcMCExpr::VK_Sparc_LO, ValExpr, getContext()); + const MCExpr *Expr; + if (IsEffectivelyImm13) + Expr = ValExpr; + else + Expr = adjustPICRelocation(SparcMCExpr::VK_Sparc_LO, ValExpr); TmpInst.setLoc(IDLoc); TmpInst.setOpcode(SP::ORri); TmpInst.addOperand(MCRegOp); @@ -463,7 +527,7 @@ bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, } for (const MCInst &I : Instructions) { - Out.EmitInstruction(I, STI); + Out.EmitInstruction(I, getSTI()); } return false; } @@ -742,6 +806,9 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op, case Sparc::PSR: Op = SparcOperand::CreateToken("%psr", S); break; + case Sparc::FSR: + Op = SparcOperand::CreateToken("%fsr", S); + break; case Sparc::WIM: Op = SparcOperand::CreateToken("%wim", S); break; @@ -766,6 +833,7 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op, case AsmToken::Minus: case AsmToken::Integer: case AsmToken::LParen: + case AsmToken::Dot: if (!getParser().parseExpression(EVal, E)) Op = SparcOperand::CreateImm(EVal, S, E); break; @@ -848,6 +916,13 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, return true; } + // %fprs is an alias of %asr6. 
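To make the expansion rules for the 'set' pseudo above concrete, here is a small standalone sketch (an assumed helper, not the parser code) that prints the sequence the comments describe: an effectively-imm13 value needs only an or from %g0, anything else starts with a sethi and adds an or with %lo(...) only when low bits remain.

    #include <cstdint>
    #include <cstdio>

    // Illustrative mirror of the 'set imm, rd' expansion discussed above.
    void expandSet(int64_t imm, bool is64Bit) {
      // Signed imm13 on V8; on V9 a negative operand must not go through 'or',
      // since that would smear the sign bit over the upper 32 bits.
      bool effectivelyImm13 = (is64Bit ? 0 : -4096) <= imm && imm < 4096;
      unsigned u = static_cast<unsigned>(imm);
      if (!effectivelyImm13)
        std::printf("sethi %%hi(0x%x), %%rd\n", u);       // top 22 bits
      if (effectivelyImm13)
        std::printf("or %%g0, %d, %%rd\n", static_cast<int>(imm));
      else if (u & 0x3ff)
        std::printf("or %%rd, %%lo(0x%x), %%rd\n", u);    // low 10 bits
    }
    // expandSet(0x12345678, false):  sethi %hi(0x12345678), %rd
    //                                or %rd, %lo(0x12345678), %rd
    // expandSet(-1, false):          or %g0, -1, %rd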
+ if (name.equals("fprs")) { + RegNo = ASRRegs[6]; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("icc")) { RegNo = Sparc::ICC; RegKind = SparcOperand::rk_Special; @@ -860,6 +935,12 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, return true; } + if (name.equals("fsr")) { + RegNo = Sparc::FSR; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("wim")) { RegNo = Sparc::WIM; RegKind = SparcOperand::rk_Special; @@ -943,6 +1024,82 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, RegKind = SparcOperand::rk_IntReg; return true; } + + if (name.equals("tpc")) { + RegNo = Sparc::TPC; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tnpc")) { + RegNo = Sparc::TNPC; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tstate")) { + RegNo = Sparc::TSTATE; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tt")) { + RegNo = Sparc::TT; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tick")) { + RegNo = Sparc::TICK; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tba")) { + RegNo = Sparc::TBA; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("pstate")) { + RegNo = Sparc::PSTATE; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("tl")) { + RegNo = Sparc::TL; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("pil")) { + RegNo = Sparc::PIL; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("cwp")) { + RegNo = Sparc::CWP; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("cansave")) { + RegNo = Sparc::CANSAVE; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("canrestore")) { + RegNo = Sparc::CANRESTORE; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("cleanwin")) { + RegNo = Sparc::CLEANWIN; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("otherwin")) { + RegNo = Sparc::OTHERWIN; + RegKind = SparcOperand::rk_Special; + return true; + } + if (name.equals("wstate")) { + RegNo = Sparc::WSTATE; + RegKind = SparcOperand::rk_Special; + return true; + } } return false; } @@ -975,6 +1132,32 @@ static bool hasGOTReference(const MCExpr *Expr) { return false; } +const SparcMCExpr * +SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK, + const MCExpr *subExpr) +{ + // When in PIC mode, "%lo(...)" and "%hi(...)" behave differently. + // If the expression refers contains _GLOBAL_OFFSETE_TABLE, it is + // actually a %pc10 or %pc22 relocation. Otherwise, they are interpreted + // as %got10 or %got22 relocation. + + if (getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_) { + switch(VK) { + default: break; + case SparcMCExpr::VK_Sparc_LO: + VK = (hasGOTReference(subExpr) ? SparcMCExpr::VK_Sparc_PC10 + : SparcMCExpr::VK_Sparc_GOT10); + break; + case SparcMCExpr::VK_Sparc_HI: + VK = (hasGOTReference(subExpr) ? 
SparcMCExpr::VK_Sparc_PC22 + : SparcMCExpr::VK_Sparc_GOT22); + break; + } + } + + return SparcMCExpr::create(VK, subExpr, getContext()); +} + bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal, SMLoc &EndLoc) { @@ -998,30 +1181,7 @@ bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal, if (Parser.parseParenExpression(subExpr, EndLoc)) return false; - bool isPIC = getContext().getObjectFileInfo()->getRelocM() == Reloc::PIC_; - - // Ugly: if a sparc assembly expression says "%hi(...)" but the - // expression within contains _GLOBAL_OFFSET_TABLE_, it REALLY means - // %pc22. Same with %lo -> %pc10. Worse, if it doesn't contain that, - // the meaning depends on whether the assembler was invoked with - // -KPIC or not: if so, it really means %got22/%got10; if not, it - // actually means what it said! Sigh, historical mistakes... - - switch(VK) { - default: break; - case SparcMCExpr::VK_Sparc_LO: - VK = (hasGOTReference(subExpr) - ? SparcMCExpr::VK_Sparc_PC10 - : (isPIC ? SparcMCExpr::VK_Sparc_GOT10 : VK)); - break; - case SparcMCExpr::VK_Sparc_HI: - VK = (hasGOTReference(subExpr) - ? SparcMCExpr::VK_Sparc_PC22 - : (isPIC ? SparcMCExpr::VK_Sparc_GOT22 : VK)); - break; - } - - EVal = SparcMCExpr::create(VK, subExpr, getContext()); + EVal = adjustPICRelocation(VK, subExpr); return true; } @@ -1051,5 +1211,9 @@ unsigned SparcAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp, break; } } + if (Op.isIntReg() && Kind == MCK_IntPair) { + if (SparcOperand::MorphToIntPairReg(Op)) + return MCTargetAsmParser::Match_Success; + } return Match_InvalidOperand; } diff --git a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp index 38bff44..c689b7f 100644 --- a/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp +++ b/contrib/llvm/lib/Target/Sparc/DelaySlotFiller.cpp @@ -122,6 +122,8 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) { continue; } + // TODO: If we ever want to support v7, this needs to be extended + // to cover all floating point operations. 
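The %hi/%lo handling that adjustPICRelocation centralizes above amounts to a small decision table; a standalone paraphrase under assumed enum names:

    // Illustrative summary of the PIC-mode adjustment of %hi/%lo.
    enum class SparcVK { Hi, Lo, Pc22, Pc10, Got22, Got10 };

    SparcVK adjustPIC(SparcVK vk, bool isPIC, bool referencesGOT) {
      if (!isPIC)
        return vk;                       // non-PIC: %hi/%lo are taken literally
      if (vk == SparcVK::Hi)
        return referencesGOT ? SparcVK::Pc22 : SparcVK::Got22;
      if (vk == SparcVK::Lo)
        return referencesGOT ? SparcVK::Pc10 : SparcVK::Got10;
      return vk;                         // other modifiers are left unchanged
    }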
if (!Subtarget->isV9() && (MI->getOpcode() == SP::FCMPS || MI->getOpcode() == SP::FCMPD || MI->getOpcode() == SP::FCMPQ)) { diff --git a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp index 3e56b9e..51751ec 100644 --- a/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp +++ b/contrib/llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp @@ -117,6 +117,19 @@ static const unsigned ASRRegDecoderTable[] = { SP::ASR24, SP::ASR25, SP::ASR26, SP::ASR27, SP::ASR28, SP::ASR29, SP::ASR30, SP::ASR31}; +static const unsigned PRRegDecoderTable[] = { + SP::TPC, SP::TNPC, SP::TSTATE, SP::TT, SP::TICK, SP::TBA, SP::PSTATE, + SP::TL, SP::PIL, SP::CWP, SP::CANSAVE, SP::CANRESTORE, SP::CLEANWIN, + SP::OTHERWIN, SP::WSTATE +}; + +static const uint16_t IntPairDecoderTable[] = { + SP::G0_G1, SP::G2_G3, SP::G4_G5, SP::G6_G7, + SP::O0_O1, SP::O2_O3, SP::O4_O5, SP::O6_O7, + SP::L0_L1, SP::L2_L3, SP::L4_L5, SP::L6_L7, + SP::I0_I1, SP::I2_I3, SP::I4_I5, SP::I6_I7, +}; + static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, @@ -196,9 +209,34 @@ static DecodeStatus DecodeASRRegsRegisterClass(MCInst &Inst, unsigned RegNo, return MCDisassembler::Success; } +static DecodeStatus DecodePRRegsRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const void *Decoder) { + if (RegNo >= array_lengthof(PRRegDecoderTable)) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::createReg(PRRegDecoderTable[RegNo])); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeIntPairRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + if (RegNo > 31) + return MCDisassembler::Fail; + + if ((RegNo & 1)) + S = MCDisassembler::SoftFail; + + unsigned RegisterPair = IntPairDecoderTable[RegNo/2]; + Inst.addOperand(MCOperand::createReg(RegisterPair)); + return S; +} static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address, + const void *Decoder); static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeLoadDFP(MCInst &Inst, unsigned insn, uint64_t Address, @@ -207,6 +245,8 @@ static DecodeStatus DecodeLoadQFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeStoreIntPair(MCInst &Inst, unsigned insn, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeStoreDFP(MCInst &Inst, unsigned insn, @@ -326,6 +366,12 @@ static DecodeStatus DecodeLoadInt(MCInst &Inst, unsigned insn, uint64_t Address, DecodeIntRegsRegisterClass); } +static DecodeStatus DecodeLoadIntPair(MCInst &Inst, unsigned insn, uint64_t Address, + const void *Decoder) { + return DecodeMem(Inst, insn, Address, Decoder, true, + DecodeIntPairRegisterClass); +} + static DecodeStatus DecodeLoadFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, true, @@ -350,6 +396,12 @@ static DecodeStatus DecodeStoreInt(MCInst &Inst, unsigned insn, DecodeIntRegsRegisterClass); } +static DecodeStatus DecodeStoreIntPair(MCInst &Inst, 
unsigned insn, + uint64_t Address, const void *Decoder) { + return DecodeMem(Inst, insn, Address, Decoder, false, + DecodeIntPairRegisterClass); +} + static DecodeStatus DecodeStoreFP(MCInst &Inst, unsigned insn, uint64_t Address, const void *Decoder) { return DecodeMem(Inst, insn, Address, Decoder, false, diff --git a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h index 0b01b88..6f06d1d 100644 --- a/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h +++ b/contrib/llvm/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h @@ -15,12 +15,9 @@ #define LLVM_LIB_TARGET_SPARC_INSTPRINTER_SPARCINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class SparcInstPrinter : public MCInstPrinter { public: SparcInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h index 12386f1..ad44122 100644 --- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h +++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h @@ -21,6 +21,7 @@ class Triple; class SparcELFMCAsmInfo : public MCAsmInfoELF { void anchor() override; + public: explicit SparcELFMCAsmInfo(const Triple &TheTriple); const MCExpr* diff --git a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h index d08ad86..13f0819 100644 --- a/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h +++ b/contrib/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h @@ -90,8 +90,8 @@ public: const MCAsmLayout *Layout, const MCFixup *Fixup) const override; void visitUsedExpr(MCStreamer &Streamer) const override; - MCSection *findAssociatedSection() const override { - return getSubExpr()->findAssociatedSection(); + MCFragment *findAssociatedFragment() const override { + return getSubExpr()->findAssociatedFragment(); } void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override; diff --git a/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp index c5f046b..e3b0f52 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp @@ -267,11 +267,11 @@ void SparcAsmPrinter::EmitInstruction(const MachineInstr *MI) LowerGETPCXAndEmitMCInsts(MI, getSubtargetInfo()); return; } - MachineBasicBlock::const_instr_iterator I = MI; + MachineBasicBlock::const_instr_iterator I = MI->getIterator(); MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end(); do { MCInst TmpInst; - LowerSparcMachineInstrToMCInst(I, TmpInst, *this); + LowerSparcMachineInstrToMCInst(&*I, TmpInst, *this); EmitToStreamer(*OutStreamer, TmpInst); } while ((++I != E) && I->isInsideBundle()); // Delay slot check. 
} @@ -296,7 +296,7 @@ void SparcAsmPrinter::EmitFunctionBodyStart() { void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum, raw_ostream &O) { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const MachineOperand &MO = MI->getOperand (opNum); SparcMCExpr::VariantKind TF = (SparcMCExpr::VariantKind) MO.getTargetFlags(); @@ -373,7 +373,7 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum, O << MO.getSymbolName(); break; case MachineOperand::MO_ConstantPoolIndex: - O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_" + O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_" << MO.getIndex(); break; default: diff --git a/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td index dfaaabf..0aa29d1 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td +++ b/contrib/llvm/lib/Target/Sparc/SparcCallingConv.td @@ -21,7 +21,11 @@ def CC_Sparc32 : CallingConv<[ // i32 f32 arguments get passed in integer registers if there is space. CCIfType<[i32, f32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>, // f64 arguments are split and passed through registers or through stack. - CCIfType<[f64], CCCustom<"CC_Sparc_Assign_f64">>, + CCIfType<[f64], CCCustom<"CC_Sparc_Assign_Split_64">>, + // As are v2i32 arguments (this would be the default behavior for + // v2i32 if it wasn't allocated to the IntPair register-class) + CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Split_64">>, + // Alternatively, they are assigned to the stack in 4-byte aligned units. CCAssignToStack<4, 4> @@ -30,7 +34,8 @@ def CC_Sparc32 : CallingConv<[ def RetCC_Sparc32 : CallingConv<[ CCIfType<[i32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>, CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3]>>, - CCIfType<[f64], CCAssignToReg<[D0, D1]>> + CCIfType<[f64], CCAssignToReg<[D0, D1]>>, + CCIfType<[v2i32], CCCustom<"CC_Sparc_Assign_Ret_Split_64">> ]>; diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp index c0279da..39b5e80 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.cpp @@ -44,7 +44,7 @@ void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF, unsigned ADDrr, unsigned ADDri) const { - DebugLoc dl = (MBBI != MBB.end()) ? MBBI->getDebugLoc() : DebugLoc(); + DebugLoc dl; const SparcInstrInfo &TII = *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo()); @@ -90,8 +90,23 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, MachineFrameInfo *MFI = MF.getFrameInfo(); const SparcInstrInfo &TII = *static_cast<const SparcInstrInfo *>(MF.getSubtarget().getInstrInfo()); + const SparcRegisterInfo &RegInfo = + *static_cast<const SparcRegisterInfo *>(MF.getSubtarget().getRegisterInfo()); MachineBasicBlock::iterator MBBI = MBB.begin(); - DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc dl; + bool NeedsStackRealignment = RegInfo.needsStackRealignment(MF); + + // FIXME: unfortunately, returning false from canRealignStack + // actually just causes needsStackRealignment to return false, + // rather than reporting an error, as would be sensible. This is + // poor, but fixing that bogosity is going to be a large project. + // For now, just see if it's lied, and report an error here. 
+ if (!NeedsStackRealignment && MFI->getMaxAlignment() > getStackAlignment()) + report_fatal_error("Function \"" + Twine(MF.getName()) + "\" required " + "stack re-alignment, but LLVM couldn't handle it " + "(probably because it has a dynamic alloca)."); // Get the number of bytes to allocate from the FrameInfo int NumBytes = (int) MFI->getStackSize(); @@ -104,12 +119,43 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, SAVEri = SP::ADDri; SAVErr = SP::ADDrr; } - NumBytes = -MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes); - emitSPAdjustment(MF, MBB, MBBI, NumBytes, SAVErr, SAVEri); + + // The SPARC ABI is a bit odd in that it requires a reserved 92-byte + // (128 in v9) area in the user's stack, starting at %sp. Thus, the + // first part of the stack that can actually be used is located at + // %sp + 92. + // + // We therefore need to add that offset to the total stack size + // after all the stack objects are placed by + // PrologEpilogInserter calculateFrameObjectOffsets. However, since the stack needs to be + // aligned *after* the extra size is added, we need to disable + // calculateFrameObjectOffsets's built-in stack alignment, by having + // targetHandlesStackFrameRounding return true. + + + // Add the extra call frame stack size, if needed. (This is the same + // code as in PrologEpilogInserter, but also gets disabled by + // targetHandlesStackFrameRounding) + if (MFI->adjustsStack() && hasReservedCallFrame(MF)) + NumBytes += MFI->getMaxCallFrameSize(); + + // Adds the SPARC subtarget-specific spill area to the stack + // size. Also ensures target-required alignment. + NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes); + + // Finally, ensure that the size is sufficiently aligned for the + // data on the stack. + if (MFI->getMaxAlignment() > 0) { + NumBytes = RoundUpToAlignment(NumBytes, MFI->getMaxAlignment()); + } + + // Update stack size with corrected value. + MFI->setStackSize(NumBytes); + + emitSPAdjustment(MF, MBB, MBBI, -NumBytes, SAVErr, SAVEri); MachineModuleInfo &MMI = MF.getMMI(); - const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); - unsigned regFP = MRI->getDwarfRegNum(SP::I6, true); + unsigned regFP = RegInfo.getDwarfRegNum(SP::I6, true); // Emit ".cfi_def_cfa_register 30". unsigned CFIIndex = @@ -122,13 +168,19 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); - unsigned regInRA = MRI->getDwarfRegNum(SP::I7, true); - unsigned regOutRA = MRI->getDwarfRegNum(SP::O7, true); + unsigned regInRA = RegInfo.getDwarfRegNum(SP::I7, true); + unsigned regOutRA = RegInfo.getDwarfRegNum(SP::O7, true); // Emit ".cfi_register 15, 31". CFIIndex = MMI.addFrameInst( MCCFIInstruction::createRegister(nullptr, regOutRA, regInRA)); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); + + if (NeedsStackRealignment) { + // andn %o6, MaxAlign-1, %o6 + int MaxAlign = MFI->getMaxAlignment(); + BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), SP::O6).addReg(SP::O6).addImm(MaxAlign - 1); + } } void SparcFrameLowering:: @@ -167,7 +219,6 @@ void SparcFrameLowering::emitEpilogue(MachineFunction &MF, if (NumBytes == 0) return; - NumBytes = MF.getSubtarget<SparcSubtarget>().getAdjustedFrameSize(NumBytes); emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri); } @@ -180,21 +231,69 @@ bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { // pointer register. 
This is true if the function has variable sized allocas or // if frame pointer elimination is disabled. bool SparcFrameLowering::hasFP(const MachineFunction &MF) const { + const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); return MF.getTarget().Options.DisableFramePointerElim(MF) || - MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); + RegInfo->needsStackRealignment(MF) || + MFI->hasVarSizedObjects() || + MFI->isFrameAddressTaken(); } +int SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { + const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const SparcRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>(); + bool isFixed = MFI->isFixedObjectIndex(FI); + + // Addressable stack objects are accessed using neg. offsets from + // %fp, or positive offsets from %sp. + bool UseFP; + + // Sparc uses FP-based references in general, even when "hasFP" is + // false. That function is rather a misnomer, because %fp is + // actually always available, unless isLeafProc. + if (FuncInfo->isLeafProc()) { + // If there's a leaf proc, all offsets need to be %sp-based, + // because we haven't caused %fp to actually point to our frame. + UseFP = false; + } else if (isFixed) { + // Otherwise, argument access should always use %fp. + UseFP = true; + } else if (RegInfo->needsStackRealignment(MF)) { + // If there is dynamic stack realignment, all local object + // references need to be via %sp, to take account of the + // re-alignment. + UseFP = false; + } else { + // Finally, default to using %fp. + UseFP = true; + } + + int64_t FrameOffset = MF.getFrameInfo()->getObjectOffset(FI) + + Subtarget.getStackPointerBias(); + + if (UseFP) { + FrameReg = RegInfo->getFrameRegister(MF); + return FrameOffset; + } else { + FrameReg = SP::O6; // %sp + return FrameOffset + MF.getFrameInfo()->getStackSize(); + } +} + static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI) { for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) - if (MRI->isPhysRegUsed(reg)) + if (!MRI->reg_nodbg_empty(reg)) return false; for (unsigned reg = SP::L0; reg <= SP::L7; ++reg) - if (MRI->isPhysRegUsed(reg)) + if (!MRI->reg_nodbg_empty(reg)) return false; return true; @@ -206,33 +305,42 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const MachineRegisterInfo &MRI = MF.getRegInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); - return !(MFI->hasCalls() // has calls - || MRI.isPhysRegUsed(SP::L0) // Too many registers needed - || MRI.isPhysRegUsed(SP::O6) // %SP is used - || hasFP(MF)); // need %FP + return !(MFI->hasCalls() // has calls + || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed + || !MRI.reg_nodbg_empty(SP::O6) // %SP is used + || hasFP(MF)); // need %FP } void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const { - MachineRegisterInfo &MRI = MF.getRegInfo(); - // Remap %i[0-7] to %o[0-7]. for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) { - if (!MRI.isPhysRegUsed(reg)) + if (MRI.reg_nodbg_empty(reg)) continue; - unsigned mapped_reg = (reg - SP::I0 + SP::O0); - assert(!MRI.isPhysRegUsed(mapped_reg)); + + unsigned mapped_reg = reg - SP::I0 + SP::O0; + assert(MRI.reg_nodbg_empty(mapped_reg)); // Replace I register with O register. MRI.replaceRegWith(reg, mapped_reg); - // Mark the reg unused. 
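The base-register choice in getFrameIndexReference above reduces to a short decision chain; a standalone paraphrase with assumed parameter names:

    // Illustrative mirror of the %fp-vs-%sp selection described above.
    // Returns true when the frame object should be addressed off %fp.
    bool useFramePointerForIndex(bool isLeafProc, bool isFixedObject,
                                 bool needsStackRealignment) {
      if (isLeafProc)
        return false;  // %fp was never set up; everything is %sp-relative
      if (isFixedObject)
        return true;   // incoming arguments are always reached via %fp
      if (needsStackRealignment)
        return false;  // locals must follow the re-aligned %sp
      return true;     // default: negative offsets from %fp
    }
    // When %sp is chosen, the object offset is further biased by the frame
    // size (plus the stack-pointer bias on 64-bit targets), as above.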
- MRI.setPhysRegUnused(reg); + // Also replace register pair super-registers. + if ((reg - SP::I0) % 2 == 0) { + unsigned preg = (reg - SP::I0) / 2 + SP::I0_I1; + unsigned mapped_preg = preg - SP::I0_I1 + SP::O0_O1; + MRI.replaceRegWith(preg, mapped_preg); + } } // Rewrite MBB's Live-ins. for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); MBB != E; ++MBB) { + for (unsigned reg = SP::I0_I1; reg <= SP::I6_I7; ++reg) { + if (!MBB->isLiveIn(reg)) + continue; + MBB->removeLiveIn(reg); + MBB->addLiveIn(reg - SP::I0_I1 + SP::O0_O1); + } for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) { if (!MBB->isLiveIn(reg)) continue; diff --git a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h index 29fc7b7..cbb4dc0 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h +++ b/contrib/llvm/lib/Target/Sparc/SparcFrameLowering.h @@ -39,6 +39,14 @@ public: void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; + + /// targetHandlesStackFrameRounding - Returns true if the target is + /// responsible for rounding up the stack frame (probably at emitPrologue + /// time). + bool targetHandlesStackFrameRounding() const override { return true; } + private: // Remap input registers to output registers for leaf procedure. void remapRegsForLeafProc(MachineFunction &MF) const; diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp index 340b72e..c4c6416 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "SparcTargetMachine.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Compiler.h" @@ -62,6 +63,7 @@ public: private: SDNode* getGlobalBaseReg(); + SDNode *SelectInlineAsm(SDNode *N); }; } // end anonymous namespace @@ -141,6 +143,181 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) { return true; } + +// Re-assemble i64 arguments split up in SelectionDAGBuilder's +// visitInlineAsm / GetRegistersForValue functions. +// +// Note: This function was copied from, and is essentially identical +// to ARMISelDAGToDAG::SelectInlineAsm. It is very unfortunate that +// such hacking-up is necessary; a rethink of how inline asm operands +// are handled may be in order to make doing this more sane. +// +// TODO: fix inline asm support so I can simply tell it that 'i64' +// inputs to asm need to be allocated to the IntPair register type, +// and have that work. Then, delete this function. +SDNode *SparcDAGToDAGISel::SelectInlineAsm(SDNode *N){ + std::vector<SDValue> AsmNodeOperands; + unsigned Flag, Kind; + bool Changed = false; + unsigned NumOps = N->getNumOperands(); + + // Normally, i64 data is bounded to two arbitrary GPRs for "%r" + // constraint. However, some instructions (e.g. ldd/std) require + // (even/even+1) GPRs. + + // So, here, we check for this case, and mutate the inlineasm to use + // a single IntPair register instead, which guarantees such even/odd + // placement. + + SDLoc dl(N); + SDValue Glue = N->getGluedNode() ? 
N->getOperand(NumOps-1) + : SDValue(nullptr,0); + + SmallVector<bool, 8> OpChanged; + // Glue node will be appended late. + for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) { + SDValue op = N->getOperand(i); + AsmNodeOperands.push_back(op); + + if (i < InlineAsm::Op_FirstOperand) + continue; + + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i))) { + Flag = C->getZExtValue(); + Kind = InlineAsm::getKind(Flag); + } + else + continue; + + // Immediate operands to inline asm in the SelectionDAG are modeled with + // two operands. The first is a constant of value InlineAsm::Kind_Imm, and + // the second is a constant with the value of the immediate. If we get here + // and we have a Kind_Imm, skip the next operand, and continue. + if (Kind == InlineAsm::Kind_Imm) { + SDValue op = N->getOperand(++i); + AsmNodeOperands.push_back(op); + continue; + } + + unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag); + if (NumRegs) + OpChanged.push_back(false); + + unsigned DefIdx = 0; + bool IsTiedToChangedOp = false; + // If it's a use that is tied with a previous def, it has no + // reg class constraint. + if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx)) + IsTiedToChangedOp = OpChanged[DefIdx]; + + if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef + && Kind != InlineAsm::Kind_RegDefEarlyClobber) + continue; + + unsigned RC; + bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC); + if ((!IsTiedToChangedOp && (!HasRC || RC != SP::IntRegsRegClassID)) + || NumRegs != 2) + continue; + + assert((i+2 < NumOps) && "Invalid number of operands in inline asm"); + SDValue V0 = N->getOperand(i+1); + SDValue V1 = N->getOperand(i+2); + unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg(); + unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg(); + SDValue PairedReg; + MachineRegisterInfo &MRI = MF->getRegInfo(); + + if (Kind == InlineAsm::Kind_RegDef || + Kind == InlineAsm::Kind_RegDefEarlyClobber) { + // Replace the two GPRs with 1 GPRPair and copy values from GPRPair to + // the original GPRs. + + unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); + PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32); + SDValue Chain = SDValue(N,0); + + SDNode *GU = N->getGluedUser(); + SDValue RegCopy = CurDAG->getCopyFromReg(Chain, dl, GPVR, MVT::v2i32, + Chain.getValue(1)); + + // Extract values from a GPRPair reg and copy to the original GPR reg. + SDValue Sub0 = CurDAG->getTargetExtractSubreg(SP::sub_even, dl, MVT::i32, + RegCopy); + SDValue Sub1 = CurDAG->getTargetExtractSubreg(SP::sub_odd, dl, MVT::i32, + RegCopy); + SDValue T0 = CurDAG->getCopyToReg(Sub0, dl, Reg0, Sub0, + RegCopy.getValue(1)); + SDValue T1 = CurDAG->getCopyToReg(Sub1, dl, Reg1, Sub1, T0.getValue(1)); + + // Update the original glue user. + std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1); + Ops.push_back(T1.getValue(1)); + CurDAG->UpdateNodeOperands(GU, Ops); + } + else { + // For Kind == InlineAsm::Kind_RegUse, we first copy two GPRs into a + // GPRPair and then pass the GPRPair to the inline asm. + SDValue Chain = AsmNodeOperands[InlineAsm::Op_InputChain]; + + // As REG_SEQ doesn't take RegisterSDNode, we copy them first. 
+ SDValue T0 = CurDAG->getCopyFromReg(Chain, dl, Reg0, MVT::i32, + Chain.getValue(1)); + SDValue T1 = CurDAG->getCopyFromReg(Chain, dl, Reg1, MVT::i32, + T0.getValue(1)); + SDValue Pair = SDValue( + CurDAG->getMachineNode( + TargetOpcode::REG_SEQUENCE, dl, MVT::v2i32, + { + CurDAG->getTargetConstant(SP::IntPairRegClassID, dl, + MVT::i32), + T0, + CurDAG->getTargetConstant(SP::sub_even, dl, MVT::i32), + T1, + CurDAG->getTargetConstant(SP::sub_odd, dl, MVT::i32), + }), + 0); + + // Copy REG_SEQ into a GPRPair-typed VR and replace the original two + // i32 VRs of inline asm with it. + unsigned GPVR = MRI.createVirtualRegister(&SP::IntPairRegClass); + PairedReg = CurDAG->getRegister(GPVR, MVT::v2i32); + Chain = CurDAG->getCopyToReg(T1, dl, GPVR, Pair, T1.getValue(1)); + + AsmNodeOperands[InlineAsm::Op_InputChain] = Chain; + Glue = Chain.getValue(1); + } + + Changed = true; + + if(PairedReg.getNode()) { + OpChanged[OpChanged.size() -1 ] = true; + Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/); + if (IsTiedToChangedOp) + Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx); + else + Flag = InlineAsm::getFlagWordForRegClass(Flag, SP::IntPairRegClassID); + // Replace the current flag. + AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant( + Flag, dl, MVT::i32); + // Add the new register node and skip the original two GPRs. + AsmNodeOperands.push_back(PairedReg); + // Skip the next two GPRs. + i += 2; + } + } + + if (Glue.getNode()) + AsmNodeOperands.push_back(Glue); + if (!Changed) + return nullptr; + + SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N), + CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands); + New->setNodeId(-1); + return New.getNode(); +} + SDNode *SparcDAGToDAGISel::Select(SDNode *N) { SDLoc dl(N); if (N->isMachineOpcode()) { @@ -150,6 +327,12 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) { switch (N->getOpcode()) { default: break; + case ISD::INLINEASM: { + SDNode *ResNode = SelectInlineAsm(N); + if (ResNode) + return ResNode; + break; + } case SPISD::GLOBAL_BASE_REG: return getGlobalBaseReg(); diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 4879d4e..5e70ffe 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -49,9 +49,9 @@ static bool CC_Sparc_Assign_SRet(unsigned &ValNo, MVT &ValVT, return true; } -static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT, - MVT &LocVT, CCValAssign::LocInfo &LocInfo, - ISD::ArgFlagsTy &ArgFlags, CCState &State) +static bool CC_Sparc_Assign_Split_64(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { static const MCPhysReg RegList[] = { SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5 @@ -77,6 +77,29 @@ static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT, return true; } +static bool CC_Sparc_Assign_Ret_Split_64(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) +{ + static const MCPhysReg RegList[] = { + SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5 + }; + + // Try to get first reg. + if (unsigned Reg = State.AllocateReg(RegList)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else + return false; + + // Try to get second reg. 
+ if (unsigned Reg = State.AllocateReg(RegList)) + State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + else + return false; + + return true; +} + // Allocate a full-sized argument for the 64-bit ABI. static bool CC_Sparc64_Full(unsigned &ValNo, MVT &ValVT, MVT &LocVT, CCValAssign::LocInfo &LocInfo, @@ -202,12 +225,34 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain, RetOps.push_back(SDValue()); // Copy the result values into the output registers. - for (unsigned i = 0; i != RVLocs.size(); ++i) { + for (unsigned i = 0, realRVLocIdx = 0; + i != RVLocs.size(); + ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); - Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), - OutVals[i], Flag); + SDValue Arg = OutVals[realRVLocIdx]; + + if (VA.needsCustom()) { + assert(VA.getLocVT() == MVT::v2i32); + // Legalize ret v2i32 -> ret 2 x i32 (Basically: do what would + // happen by default if this wasn't a legal type) + + SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + Arg, + DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout()))); + SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + Arg, + DAG.getConstant(1, DL, getVectorIdxTy(DAG.getDataLayout()))); + + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part0, Flag); + Flag = Chain.getValue(1); + RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); + VA = RVLocs[++i]; // skip ahead to next loc + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Part1, + Flag); + } else + Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); // Guarantee that all emitted copies are stuck together with flags. Flag = Chain.getValue(1); @@ -355,6 +400,7 @@ LowerFormalArguments_32(SDValue Chain, CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32); const unsigned StackOffset = 92; + bool IsLittleEndian = DAG.getDataLayout().isLittleEndian(); unsigned InIdx = 0; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i, ++InIdx) { @@ -375,7 +421,8 @@ LowerFormalArguments_32(SDValue Chain, if (VA.isRegLoc()) { if (VA.needsCustom()) { - assert(VA.getLocVT() == MVT::f64); + assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32); + unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass); MF.getRegInfo().addLiveIn(VA.getLocReg(), VRegHi); SDValue HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32); @@ -396,9 +443,13 @@ LowerFormalArguments_32(SDValue Chain, &SP::IntRegsRegClass); LoVal = DAG.getCopyFromReg(Chain, dl, loReg, MVT::i32); } + + if (IsLittleEndian) + std::swap(LoVal, HiVal); + SDValue WholeValue = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal); - WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue); + WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), WholeValue); InVals.push_back(WholeValue); continue; } @@ -422,7 +473,7 @@ LowerFormalArguments_32(SDValue Chain, auto PtrVT = getPointerTy(DAG.getDataLayout()); if (VA.needsCustom()) { - assert(VA.getValVT() == MVT::f64); + assert(VA.getValVT() == MVT::f64 || MVT::v2i32); // If it is double-word aligned, just load. 
if (Offset % 8 == 0) { int FI = MF.getFrameInfo()->CreateFixedObject(8, @@ -452,9 +503,12 @@ LowerFormalArguments_32(SDValue Chain, MachinePointerInfo(), false, false, false, 0); + if (IsLittleEndian) + std::swap(LoVal, HiVal); + SDValue WholeValue = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal); - WholeValue = DAG.getNode(ISD::BITCAST, dl, MVT::f64, WholeValue); + WholeValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), WholeValue); InVals.push_back(WholeValue); continue; } @@ -468,16 +522,12 @@ LowerFormalArguments_32(SDValue Chain, Load = DAG.getLoad(VA.getValVT(), dl, Chain, FIPtr, MachinePointerInfo(), false, false, false, 0); + } else if (VA.getValVT() == MVT::f128) { + report_fatal_error("SPARCv8 does not handle f128 in calls; " + "pass indirectly"); } else { - ISD::LoadExtType LoadOp = ISD::SEXTLOAD; - // Sparc is big endian, so add an offset based on the ObjectVT. - unsigned Offset = 4-std::max(1U, VA.getValVT().getSizeInBits()/8); - FIPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIPtr, - DAG.getConstant(Offset, dl, MVT::i32)); - Load = DAG.getExtLoad(LoadOp, dl, MVT::i32, Chain, FIPtr, - MachinePointerInfo(), - VA.getValVT(), false, false, false,0); - Load = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Load); + // We shouldn't see any other value types here. + llvm_unreachable("Unexpected ValVT encountered in frame lowering."); } InVals.push_back(Load); } @@ -612,7 +662,7 @@ LowerFormalArguments_64(SDValue Chain, InVals.push_back(DAG.getLoad( VA.getValVT(), DL, Chain, DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())), - MachinePointerInfo::getFixedStack(FI), false, false, false, 0)); + MachinePointerInfo::getFixedStack(MF, FI), false, false, false, 0)); } if (!IsVarArg) @@ -640,9 +690,9 @@ LowerFormalArguments_64(SDValue Chain, SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); int FI = MF.getFrameInfo()->CreateFixedObject(8, ArgOffset + ArgArea, true); auto PtrVT = getPointerTy(MF.getDataLayout()); - OutChains.push_back( - DAG.getStore(Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT), - MachinePointerInfo::getFixedStack(FI), false, false, 0)); + OutChains.push_back(DAG.getStore( + Chain, DL, VArg, DAG.getFrameIndex(FI, PtrVT), + MachinePointerInfo::getFixedStack(MF, FI), false, false, 0)); } if (!OutChains.empty()) @@ -788,7 +838,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, } if (VA.needsCustom()) { - assert(VA.getLocVT() == MVT::f64); + assert(VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2i32); if (VA.isMemLoc()) { unsigned Offset = VA.getLocMemOffset() + StackOffset; @@ -804,49 +854,53 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, } } - SDValue StackPtr = DAG.CreateStackTemporary(MVT::f64, MVT::i32); - SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, - Arg, StackPtr, MachinePointerInfo(), - false, false, 0); - // Sparc is big-endian, so the high part comes first. - SDValue Hi = DAG.getLoad(MVT::i32, dl, Store, StackPtr, - MachinePointerInfo(), false, false, false, 0); - // Increment the pointer to the other half. - StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr, - DAG.getIntPtrConstant(4, dl)); - // Load the low part. - SDValue Lo = DAG.getLoad(MVT::i32, dl, Store, StackPtr, - MachinePointerInfo(), false, false, false, 0); + if (VA.getLocVT() == MVT::f64) { + // Move from the float value from float registers into the + // integer registers. 
+ + // TODO: The f64 -> v2i32 conversion is super-inefficient for + // constants: it sticks them in the constant pool, then loads + // to a fp register, then stores to temp memory, then loads to + // integer registers. + Arg = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, Arg); + } + + SDValue Part0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + Arg, + DAG.getConstant(0, dl, getVectorIdxTy(DAG.getDataLayout()))); + SDValue Part1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + Arg, + DAG.getConstant(1, dl, getVectorIdxTy(DAG.getDataLayout()))); if (VA.isRegLoc()) { - RegsToPass.push_back(std::make_pair(VA.getLocReg(), Hi)); + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Part0)); assert(i+1 != e); CCValAssign &NextVA = ArgLocs[++i]; if (NextVA.isRegLoc()) { - RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Lo)); + RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Part1)); } else { - // Store the low part in stack. + // Store the second part in stack. unsigned Offset = NextVA.getLocMemOffset() + StackOffset; SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32); SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff); - MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff, + MemOpChains.push_back(DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo(), false, false, 0)); } } else { unsigned Offset = VA.getLocMemOffset() + StackOffset; - // Store the high part. + // Store the first part. SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32); SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff); - MemOpChains.push_back(DAG.getStore(Chain, dl, Hi, PtrOff, + MemOpChains.push_back(DAG.getStore(Chain, dl, Part0, PtrOff, MachinePointerInfo(), false, false, 0)); - // Store the low part. + // Store the second part. PtrOff = DAG.getIntPtrConstant(Offset + 4, dl); PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff); - MemOpChains.push_back(DAG.getStore(Chain, dl, Lo, PtrOff, + MemOpChains.push_back(DAG.getStore(Chain, dl, Part1, PtrOff, MachinePointerInfo(), false, false, 0)); } @@ -990,8 +1044,8 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const if (!CalleeFn) return 0; - assert(CalleeFn->hasStructRetAttr() && - "Callee does not have the StructRet attribute."); + // It would be nice to check for the sret attribute on CalleeFn here, + // but since it is not part of the function type, any check will misfire. PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType()); Type *ElementTy = Ty->getElementType(); @@ -1370,15 +1424,60 @@ static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) { SparcTargetLowering::SparcTargetLowering(TargetMachine &TM, const SparcSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { - auto &DL = *TM.getDataLayout(); + MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); + + // Instructions which use registers as conditionals examine all the + // bits (as does the pseudo SELECT_CC expansion). I don't think it + // matters much whether it's ZeroOrOneBooleanContent, or + // ZeroOrNegativeOneBooleanContent, so, arbitrarily choose the + // former. + setBooleanContents(ZeroOrOneBooleanContent); + setBooleanVectorContents(ZeroOrOneBooleanContent); // Set up the register classes. 
addRegisterClass(MVT::i32, &SP::IntRegsRegClass); addRegisterClass(MVT::f32, &SP::FPRegsRegClass); addRegisterClass(MVT::f64, &SP::DFPRegsRegClass); addRegisterClass(MVT::f128, &SP::QFPRegsRegClass); - if (Subtarget->is64Bit()) + if (Subtarget->is64Bit()) { addRegisterClass(MVT::i64, &SP::I64RegsRegClass); + } else { + // On 32bit sparc, we define a double-register 32bit register + // class, as well. This is modeled in LLVM as a 2-vector of i32. + addRegisterClass(MVT::v2i32, &SP::IntPairRegClass); + + // ...but almost all operations must be expanded, so set that as + // the default. + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { + setOperationAction(Op, MVT::v2i32, Expand); + } + // Truncating/extending stores/loads are also not supported. + for (MVT VT : MVT::integer_vector_valuetypes()) { + setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i32, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Expand); + + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, VT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, VT, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, VT, Expand); + + setTruncStoreAction(VT, MVT::v2i32, Expand); + setTruncStoreAction(MVT::v2i32, VT, Expand); + } + // However, load and store *are* legal. + setOperationAction(ISD::LOAD, MVT::v2i32, Legal); + setOperationAction(ISD::STORE, MVT::v2i32, Legal); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Legal); + setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Legal); + + // And we need to promote i64 loads/stores into vector load/store + setOperationAction(ISD::LOAD, MVT::i64, Custom); + setOperationAction(ISD::STORE, MVT::i64, Custom); + + // Sadly, this doesn't work: + // AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32); + // AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32); + } // Turn FP extload into load/fextend for (MVT VT : MVT::fp_valuetypes()) { @@ -1396,10 +1495,10 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM, setTruncStoreAction(MVT::f128, MVT::f64, Expand); // Custom legalize GlobalAddress nodes into LO/HI parts. - setOperationAction(ISD::GlobalAddress, getPointerTy(DL), Custom); - setOperationAction(ISD::GlobalTLSAddress, getPointerTy(DL), Custom); - setOperationAction(ISD::ConstantPool, getPointerTy(DL), Custom); - setOperationAction(ISD::BlockAddress, getPointerTy(DL), Custom); + setOperationAction(ISD::GlobalAddress, PtrVT, Custom); + setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom); + setOperationAction(ISD::ConstantPool, PtrVT, Custom); + setOperationAction(ISD::BlockAddress, PtrVT, Custom); // Sparc doesn't have sext_inreg, replace them with shl/sra setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); @@ -1579,9 +1678,6 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM, setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); - setExceptionPointerRegister(SP::I0); - setExceptionSelectorRegister(SP::I1); - setStackPointerRegisterToSaveRestore(SP::O6); setOperationAction(ISD::CTPOP, MVT::i32, @@ -1744,18 +1840,15 @@ void SparcTargetLowering::computeKnownBitsForTargetNode // set LHS/RHS and SPCC to the LHS/RHS of the setcc and SPCC to the condition. 
static void LookThroughSetCC(SDValue &LHS, SDValue &RHS, ISD::CondCode CC, unsigned &SPCC) { - if (isa<ConstantSDNode>(RHS) && - cast<ConstantSDNode>(RHS)->isNullValue() && + if (isNullConstant(RHS) && CC == ISD::SETNE && (((LHS.getOpcode() == SPISD::SELECT_ICC || LHS.getOpcode() == SPISD::SELECT_XCC) && LHS.getOperand(3).getOpcode() == SPISD::CMPICC) || (LHS.getOpcode() == SPISD::SELECT_FCC && LHS.getOperand(3).getOpcode() == SPISD::CMPFCC)) && - isa<ConstantSDNode>(LHS.getOperand(0)) && - isa<ConstantSDNode>(LHS.getOperand(1)) && - cast<ConstantSDNode>(LHS.getOperand(0))->isOne() && - cast<ConstantSDNode>(LHS.getOperand(1))->isNullValue()) { + isOneConstant(LHS.getOperand(0)) && + isNullConstant(LHS.getOperand(1))) { SDValue CMPCC = LHS.getOperand(3); SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue(); LHS = CMPCC.getOperand(0); @@ -1821,7 +1914,8 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setHasCalls(true); return DAG.getLoad(VT, DL, DAG.getEntryNode(), AbsAddr, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } // This is one of the absolute code models. @@ -1872,6 +1966,9 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); + SDLoc DL(GA); const GlobalValue *GV = GA->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2601,6 +2698,17 @@ static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG) return DAG.getMergeValues(Ops, dl); } +static SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) +{ + LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode()); + + EVT MemVT = LdNode->getMemoryVT(); + if (MemVT == MVT::f128) + return LowerF128Load(Op, DAG); + + return Op; +} + // Lower a f128 store into two f64 stores. static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); @@ -2645,6 +2753,29 @@ static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); } +static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) +{ + SDLoc dl(Op); + StoreSDNode *St = cast<StoreSDNode>(Op.getNode()); + + EVT MemVT = St->getMemoryVT(); + if (MemVT == MVT::f128) + return LowerF128Store(Op, DAG); + + if (MemVT == MVT::i64) { + // Custom handling for i64 stores: turn it into a bitcast and a + // v2i32 store. 
+ SDValue Val = DAG.getNode(ISD::BITCAST, dl, MVT::v2i32, St->getValue()); + SDValue Chain = DAG.getStore( + St->getChain(), dl, Val, St->getBasePtr(), St->getPointerInfo(), + St->isVolatile(), St->isNonTemporal(), St->getAlignment(), + St->getAAInfo()); + return Chain; + } + + return SDValue(); +} + static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) { assert((Op.getOpcode() == ISD::FNEG || Op.getOpcode() == ISD::FABS) && "invalid opcode"); @@ -2752,7 +2883,7 @@ static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG, SDValue MulResult = TLI.makeLibCall(DAG, RTLIB::MUL_I128, WideVT, - Args, 4, isSigned, dl).first; + Args, isSigned, dl).first; SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, MulResult, DAG.getIntPtrConstant(0, dl)); SDValue TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, @@ -2783,7 +2914,6 @@ static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) { return SDValue(); } - SDValue SparcTargetLowering:: LowerOperation(SDValue Op, SelectionDAG &DAG) const { @@ -2818,8 +2948,8 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget); - case ISD::LOAD: return LowerF128Load(Op, DAG); - case ISD::STORE: return LowerF128Store(Op, DAG); + case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::FADD: return LowerF128Op(Op, DAG, getLibcallName(RTLIB::ADD_F128), 2); case ISD::FSUB: return LowerF128Op(Op, DAG, @@ -2921,8 +3051,7 @@ SparcTargetLowering::expandSelectCC(MachineInstr *MI, // to set, the condition code register to branch on, the true/false values to // select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -3007,7 +3136,7 @@ SparcTargetLowering::expandAtomicRMW(MachineInstr *MI, .addReg(AddrReg).addImm(0); // Split the basic block MBB before MI and insert the loop block in the hole. - MachineFunction::iterator MFI = MBB; + MachineFunction::iterator MFI = MBB->getIterator(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *MF = MBB->getParent(); MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -3149,9 +3278,12 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': - return std::make_pair(0U, &SP::IntRegsRegClass); + if (VT == MVT::v2i32) + return std::make_pair(0U, &SP::IntPairRegClass); + else + return std::make_pair(0U, &SP::IntRegsRegClass); } - } else if (!Constraint.empty() && Constraint.size() <= 5 + } else if (!Constraint.empty() && Constraint.size() <= 5 && Constraint[0] == '{' && *(Constraint.end()-1) == '}') { // constraint = '{r<d>}' // Remove the braces from around the name. @@ -3227,5 +3359,24 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N, getLibcallName(libCall), 1)); return; + case ISD::LOAD: { + LoadSDNode *Ld = cast<LoadSDNode>(N); + // Custom handling only for i64: turn i64 load into a v2i32 load, + // and a bitcast. 
+ if (Ld->getValueType(0) != MVT::i64 || Ld->getMemoryVT() != MVT::i64) + return; + + SDLoc dl(N); + SDValue LoadRes = DAG.getExtLoad( + Ld->getExtensionType(), dl, MVT::v2i32, + Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), + MVT::v2i32, Ld->isVolatile(), Ld->isNonTemporal(), + Ld->isInvariant(), Ld->getAlignment(), Ld->getAAInfo()); + + SDValue Res = DAG.getNode(ISD::BITCAST, dl, MVT::i64, LoadRes); + Results.push_back(Res); + Results.push_back(LoadRes.getValue(1)); + return; + } } } diff --git a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h index bbc91a4..4e46709 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/contrib/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -89,6 +89,20 @@ namespace llvm { return MVT::i32; } + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return SP::I0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return SP::I1; + } + /// getSetCCResultType - Return the ISD::SETCC ValueType EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; @@ -167,8 +181,8 @@ namespace llvm { } void ReplaceNodeResults(SDNode *N, - SmallVectorImpl<SDValue>& Results, - SelectionDAG &DAG) const override; + SmallVectorImpl<SDValue>& Results, + SelectionDAG &DAG) const override; MachineBasicBlock *expandSelectCC(MachineInstr *MI, MachineBasicBlock *BB, unsigned BROpcode) const; diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td index 25cc652..d51e2cc 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrAliases.td @@ -250,6 +250,7 @@ defm : int_cond_alias<"n", 0b0000>; defm : int_cond_alias<"ne", 0b1001>; defm : int_cond_alias<"nz", 0b1001>; // same as ne defm : int_cond_alias<"e", 0b0001>; +defm : int_cond_alias<"eq", 0b0001>; // same as e defm : int_cond_alias<"z", 0b0001>; // same as e defm : int_cond_alias<"g", 0b1010>; defm : int_cond_alias<"le", 0b0010>; @@ -429,6 +430,9 @@ def : InstAlias<"wr $simm13, %tbr", (WRTBRri G0, i32imm:$simm13), 0>; def : InstAlias<"flush", (FLUSH), 0>; +def : MnemonicAlias<"lduw", "ld">, Requires<[HasV9]>; +def : MnemonicAlias<"lduwa", "lda">, Requires<[HasV9]>; + def : MnemonicAlias<"return", "rett">, Requires<[HasV9]>; def : MnemonicAlias<"addc", "addx">, Requires<[HasV9]>; @@ -450,3 +454,8 @@ def : InstAlias<"fcmpeq $rs1, $rs2", (V9FCMPEQ FCC0, QFPRegs:$rs1, QFPRegs:$rs2)>, Requires<[HasHardQuad]>; +// signx rd -> sra rd, %g0, rd +def : InstAlias<"signx $rd", (SRArr IntRegs:$rd, IntRegs:$rd, G0), 0>, Requires<[HasV9]>; + +// signx reg, rd -> sra reg, %g0, rd +def : InstAlias<"signx $rs1, $rd", (SRArr IntRegs:$rd, IntRegs:$rs1, G0), 0>, Requires<[HasV9]>; diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp index 6167c53..733027a 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.cpp @@ -284,7 +284,9 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned numSubRegs = 0; unsigned movOpc = 0; const unsigned *subRegIdx = nullptr; + bool ExtraG0 = false; + 
const unsigned DW_SubRegsIdx[] = { SP::sub_even, SP::sub_odd }; const unsigned DFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd }; const unsigned QFP_DFP_SubRegsIdx[] = { SP::sub_even64, SP::sub_odd64 }; const unsigned QFP_FP_SubRegsIdx[] = { SP::sub_even, SP::sub_odd, @@ -294,7 +296,12 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (SP::IntRegsRegClass.contains(DestReg, SrcReg)) BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0) .addReg(SrcReg, getKillRegState(KillSrc)); - else if (SP::FPRegsRegClass.contains(DestReg, SrcReg)) + else if (SP::IntPairRegClass.contains(DestReg, SrcReg)) { + subRegIdx = DW_SubRegsIdx; + numSubRegs = 2; + movOpc = SP::ORrr; + ExtraG0 = true; + } else if (SP::FPRegsRegClass.contains(DestReg, SrcReg)) BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg)) { @@ -347,7 +354,11 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned Src = TRI->getSubReg(SrcReg, subRegIdx[i]); assert(Dst && Src && "Bad sub-register"); - MovMI = BuildMI(MBB, I, DL, get(movOpc), Dst).addReg(Src); + MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(movOpc), Dst); + if (ExtraG0) + MIB.addReg(SP::G0); + MIB.addReg(Src); + MovMI = MIB.getInstr(); } // Add implicit super-register defs and kills to the last MovMI. MovMI->addRegisterDefined(DestReg, TRI); @@ -365,19 +376,20 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOStore, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); // On the order of operands here: think "[FrameIdx + 0] = SrcReg". 
- if (RC == &SP::I64RegsRegClass) + if (RC == &SP::I64RegsRegClass) BuildMI(MBB, I, DL, get(SP::STXri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); else if (RC == &SP::IntRegsRegClass) BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); + else if (RC == &SP::IntPairRegClass) + BuildMI(MBB, I, DL, get(SP::STDri)).addFrameIndex(FI).addImm(0) + .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); else if (RC == &SP::FPRegsRegClass) BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0) .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO); @@ -403,11 +415,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); if (RC == &SP::I64RegsRegClass) BuildMI(MBB, I, DL, get(SP::LDXri), DestReg).addFrameIndex(FI).addImm(0) @@ -415,6 +425,9 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, else if (RC == &SP::IntRegsRegClass) BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0) .addMemOperand(MMO); + else if (RC == &SP::IntPairRegClass) + BuildMI(MBB, I, DL, get(SP::LDDri), DestReg).addFrameIndex(FI).addImm(0) + .addMemOperand(MMO); else if (RC == &SP::FPRegsRegClass) BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0) .addMemOperand(MMO); diff --git a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td index 3b9e048..ec37c22 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/contrib/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -283,17 +283,32 @@ multiclass Load<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, [(set Ty:$dst, (OpNode ADDRri:$addr))]>; } +// TODO: Instructions of the LoadASI class are currently asm only; hooking up +// CodeGen's address spaces to use these is a future task. +class LoadASI<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, + RegisterClass RC, ValueType Ty> : + F3_1_asi<3, Op3Val, (outs RC:$dst), (ins MEMrr:$addr, i8imm:$asi), + !strconcat(OpcStr, "a [$addr] $asi, $dst"), + []>; + // LoadA multiclass - As above, but also define alternate address space variant multiclass LoadA<string OpcStr, bits<6> Op3Val, bits<6> LoadAOp3Val, SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> : Load<OpcStr, Op3Val, OpNode, RC, Ty> { - // TODO: The LD*Arr instructions are currently asm only; hooking up - // CodeGen's address spaces to use these is a future task. - def Arr : F3_1_asi<3, LoadAOp3Val, (outs RC:$dst), (ins MEMrr:$addr, i8imm:$asi), - !strconcat(OpcStr, "a [$addr] $asi, $dst"), - []>; + def Arr : LoadASI<OpcStr, LoadAOp3Val, OpNode, RC, Ty>; } +// The LDSTUB instruction is supported for asm only. +// It is unlikely that general-purpose code could make use of it. +// CAS is preferred for sparc v9. 
+def LDSTUBrr : F3_1<3, 0b001101, (outs IntRegs:$dst), (ins MEMrr:$addr), + "ldstub [$addr], $dst", []>; +def LDSTUBri : F3_2<3, 0b001101, (outs IntRegs:$dst), (ins MEMri:$addr), + "ldstub [$addr], $dst", []>; +def LDSTUBArr : F3_1_asi<3, 0b011101, (outs IntRegs:$dst), + (ins MEMrr:$addr, i8imm:$asi), + "ldstuba [$addr] $asi, $dst", []>; + // Store multiclass - Define both Reg+Reg/Reg+Imm patterns in one shot. multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> { @@ -307,14 +322,18 @@ multiclass Store<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, [(OpNode Ty:$rd, ADDRri:$addr)]>; } -multiclass StoreA<string OpcStr, bits<6> Op3Val, bits<6> StoreAOp3Val, +// TODO: Instructions of the StoreASI class are currently asm only; hooking up +// CodeGen's address spaces to use these is a future task. +class StoreASI<string OpcStr, bits<6> Op3Val, SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> : - Store<OpcStr, Op3Val, OpNode, RC, Ty> { - // TODO: The ST*Arr instructions are currently asm only; hooking up - // CodeGen's address spaces to use these is a future task. - def Arr : F3_1_asi<3, StoreAOp3Val, (outs), (ins MEMrr:$addr, RC:$rd, i8imm:$asi), + F3_1_asi<3, Op3Val, (outs), (ins MEMrr:$addr, RC:$rd, i8imm:$asi), !strconcat(OpcStr, "a $rd, [$addr] $asi"), []>; + +multiclass StoreA<string OpcStr, bits<6> Op3Val, bits<6> StoreAOp3Val, + SDPatternOperator OpNode, RegisterClass RC, ValueType Ty> : + Store<OpcStr, Op3Val, OpNode, RC, Ty> { + def Arr : StoreASI<OpcStr, StoreAOp3Val, OpNode, RC, Ty>; } //===----------------------------------------------------------------------===// @@ -408,15 +427,40 @@ let DecoderMethod = "DecodeLoadInt" in { defm LD : LoadA<"ld", 0b000000, 0b010000, load, IntRegs, i32>; } +let DecoderMethod = "DecodeLoadIntPair" in + defm LDD : LoadA<"ldd", 0b000011, 0b010011, load, IntPair, v2i32>; + // Section B.2 - Load Floating-point Instructions, p. 92 -let DecoderMethod = "DecodeLoadFP" in - defm LDF : Load<"ld", 0b100000, load, FPRegs, f32>; -let DecoderMethod = "DecodeLoadDFP" in - defm LDDF : Load<"ldd", 0b100011, load, DFPRegs, f64>; +let DecoderMethod = "DecodeLoadFP" in { + defm LDF : Load<"ld", 0b100000, load, FPRegs, f32>; + def LDFArr : LoadASI<"ld", 0b110000, load, FPRegs, f32>, + Requires<[HasV9]>; +} +let DecoderMethod = "DecodeLoadDFP" in { + defm LDDF : Load<"ldd", 0b100011, load, DFPRegs, f64>; + def LDDFArr : LoadASI<"ldd", 0b110011, load, DFPRegs, f64>, + Requires<[HasV9]>; +} let DecoderMethod = "DecodeLoadQFP" in - defm LDQF : Load<"ldq", 0b100010, load, QFPRegs, f128>, + defm LDQF : LoadA<"ldq", 0b100010, 0b110010, load, QFPRegs, f128>, Requires<[HasV9, HasHardQuad]>; +let DecoderMethod = "DecodeLoadFP" in + let Defs = [FSR] in { + let rd = 0 in { + def LDFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr), + "ld [$addr], %fsr", []>; + def LDFSRri : F3_2<3, 0b100001, (outs), (ins MEMri:$addr), + "ld [$addr], %fsr", []>; + } + let rd = 1 in { + def LDXFSRrr : F3_1<3, 0b100001, (outs), (ins MEMrr:$addr), + "ldx [$addr], %fsr", []>, Requires<[HasV9]>; + def LDXFSRri : F3_2<3, 0b100001, (outs), (ins MEMri:$addr), + "ldx [$addr], %fsr", []>, Requires<[HasV9]>; + } + } + // Section B.4 - Store Integer Instructions, p. 
95 let DecoderMethod = "DecodeStoreInt" in { defm STB : StoreA<"stb", 0b000101, 0b010101, truncstorei8, IntRegs, i32>; @@ -424,15 +468,40 @@ let DecoderMethod = "DecodeStoreInt" in { defm ST : StoreA<"st", 0b000100, 0b010100, store, IntRegs, i32>; } +let DecoderMethod = "DecodeStoreIntPair" in + defm STD : StoreA<"std", 0b000111, 0b010111, store, IntPair, v2i32>; + // Section B.5 - Store Floating-point Instructions, p. 97 -let DecoderMethod = "DecodeStoreFP" in +let DecoderMethod = "DecodeStoreFP" in { defm STF : Store<"st", 0b100100, store, FPRegs, f32>; -let DecoderMethod = "DecodeStoreDFP" in - defm STDF : Store<"std", 0b100111, store, DFPRegs, f64>; + def STFArr : StoreASI<"st", 0b110100, store, FPRegs, f32>, + Requires<[HasV9]>; +} +let DecoderMethod = "DecodeStoreDFP" in { + defm STDF : Store<"std", 0b100111, store, DFPRegs, f64>; + def STDFArr : StoreASI<"std", 0b110111, store, DFPRegs, f64>, + Requires<[HasV9]>; +} let DecoderMethod = "DecodeStoreQFP" in - defm STQF : Store<"stq", 0b100110, store, QFPRegs, f128>, + defm STQF : StoreA<"stq", 0b100110, 0b110110, store, QFPRegs, f128>, Requires<[HasV9, HasHardQuad]>; +let DecoderMethod = "DecodeStoreFP" in + let Defs = [FSR] in { + let rd = 0 in { + def STFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins), + "st %fsr, [$addr]", []>; + def STFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins), + "st %fsr, [$addr]", []>; + } + let rd = 1 in { + def STXFSRrr : F3_1<3, 0b100101, (outs MEMrr:$addr), (ins), + "stx %fsr, [$addr]", []>, Requires<[HasV9]>; + def STXFSRri : F3_2<3, 0b100101, (outs MEMri:$addr), (ins), + "stx %fsr, [$addr]", []>, Requires<[HasV9]>; + } + } + // Section B.8 - SWAP Register with Memory Instruction // (Atomic swap) let Constraints = "$val = $dst", DecoderMethod = "DecodeSWAP" in { @@ -559,6 +628,10 @@ let Defs = [Y, ICC] in { defm SMULCC : F3_12np<"smulcc", 0b011011>; } +let Defs = [Y, ICC], Uses = [Y, ICC] in { + defm MULSCC : F3_12np<"mulscc", 0b100100>; +} + // Section B.19 - Divide Instructions, p. 115 let Uses = [Y], Defs = [Y] in { defm UDIV : F3_12np<"udiv", 0b001110>; @@ -1221,8 +1294,8 @@ let Predicates = [HasV9] in { // the top 32-bits before using it. To do this clearing, we use a SRLri X,0. 
let rs1 = 0 in def POPCrr : F3_1<2, 0b101110, - (outs IntRegs:$dst), (ins IntRegs:$src), - "popc $src, $dst", []>, Requires<[HasV9]>; + (outs IntRegs:$rd), (ins IntRegs:$rs2), + "popc $rs2, $rd", []>, Requires<[HasV9]>; def : Pat<(ctpop i32:$src), (POPCrr (SRLri $src, 0))>; @@ -1254,6 +1327,25 @@ let hasSideEffects = 1 in { } } + +// Section A.43 - Read Privileged Register Instructions +let Predicates = [HasV9] in { +let rs2 = 0 in + def RDPR : F3_1<2, 0b101010, + (outs IntRegs:$rd), (ins PRRegs:$rs1), + "rdpr $rs1, $rd", []>; +} + +// Section A.62 - Write Privileged Register Instructions +let Predicates = [HasV9] in { + def WRPRrr : F3_1<2, 0b110010, + (outs PRRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2), + "wrpr $rs1, $rs2, $rd", []>; + def WRPRri : F3_2<2, 0b110010, + (outs PRRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13), + "wrpr $rs1, $simm13, $rd", []>; +} + //===----------------------------------------------------------------------===// // Non-Instruction Patterns //===----------------------------------------------------------------------===// @@ -1327,6 +1419,18 @@ def : Pat<(i32 (atomic_load ADDRri:$src)), (LDri ADDRri:$src)>; def : Pat<(atomic_store ADDRrr:$dst, i32:$val), (STrr ADDRrr:$dst, $val)>; def : Pat<(atomic_store ADDRri:$dst, i32:$val), (STri ADDRri:$dst, $val)>; +// extract_vector +def : Pat<(extractelt (v2i32 IntPair:$Rn), 0), + (i32 (EXTRACT_SUBREG IntPair:$Rn, sub_even))>; +def : Pat<(extractelt (v2i32 IntPair:$Rn), 1), + (i32 (EXTRACT_SUBREG IntPair:$Rn, sub_odd))>; + +// build_vector +def : Pat<(build_vector (i32 IntRegs:$a1), (i32 IntRegs:$a2)), + (INSERT_SUBREG + (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 IntRegs:$a1), sub_even), + (i32 IntRegs:$a2), sub_odd)>; + include "SparcInstr64Bit.td" include "SparcInstrVIS.td" diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp index 9667bc0..da31783 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp @@ -75,6 +75,18 @@ BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(SP::G6); Reserved.set(SP::G7); + // Also reserve the register pair aliases covering the above + // registers, with the same conditions. + Reserved.set(SP::G0_G1); + if (ReserveAppRegisters) + Reserved.set(SP::G2_G3); + if (ReserveAppRegisters || !Subtarget.is64Bit()) + Reserved.set(SP::G4_G5); + + Reserved.set(SP::O6_O7); + Reserved.set(SP::I6_I7); + Reserved.set(SP::G6_G7); + // Unaliased double registers are not available in non-V9 targets. if (!Subtarget.isV9()) { for (unsigned n = 0; n != 16; ++n) { @@ -158,21 +170,15 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineInstr &MI = *II; DebugLoc dl = MI.getDebugLoc(); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); - - // Addressable stack objects are accessed using neg. offsets from %fp MachineFunction &MF = *MI.getParent()->getParent(); const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>(); - int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) + - MI.getOperand(FIOperandNum + 1).getImm() + - Subtarget.getStackPointerBias(); - SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>(); - unsigned FramePtr = SP::I6; - if (FuncInfo->isLeafProc()) { - // Use %sp and adjust offset if needed. - FramePtr = SP::O6; - int stackSize = MF.getFrameInfo()->getStackSize(); - Offset += (stackSize) ? 
Subtarget.getAdjustedFrameSize(stackSize) : 0 ; - } + const SparcFrameLowering *TFI = getFrameLowering(MF); + + unsigned FrameReg; + int Offset; + Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg); + + Offset += MI.getOperand(FIOperandNum + 1).getImm(); if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) { if (MI.getOpcode() == SP::STQFri) { @@ -182,8 +188,8 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned SrcOddReg = getSubReg(SrcReg, SP::sub_odd64); MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri)) - .addReg(FramePtr).addImm(0).addReg(SrcEvenReg); - replaceFI(MF, II, *StMI, dl, 0, Offset, FramePtr); + .addReg(FrameReg).addImm(0).addReg(SrcEvenReg); + replaceFI(MF, II, *StMI, dl, 0, Offset, FrameReg); MI.setDesc(TII.get(SP::STDFri)); MI.getOperand(2).setReg(SrcOddReg); Offset += 8; @@ -194,8 +200,8 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned DestOddReg = getSubReg(DestReg, SP::sub_odd64); MachineInstr *StMI = BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg) - .addReg(FramePtr).addImm(0); - replaceFI(MF, II, *StMI, dl, 1, Offset, FramePtr); + .addReg(FrameReg).addImm(0); + replaceFI(MF, II, *StMI, dl, 1, Offset, FrameReg); MI.setDesc(TII.get(SP::LDDFri)); MI.getOperand(0).setReg(DestOddReg); @@ -203,7 +209,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } } - replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FramePtr); + replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FrameReg); } @@ -211,3 +217,25 @@ unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const { return SP::I6; } +// Sparc has no architectural need for stack realignment support, +// except that LLVM unfortunately currently implements overaligned +// stack objects by depending upon stack realignment support. +// If that ever changes, this can probably be deleted. +bool SparcRegisterInfo::canRealignStack(const MachineFunction &MF) const { + if (!TargetRegisterInfo::canRealignStack(MF)) + return false; + + // Sparc always has a fixed frame pointer register, so don't need to + // worry about needing to reserve it. [even if we don't have a frame + // pointer for our frame, it still cannot be used for other things, + // or register window traps will be SADNESS.] + + // If there's a reserved call frame, we can use SP to access locals. + if (getFrameLowering(MF)->hasReservedCallFrame(MF)) + return true; + + // Otherwise, we'd need a base pointer, but those aren't implemented + // for SPARC at the moment. + + return false; +} diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h index 764a894..32075b1 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h +++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.h @@ -42,8 +42,10 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo { void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS = nullptr) const; - // Debug information queries. 
unsigned getFrameRegister(const MachineFunction &MF) const override; + + bool canRealignStack(const MachineFunction &MF) const override; + }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td index db8a7e8..cca9463 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td +++ b/contrib/llvm/lib/Target/Sparc/SparcRegisterInfo.td @@ -32,6 +32,12 @@ def sub_odd64 : SubRegIndex<64, 64>; // Ri - 32-bit integer registers class Ri<bits<16> Enc, string n> : SparcReg<Enc, n>; +// Rdi - pairs of 32-bit integer registers +class Rdi<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> { + let SubRegs = subregs; + let SubRegIndices = [sub_even, sub_odd]; + let CoveredBySubRegs = 1; +} // Rf - 32-bit floating-point registers class Rf<bits<16> Enc, string n> : SparcReg<Enc, n>; @@ -54,6 +60,8 @@ def ICC : SparcCtrlReg<0, "ICC">; // This represents icc and xcc in 64-bit code. foreach I = 0-3 in def FCC#I : SparcCtrlReg<I, "FCC"#I>; +def FSR : SparcCtrlReg<0, "FSR">; // Floating-point state register. + // Y register def Y : SparcCtrlReg<0, "Y">, DwarfRegNum<[64]>; // Ancillary state registers (implementation defined) @@ -94,6 +102,22 @@ def PSR : SparcCtrlReg<0, "PSR">; def WIM : SparcCtrlReg<0, "WIM">; def TBR : SparcCtrlReg<0, "TBR">; +def TPC : SparcCtrlReg<0, "TPC">; +def TNPC : SparcCtrlReg<1, "TNPC">; +def TSTATE : SparcCtrlReg<2, "TSTATE">; +def TT : SparcCtrlReg<3, "TT">; +def TICK : SparcCtrlReg<4, "TICK">; +def TBA : SparcCtrlReg<5, "TBA">; +def PSTATE : SparcCtrlReg<6, "PSTATE">; +def TL : SparcCtrlReg<7, "TL">; +def PIL : SparcCtrlReg<8, "PIL">; +def CWP : SparcCtrlReg<9, "CWP">; +def CANSAVE : SparcCtrlReg<10, "CANSAVE">; +def CANRESTORE : SparcCtrlReg<11, "CANRESTORE">; +def CLEANWIN : SparcCtrlReg<12, "CLEANWIN">; +def OTHERWIN : SparcCtrlReg<13, "OTHERWIN">; +def WSTATE : SparcCtrlReg<14, "WSTATE">; + // Integer registers def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>; def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>; @@ -217,6 +241,24 @@ def Q13 : Rq<21, "F52", [D26, D27]>; def Q14 : Rq<25, "F56", [D28, D29]>; def Q15 : Rq<29, "F60", [D30, D31]>; +// Aliases of the integer registers used for LDD/STD double-word operations +def G0_G1 : Rdi<0, "G0", [G0, G1]>; +def G2_G3 : Rdi<2, "G2", [G2, G3]>; +def G4_G5 : Rdi<4, "G4", [G4, G5]>; +def G6_G7 : Rdi<6, "G6", [G6, G7]>; +def O0_O1 : Rdi<8, "O0", [O0, O1]>; +def O2_O3 : Rdi<10, "O2", [O2, O3]>; +def O4_O5 : Rdi<12, "O4", [O4, O5]>; +def O6_O7 : Rdi<14, "O6", [O6, O7]>; +def L0_L1 : Rdi<16, "L0", [L0, L1]>; +def L2_L3 : Rdi<18, "L2", [L2, L3]>; +def L4_L5 : Rdi<20, "L4", [L4, L5]>; +def L6_L7 : Rdi<22, "L6", [L6, L7]>; +def I0_I1 : Rdi<24, "I0", [I0, I1]>; +def I2_I3 : Rdi<26, "I2", [I2, I3]>; +def I4_I5 : Rdi<28, "I4", [I4, I5]>; +def I6_I7 : Rdi<30, "I6", [I6, I7]>; + // Register classes. // // FIXME: the register order should be defined in terms of the preferred @@ -231,6 +273,13 @@ def IntRegs : RegisterClass<"SP", [i32, i64], 32, (sequence "L%u", 0, 7), (sequence "O%u", 0, 7))>; +// Should be in the same order as IntRegs. +def IntPair : RegisterClass<"SP", [v2i32], 64, + (add I0_I1, I2_I3, I4_I5, I6_I7, + G0_G1, G2_G3, G4_G5, G6_G7, + L0_L1, L2_L3, L4_L5, L6_L7, + O0_O1, O2_O3, O4_O5, O6_O7)>; + // Register class for 64-bit mode, with a 64-bit spill slot size. // These are the same as the 32-bit registers, so TableGen will consider this // to be a sub-class of IntRegs. 
That works out because requiring a 64-bit @@ -252,3 +301,8 @@ def ASRRegs : RegisterClass<"SP", [i32], 32, (add Y, (sequence "ASR%u", 1, 31))> { let isAllocatable = 0; } + +// Privileged Registers +def PRRegs : RegisterClass<"SP", [i64], 64, + (add TPC, TNPC, TSTATE, TT, TICK, TBA, PSTATE, TL, PIL, CWP, + CANSAVE, CANRESTORE, CLEANWIN, OTHERWIN, WSTATE)>; diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp index d69da40..d701594 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp +++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.cpp @@ -64,7 +64,7 @@ int SparcSubtarget::getAdjustedFrameSize(int frameSize) const { frameSize += 128; // Frames with calls must also reserve space for 6 outgoing arguments // whether they are used or not. LowerCall_64 takes care of that. - assert(frameSize % 16 == 0 && "Stack size not 16-byte aligned"); + frameSize = RoundUpToAlignment(frameSize, 16); } else { // Emit the correct save instruction based on the number of bytes in // the frame. Minimum stack frame size according to V8 ABI is: @@ -81,3 +81,7 @@ int SparcSubtarget::getAdjustedFrameSize(int frameSize) const { } return frameSize; } + +bool SparcSubtarget::enableMachineScheduler() const { + return true; +} diff --git a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h index 9d21911..e2fd2f0 100644 --- a/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h +++ b/contrib/llvm/lib/Target/Sparc/SparcSubtarget.h @@ -60,6 +60,8 @@ public: return &TSInfo; } + bool enableMachineScheduler() const override; + bool isV9() const { return IsV9; } bool isVIS() const { return IsVIS; } bool isVIS2() const { return IsVIS2; } @@ -85,7 +87,6 @@ public: /// returns adjusted framesize which includes space for register window /// spills and arguments. int getAdjustedFrameSize(int stackSize) const; - }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 3aa4c6b..9c995bf 100644 --- a/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/contrib/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -349,7 +349,6 @@ class SystemZAsmParser : public MCTargetAsmParser { #include "SystemZGenAsmMatcher.inc" private: - MCSubtargetInfo &STI; MCAsmParser &Parser; enum RegisterGroup { RegGR, @@ -386,14 +385,14 @@ private: bool parseOperand(OperandVector &Operands, StringRef Mnemonic); public: - SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser, + SystemZAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser, const MCInstrInfo &MII, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), Parser(parser) { + : MCTargetAsmParser(Options, sti), Parser(parser) { MCAsmParserExtension::Initialize(Parser); // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); } // Override MCTargetAsmParser. @@ -533,14 +532,16 @@ bool SystemZAsmParser::parseRegister(Register &Reg) { } // Parse a register of group Group. If Regs is nonnull, use it to map -// the raw register number to LLVM numbering, with zero entries indicating -// an invalid register. IsAddress says whether the register appears in an -// address context. +// the raw register number to LLVM numbering, with zero entries +// indicating an invalid register. 
IsAddress says whether the +// register appears in an address context. Allow FP Group if expecting +// RegV Group, since the f-prefix yields the FP group even while used +// with vector instructions. bool SystemZAsmParser::parseRegister(Register &Reg, RegisterGroup Group, const unsigned *Regs, bool IsAddress) { if (parseRegister(Reg)) return true; - if (Reg.Group != Group) + if (Reg.Group != Group && !(Reg.Group == RegFP && Group == RegV)) return Error(Reg.StartLoc, "invalid operand for instruction"); if (Regs && Regs[Reg.Num] == 0) return Error(Reg.StartLoc, "invalid register pair"); @@ -791,7 +792,7 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, switch (MatchResult) { case Match_Success: Inst.setLoc(IDLoc); - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, getSTI()); return false; case Match_MissingFeature: { diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp index 059ae3f..6444cf8 100644 --- a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp +++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp @@ -60,15 +60,15 @@ void SystemZInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const { O << '%' << getRegisterName(RegNo); } -template<unsigned N> -void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { +template <unsigned N> +static void printUImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { int64_t Value = MI->getOperand(OpNum).getImm(); assert(isUInt<N>(Value) && "Invalid uimm argument"); O << Value; } -template<unsigned N> -void printSImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { +template <unsigned N> +static void printSImmOperand(const MCInst *MI, int OpNum, raw_ostream &O) { int64_t Value = MI->getOperand(OpNum).getImm(); assert(isInt<N>(Value) && "Invalid simm argument"); O << Value; diff --git a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h index ba55e68..7ca386f 100644 --- a/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h +++ b/contrib/llvm/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h @@ -15,7 +15,6 @@ #define LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/Compiler.h" namespace llvm { class MCOperand; diff --git a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp index 5fefa31..2115d44 100644 --- a/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp @@ -226,7 +226,7 @@ extern "C" void LLVMInitializeSystemZTargetMC() { // Register the MCCodeEmitter. TargetRegistry::RegisterMCCodeEmitter(TheSystemZTarget, - createSystemZMCCodeEmitter); + createSystemZMCCodeEmitter); // Register the MCInstrInfo. TargetRegistry::RegisterMCInstrInfo(TheSystemZTarget, diff --git a/contrib/llvm/lib/Target/SystemZ/README.txt b/contrib/llvm/lib/Target/SystemZ/README.txt index e089047..cd367d6 100644 --- a/contrib/llvm/lib/Target/SystemZ/README.txt +++ b/contrib/llvm/lib/Target/SystemZ/README.txt @@ -52,12 +52,6 @@ We don't use the TEST DATA CLASS instructions. -- -We could use the generic floating-point forms of LOAD COMPLEMENT, -LOAD NEGATIVE and LOAD POSITIVE in cases where we don't need the -condition codes. 
For example, we could use LCDFR instead of LCDBR. - --- - We only use MVC, XC and CLC for constant-length block operations. We could extend them to variable-length operations too, using EXECUTE RELATIVE LONG. diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp index 3dca7bd..7527311 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp @@ -288,7 +288,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { MCSymbolRefExpr::create(getSymbol(ZCPV->getGlobalValue()), getModifierVariantKind(ZCPV->getModifier()), OutContext); - uint64_t Size = TM.getDataLayout()->getTypeAllocSize(ZCPV->getType()); + uint64_t Size = getDataLayout().getTypeAllocSize(ZCPV->getType()); OutStreamer->EmitValue(Expr, Size); } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp index 44ea1d2..4a6beb6 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp @@ -26,21 +26,6 @@ SystemZConstantPoolValue::Create(const GlobalValue *GV, return new SystemZConstantPoolValue(GV, Modifier); } -unsigned SystemZConstantPoolValue::getRelocationInfo() const { - switch (Modifier) { - case SystemZCP::TLSGD: - case SystemZCP::TLSLDM: - case SystemZCP::DTPOFF: - // May require a dynamic relocation. - return 2; - case SystemZCP::NTPOFF: - // May require a relocation, but the relocations are always resolved - // by the static linker. - return 1; - } - llvm_unreachable("Unknown modifier"); -} - int SystemZConstantPoolValue:: getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) { unsigned AlignMask = Alignment - 1; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h index e5f1bb1..a71b595 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h @@ -43,7 +43,6 @@ public: Create(const GlobalValue *GV, SystemZCP::SystemZCPModifier Modifier); // Override MachineConstantPoolValue. - unsigned getRelocationInfo() const override; int getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) override; void addSelectionDAGCSEId(FoldingSetNodeID &ID) override; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp index 16f9adc..4818ed0 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp @@ -37,13 +37,11 @@ namespace { // instructions. struct Reference { Reference() - : Def(false), Use(false), IndirectDef(false), IndirectUse(false) {} + : Def(false), Use(false) {} Reference &operator|=(const Reference &Other) { Def |= Other.Def; - IndirectDef |= Other.IndirectDef; Use |= Other.Use; - IndirectUse |= Other.IndirectUse; return *this; } @@ -53,11 +51,6 @@ struct Reference { // via a sub- or super-register. bool Def; bool Use; - - // True if the register is defined or used indirectly, by a sub- or - // super-register. - bool IndirectDef; - bool IndirectUse; }; class SystemZElimCompare : public MachineFunctionPass { @@ -104,14 +97,12 @@ static bool isCCLiveOut(MachineBasicBlock &MBB) { return false; } -// Return true if any CC result of MI would reflect the value of subreg -// SubReg of Reg. 
-static bool resultTests(MachineInstr *MI, unsigned Reg, unsigned SubReg) { +// Return true if any CC result of MI would reflect the value of Reg. +static bool resultTests(MachineInstr *MI, unsigned Reg) { if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg() && MI->getOperand(0).isDef() && - MI->getOperand(0).getReg() == Reg && - MI->getOperand(0).getSubReg() == SubReg) + MI->getOperand(0).getReg() == Reg) return true; switch (MI->getOpcode()) { @@ -127,30 +118,25 @@ static bool resultTests(MachineInstr *MI, unsigned Reg, unsigned SubReg) { case SystemZ::LTEBR: case SystemZ::LTDBR: case SystemZ::LTXBR: - if (MI->getOperand(1).getReg() == Reg && - MI->getOperand(1).getSubReg() == SubReg) + if (MI->getOperand(1).getReg() == Reg) return true; } return false; } -// Describe the references to Reg in MI, including sub- and super-registers. +// Describe the references to Reg or any of its aliases in MI. Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) { Reference Ref; for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { const MachineOperand &MO = MI->getOperand(I); if (MO.isReg()) { if (unsigned MOReg = MO.getReg()) { - if (MOReg == Reg || TRI->regsOverlap(MOReg, Reg)) { - if (MO.isUse()) { + if (TRI->regsOverlap(MOReg, Reg)) { + if (MO.isUse()) Ref.Use = true; - Ref.IndirectUse |= (MOReg != Reg); - } - if (MO.isDef()) { + else if (MO.isDef()) Ref.Def = true; - Ref.IndirectDef |= (MOReg != Reg); - } } } } @@ -158,6 +144,30 @@ Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) { return Ref; } +// Return true if this is a load and test which can be optimized the +// same way as compare instruction. +static bool isLoadAndTestAsCmp(MachineInstr *MI) { + // If we during isel used a load-and-test as a compare with 0, the + // def operand is dead. + return ((MI->getOpcode() == SystemZ::LTEBR || + MI->getOpcode() == SystemZ::LTDBR || + MI->getOpcode() == SystemZ::LTXBR) && + MI->getOperand(0).isDead()); +} + +// Return the source register of Compare, which is the unknown value +// being tested. +static unsigned getCompareSourceReg(MachineInstr *Compare) { + unsigned reg = 0; + if (Compare->isCompare()) + reg = Compare->getOperand(0).getReg(); + else if (isLoadAndTestAsCmp(Compare)) + reg = Compare->getOperand(1).getReg(); + assert (reg); + + return reg; +} + // Compare compares the result of MI against zero. If MI is an addition // of -1 and if CCUsers is a single branch on nonzero, eliminate the addition // and convert the branch to a BRCT(G). Return true on success. @@ -188,7 +198,7 @@ SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare, // We already know that there are no references to the register between // MI and Compare. Make sure that there are also no references between // Compare and Branch. - unsigned SrcReg = Compare->getOperand(0).getReg(); + unsigned SrcReg = getCompareSourceReg(Compare); MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch; for (++MBBI; MBBI != MBBE; ++MBBI) if (getRegReferences(MBBI, SrcReg)) @@ -196,16 +206,15 @@ SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare, // The transformation is OK. Rebuild Branch as a BRCT(G). 
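An aside on the BRCT(G) rebuild above: BRCT/BRCTG decrement a register and branch while it is still nonzero, so the add-of--1 plus compare-against-zero plus branch-on-nonzero sequence that convertToBRCT looks for is typically what a count-down loop compiles to. Below is a minimal, self-contained C++ illustration of that source-level pattern; the names buf, n and sum are made up for this sketch and it is not part of the patch.

  // Illustrative only: the kind of count-down loop whose "add -1, compare
  // with 0, branch if nonzero" sequence convertToBRCT can fuse into a
  // single BRCT/BRCTG on SystemZ.
  #include <cstdio>

  static long sum(const long *buf, unsigned long n) {
    long s = 0;
    do {
      s += *buf++;
    } while (--n != 0);   // decrement, test against zero, branch
    return s;
  }

  int main() {
    long data[4] = {1, 2, 3, 4};
    std::printf("%ld\n", sum(data, 4));
    return 0;
  }
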
MachineOperand Target(Branch->getOperand(2)); - Branch->RemoveOperand(2); - Branch->RemoveOperand(1); - Branch->RemoveOperand(0); + while (Branch->getNumOperands()) + Branch->RemoveOperand(0); Branch->setDesc(TII->get(BRCT)); MachineInstrBuilder(*Branch->getParent()->getParent(), Branch) .addOperand(MI->getOperand(0)) .addOperand(MI->getOperand(1)) .addOperand(Target) .addReg(SystemZ::CC, RegState::ImplicitDefine); - MI->removeFromParent(); + MI->eraseFromParent(); return true; } @@ -308,6 +317,10 @@ static bool isCompareZero(MachineInstr *Compare) { return true; default: + + if (isLoadAndTestAsCmp(Compare)) + return true; + return (Compare->getNumExplicitOperands() == 2 && Compare->getOperand(1).isImm() && Compare->getOperand(1).getImm() == 0); @@ -325,8 +338,7 @@ optimizeCompareZero(MachineInstr *Compare, return false; // Search back for CC results that are based on the first operand. - unsigned SrcReg = Compare->getOperand(0).getReg(); - unsigned SrcSubReg = Compare->getOperand(0).getSubReg(); + unsigned SrcReg = getCompareSourceReg(Compare); MachineBasicBlock &MBB = *Compare->getParent(); MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB.begin(); Reference CCRefs; @@ -334,7 +346,7 @@ optimizeCompareZero(MachineInstr *Compare, while (MBBI != MBBE) { --MBBI; MachineInstr *MI = MBBI; - if (resultTests(MI, SrcReg, SrcSubReg)) { + if (resultTests(MI, SrcReg)) { // Try to remove both MI and Compare by converting a branch to BRCT(G). // We don't care in this case whether CC is modified between MI and // Compare. @@ -435,23 +447,21 @@ bool SystemZElimCompare::processBlock(MachineBasicBlock &MBB) { while (MBBI != MBB.begin()) { MachineInstr *MI = --MBBI; if (CompleteCCUsers && - MI->isCompare() && + (MI->isCompare() || isLoadAndTestAsCmp(MI)) && (optimizeCompareZero(MI, CCUsers) || fuseCompareAndBranch(MI, CCUsers))) { ++MBBI; - MI->removeFromParent(); + MI->eraseFromParent(); Changed = true; CCUsers.clear(); - CompleteCCUsers = true; continue; } - Reference CCRefs(getRegReferences(MI, SystemZ::CC)); - if (CCRefs.Def) { + if (MI->definesRegister(SystemZ::CC)) { CCUsers.clear(); - CompleteCCUsers = !CCRefs.IndirectDef; + CompleteCCUsers = true; } - if (CompleteCCUsers && CCRefs.Use) + if (MI->readsRegister(SystemZ::CC) && CompleteCCUsers) CCUsers.push_back(MI); } return Changed; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 397de47..e1b20d0 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -48,7 +48,8 @@ static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = { SystemZFrameLowering::SystemZFrameLowering() : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8, - -SystemZMC::CallFrameSize, 8) { + -SystemZMC::CallFrameSize, 8, + false /* StackRealignable */) { // Create a mapping from register number to save slot offset. RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS); for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I) @@ -133,7 +134,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB, const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>(); bool IsVarArg = MF.getFunction()->isVarArg(); - DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + DebugLoc DL; // Scan the call-saved GPRs and find the bounds of the register spill area. 
unsigned LowGPR = 0; @@ -322,7 +323,10 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF, const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); const std::vector<CalleeSavedInfo> &CSI = MFFrame->getCalleeSavedInfo(); bool HasFP = hasFP(MF); - DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. + DebugLoc DL; // The current offset of the stack pointer from the CFA. int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP; @@ -394,7 +398,10 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF, // Add CFI for the this save. unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - int64_t Offset = getFrameIndexOffset(MF, Save.getFrameIdx()); + unsigned IgnoredFrameReg; + int64_t Offset = + getFrameIndexReference(MF, Save.getFrameIdx(), IgnoredFrameReg); + unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( nullptr, DwarfReg, SPOffsetFromCFA + Offset)); CFIIndexes.push_back(CFIIndex); @@ -455,9 +462,14 @@ bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const { MF.getInfo<SystemZMachineFunctionInfo>()->getManipulatesSP()); } -int SystemZFrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { +int SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFFrame = MF.getFrameInfo(); + const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); + + // Fill in FrameReg output argument. + FrameReg = RI->getFrameRegister(MF); // Start with the offset of FI from the top of the caller-allocated frame // (i.e. the top of the 160 bytes allocated by the caller). This initial diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index 5ade757..46bb6b7 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -43,7 +43,8 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; bool hasFP(const MachineFunction &MF) const override; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; + int getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 75fd37f..a909309 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -585,7 +585,7 @@ bool SystemZDAGToDAGISel::selectAddress(SDValue Addr, static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) { if (N.getNode()->getNodeId() == -1 || N.getNode()->getNodeId() > Pos->getNodeId()) { - DAG->RepositionNode(Pos, N.getNode()); + DAG->RepositionNode(Pos->getIterator(), N.getNode()); N.getNode()->setNodeId(Pos->getNodeId()); } } @@ -801,7 +801,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const { RxSBG.Input = N.getOperand(0); return true; } - + case ISD::ANY_EXTEND: // Bits above the extended operand are don't-care. 
RxSBG.Input = N.getOperand(0); @@ -818,7 +818,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const { return true; } // Fall through. - + case ISD::SIGN_EXTEND: { // Check that the extension bits are don't-care (i.e. are masked out // by the final mask). @@ -938,7 +938,23 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) { } return nullptr; } - } + } + + // If the RISBG operands require no rotation and just masks the bottom + // 8/16 bits, attempt to convert this to a LLC zero extension. + if (RISBG.Rotate == 0 && (RISBG.Mask == 0xff || RISBG.Mask == 0xffff)) { + unsigned OpCode = (RISBG.Mask == 0xff ? SystemZ::LLGCR : SystemZ::LLGHR); + if (VT == MVT::i32) { + if (Subtarget->hasHighWord()) + OpCode = (RISBG.Mask == 0xff ? SystemZ::LLCRMux : SystemZ::LLHRMux); + else + OpCode = (RISBG.Mask == 0xff ? SystemZ::LLCR : SystemZ::LLHR); + } + + SDValue In = convertTo(DL, VT, RISBG.Input); + N = CurDAG->getMachineNode(OpCode, DL, VT, In); + return convertTo(DL, VT, SDValue(N, 0)).getNode(); + } unsigned Opcode = SystemZ::RISBG; // Prefer RISBGN if available, since it does not clobber CC. diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 9a753c8..ee73267 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -84,8 +84,7 @@ static MachineOperand earlyUseOperand(MachineOperand Op) { SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, const SystemZSubtarget &STI) : TargetLowering(TM), Subtarget(STI) { - auto &DL = *TM.getDataLayout(); - MVT PtrVT = getPointerTy(DL); + MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); // Set up the register classes. if (Subtarget.hasHighWord()) @@ -115,8 +114,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, computeRegisterProperties(Subtarget.getRegisterInfo()); // Set up special registers. - setExceptionPointerRegister(SystemZ::R6D); - setExceptionSelectorRegister(SystemZ::R7D); setStackPointerRegisterToSaveRestore(SystemZ::R15D); // TODO: It may be better to default to latency-oriented scheduling, however @@ -370,7 +367,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // No special instructions for these. setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); } } @@ -776,9 +775,7 @@ bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType, } bool SystemZTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall()) - return false; - return true; + return CI->isTailCall(); } // We do not yet support 128-bit single-element vector types. 
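A note on the LLC/LLH shortcut added to tryRISBGZero above: a rotate-and-insert with zero rotation whose mask keeps only the low 8 or 16 bits is exactly an 8- or 16-bit zero extension, so a plain zero-extension opcode can be used instead of RISBG. A small standalone model of that check follows (plain C++, not LLVM code; the classify helper and its names are invented for illustration).

  // Minimal model of the new check: no rotation plus a 0xff/0xffff mask
  // is just a zero extension of the low byte or halfword.
  #include <cassert>
  #include <cstdint>

  enum class ZeroExt { None, From8, From16 };

  // Rotate and Mask stand for the already-computed RISBG operands.
  static ZeroExt classify(unsigned Rotate, uint64_t Mask) {
    if (Rotate != 0)
      return ZeroExt::None;
    if (Mask == 0xff)
      return ZeroExt::From8;
    if (Mask == 0xffff)
      return ZeroExt::From16;
    return ZeroExt::None;
  }

  int main() {
    // (x rotated by 0) & 0xff is the same as zero-extending the low byte.
    uint64_t x = 0x1234567890abcdefULL;
    assert((x & 0xff) == static_cast<uint8_t>(x));
    assert(classify(0, 0xff) == ZeroExt::From8);
    assert(classify(0, 0xffff) == ZeroExt::From16);
    assert(classify(8, 0xff) == ZeroExt::None);
    return 0;
  }
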
If the user @@ -939,8 +936,8 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); ArgValue = DAG.getLoad(LocVT, DL, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + MachinePointerInfo::getFixedStack(MF, FI), false, + false, false, 0); } // Convert the value of the argument register into the value that's @@ -976,9 +973,8 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, &SystemZ::FP64BitRegClass); SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64); MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN, - MachinePointerInfo::getFixedStack(FI), + MachinePointerInfo::getFixedStack(MF, FI), false, false, 0); - } // Join the stores, which are independent of one another. Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, @@ -1060,9 +1056,9 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Store the argument in a stack slot and pass its address. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); - MemOpChains.push_back(DAG.getStore(Chain, DL, ArgValue, SpillSlot, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + MemOpChains.push_back(DAG.getStore( + Chain, DL, ArgValue, SpillSlot, + MachinePointerInfo::getFixedStack(MF, FI), false, false, 0)); ArgValue = SpillSlot; } else ArgValue = convertValVTToLocVT(DAG, DL, VA, ArgValue); @@ -1607,8 +1603,8 @@ static void adjustSubwordCmp(SelectionDAG &DAG, SDLoc DL, Comparison &C) { } else if (Load->getExtensionType() == ISD::ZEXTLOAD) { if (Value > Mask) return; - assert(C.ICmpType == SystemZICMP::Any && - "Signedness shouldn't matter here."); + // If the constant is in range, we can use any comparison. + C.ICmpType = SystemZICMP::Any; } else return; @@ -2439,7 +2435,8 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node, Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT); Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result); Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } // If there was a non-zero offset that we didn't fold, create an explicit @@ -2499,7 +2496,9 @@ SDValue SystemZTargetLowering::lowerTLSGetOffset(GlobalAddressSDNode *Node, } SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, - SelectionDAG &DAG) const { + SelectionDAG &DAG) const { + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(Node, DAG); SDLoc DL(Node); const GlobalValue *GV = Node->getGlobal(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); @@ -2529,9 +2528,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD); Offset = DAG.getConstantPool(CPV, PtrVT, 8); - Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - Offset, MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); // Call __tls_get_offset to retrieve the offset. 
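On the adjustSubwordCmp change above (the assert replaced by C.ICmpType = SystemZICMP::Any): a zero-extending subword load always produces a value in [0, Mask], and once the compared constant is known to lie in the same range, signed and unsigned comparisons give identical answers, so any compare type may be used. The following standalone C++ check verifies that claim exhaustively for 8-bit values; it is illustrative only and not part of the patch.

  // Every zero-extended byte compared against every in-range constant:
  // signed and unsigned "less than" always agree.
  #include <cassert>
  #include <cstdint>

  int main() {
    for (unsigned v = 0; v <= 0xff; ++v) {        // zero-extended byte values
      for (unsigned c = 0; c <= 0xff; ++c) {      // in-range constants
        bool lt_unsigned = (uint32_t)v < (uint32_t)c;
        bool lt_signed   = (int32_t)v  < (int32_t)c;
        assert(lt_unsigned == lt_signed);
      }
    }
    return 0;
  }
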
Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_GDCALL, Offset); @@ -2544,9 +2544,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM); Offset = DAG.getConstantPool(CPV, PtrVT, 8); - Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - Offset, MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); // Call __tls_get_offset to retrieve the module base offset. Offset = lowerTLSGetOffset(Node, DAG, SystemZISD::TLS_LDCALL, Offset); @@ -2562,9 +2563,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF); SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8); - DTPOffset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - DTPOffset, MachinePointerInfo::getConstantPool(), - false, false, false, 0); + DTPOffset = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), DTPOffset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); Offset = DAG.getNode(ISD::ADD, DL, PtrVT, Offset, DTPOffset); break; @@ -2575,8 +2577,8 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, Offset = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_INDNTPOFF); Offset = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Offset); - Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - Offset, MachinePointerInfo::getGOT(), + Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Offset, + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); break; } @@ -2587,9 +2589,10 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node, SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF); Offset = DAG.getConstantPool(CPV, PtrVT, 8); - Offset = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), - Offset, MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad( + PtrVT, DL, DAG.getEntryNode(), Offset, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, 0); break; } } @@ -2628,10 +2631,10 @@ SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP, SDValue Result; if (CP->isMachineConstantPoolEntry()) Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, - CP->getAlignment()); + CP->getAlignment()); else Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, - CP->getAlignment(), CP->getOffset()); + CP->getAlignment(), CP->getOffset()); // Use LARL to load the address of the constant pool entry. return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result); @@ -2736,17 +2739,37 @@ SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op, SDValue SystemZTargetLowering:: lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { + const TargetFrameLowering *TFI = Subtarget.getFrameLowering(); + bool RealignOpt = !DAG.getMachineFunction().getFunction()-> + hasFnAttribute("no-realign-stack"); + SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); + SDValue Align = Op.getOperand(2); SDLoc DL(Op); + // If user has set the no alignment function attribute, ignore + // alloca alignments. + uint64_t AlignVal = (RealignOpt ? 
+ dyn_cast<ConstantSDNode>(Align)->getZExtValue() : 0); + + uint64_t StackAlign = TFI->getStackAlignment(); + uint64_t RequiredAlign = std::max(AlignVal, StackAlign); + uint64_t ExtraAlignSpace = RequiredAlign - StackAlign; + unsigned SPReg = getStackPointerRegisterToSaveRestore(); + SDValue NeededSpace = Size; // Get a reference to the stack pointer. SDValue OldSP = DAG.getCopyFromReg(Chain, DL, SPReg, MVT::i64); + // Add extra space for alignment if needed. + if (ExtraAlignSpace) + NeededSpace = DAG.getNode(ISD::ADD, DL, MVT::i64, NeededSpace, + DAG.getConstant(ExtraAlignSpace, DL, MVT::i64)); + // Get the new stack pointer value. - SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, Size); + SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace); // Copy the new stack pointer back. Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP); @@ -2757,6 +2780,16 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { SDValue ArgAdjust = DAG.getNode(SystemZISD::ADJDYNALLOC, DL, MVT::i64); SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust); + // Dynamically realign if needed. + if (RequiredAlign > StackAlign) { + Result = + DAG.getNode(ISD::ADD, DL, MVT::i64, Result, + DAG.getConstant(ExtraAlignSpace, DL, MVT::i64)); + Result = + DAG.getNode(ISD::AND, DL, MVT::i64, Result, + DAG.getConstant(~(RequiredAlign - 1), DL, MVT::i64)); + } + SDValue Ops[2] = { Result, Chain }; return DAG.getMergeValues(Ops, DL); } @@ -2837,7 +2870,7 @@ SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op, } else if (DAG.ComputeNumSignBits(Op1) > 32) { Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1); Opcode = SystemZISD::SDIVREM32; - } else + } else Opcode = SystemZISD::SDIVREM64; // DSG(F) takes a 64-bit dividend, so the even register in the GR128 @@ -3247,8 +3280,8 @@ SystemZTargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op, if (Op->getNumValues() == 1) return CC; assert(Op->getNumValues() == 2 && "Expected a CC and non-CC result"); - return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(), - Glued, CC); + return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), Op->getVTList(), Glued, + CC); } unsigned Id = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); @@ -3890,7 +3923,7 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG, GS.addUndef(); } else { GS.add(SDValue(), ResidueOps.size()); - ResidueOps.push_back(Op); + ResidueOps.push_back(BVN->getOperand(I)); } } @@ -3901,7 +3934,7 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG, // Create the BUILD_VECTOR for the remaining elements, if any. 
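On the lowerDYNAMIC_STACKALLOC realignment above: when an alloca requests more alignment than the stack guarantees, the lowering over-allocates by RequiredAlign - StackAlign extra bytes and then rounds the resulting address down to the requested alignment, so the aligned block always fits inside the area just carved out. Here is a standalone sketch of that arithmetic (plain C++ with invented names such as dynAlloc; OldSP and Size are kept 8-byte multiples, matching the 8-byte stack alignment assumed in the sketch).

  // Over-allocate-and-mask model of the dynamic-alloca realignment.
  #include <cassert>
  #include <cstdint>

  struct Alloc { uint64_t NewSP, Result; };

  // OldSP is assumed to be at least StackAlign-aligned already.
  static Alloc dynAlloc(uint64_t OldSP, uint64_t Size, uint64_t AlignVal,
                        uint64_t StackAlign = 8) {
    uint64_t RequiredAlign = AlignVal > StackAlign ? AlignVal : StackAlign;
    uint64_t ExtraAlignSpace = RequiredAlign - StackAlign;
    uint64_t NewSP = OldSP - (Size + ExtraAlignSpace);   // stack grows down
    uint64_t Result = NewSP;
    if (RequiredAlign > StackAlign)
      Result = (NewSP + ExtraAlignSpace) & ~(RequiredAlign - 1);
    return {NewSP, Result};
  }

  int main() {
    Alloc A = dynAlloc(/*OldSP=*/0x10000, /*Size=*/40, /*AlignVal=*/64);
    assert(A.Result % 64 == 0);          // requested alignment honoured
    assert(A.Result >= A.NewSP);         // block stays inside the new area
    assert(A.Result + 40 <= 0x10000);    // and below the old stack pointer
    return 0;
  }
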
if (!ResidueOps.empty()) { while (ResidueOps.size() < NumElements) - ResidueOps.push_back(DAG.getUNDEF(VT.getVectorElementType())); + ResidueOps.push_back(DAG.getUNDEF(ResidueOps[0].getValueType())); for (auto &Op : GS.Ops) { if (!Op.getNode()) { Op = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BVN), VT, ResidueOps); @@ -4204,7 +4237,7 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG, - unsigned UnpackHigh) const { + unsigned UnpackHigh) const { SDValue PackedOp = Op.getOperand(0); EVT OutVT = Op.getValueType(); EVT InVT = PackedOp.getValueType(); @@ -4566,9 +4599,9 @@ SDValue SystemZTargetLowering::combineExtract(SDLoc DL, EVT ResVT, EVT VecVT, } return Op; } else if ((Opcode == ISD::SIGN_EXTEND_VECTOR_INREG || - Opcode == ISD::ZERO_EXTEND_VECTOR_INREG || - Opcode == ISD::ANY_EXTEND_VECTOR_INREG) && - canTreatAsByteVector(Op.getValueType()) && + Opcode == ISD::ZERO_EXTEND_VECTOR_INREG || + Opcode == ISD::ANY_EXTEND_VECTOR_INREG) && + canTreatAsByteVector(Op.getValueType()) && canTreatAsByteVector(Op.getOperand(0).getValueType())) { // Make sure that only the unextended bits are significant. EVT ExtVT = Op.getValueType(); @@ -4579,14 +4612,14 @@ SDValue SystemZTargetLowering::combineExtract(SDLoc DL, EVT ResVT, EVT VecVT, unsigned SubByte = Byte % ExtBytesPerElement; unsigned MinSubByte = ExtBytesPerElement - OpBytesPerElement; if (SubByte < MinSubByte || - SubByte + BytesPerElement > ExtBytesPerElement) - break; + SubByte + BytesPerElement > ExtBytesPerElement) + break; // Get the byte offset of the unextended element Byte = Byte / ExtBytesPerElement * OpBytesPerElement; // ...then add the byte offset relative to that element. Byte += SubByte - MinSubByte; if (Byte % BytesPerElement != 0) - break; + break; Op = Op.getOperand(0); Index = Byte / BytesPerElement; Force = true; @@ -5611,6 +5644,31 @@ SystemZTargetLowering::emitTransactionBegin(MachineInstr *MI, return MBB; } +MachineBasicBlock * +SystemZTargetLowering::emitLoadAndTestCmp0(MachineInstr *MI, + MachineBasicBlock *MBB, + unsigned Opcode) const { + MachineFunction &MF = *MBB->getParent(); + MachineRegisterInfo *MRI = &MF.getRegInfo(); + const SystemZInstrInfo *TII = + static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); + DebugLoc DL = MI->getDebugLoc(); + + unsigned SrcReg = MI->getOperand(0).getReg(); + + // Create new virtual register of the same class as source. + const TargetRegisterClass *RC = MRI->getRegClass(SrcReg); + unsigned DstReg = MRI->createVirtualRegister(RC); + + // Replace pseudo with a normal load-and-test that models the def as + // well. 
+ BuildMI(*MBB, MI, DL, TII->get(Opcode), DstReg) + .addReg(SrcReg); + MI->eraseFromParent(); + + return MBB; +} + MachineBasicBlock *SystemZTargetLowering:: EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const { switch (MI->getOpcode()) { @@ -5858,6 +5916,13 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const { return emitTransactionBegin(MI, MBB, SystemZ::TBEGIN, true); case SystemZ::TBEGINC: return emitTransactionBegin(MI, MBB, SystemZ::TBEGINC, true); + case SystemZ::LTEBRCompare_VecPseudo: + return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTEBR); + case SystemZ::LTDBRCompare_VecPseudo: + return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTDBR); + case SystemZ::LTXBRCompare_VecPseudo: + return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR); + default: llvm_unreachable("Unexpected instr type to insert"); } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 07ff251..391636e 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -409,6 +409,20 @@ public: return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return SystemZ::R6D; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. + unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return SystemZ::R7D; + } + MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const override; @@ -481,7 +495,7 @@ private: SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG, - unsigned UnpackHigh) const; + unsigned UnpackHigh) const; SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const; SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp, @@ -530,6 +544,10 @@ private: MachineBasicBlock *MBB, unsigned Opcode, bool NoFloat) const; + MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr *MI, + MachineBasicBlock *MBB, + unsigned Opcode) const; + }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h index 464f79a..5a1c874 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h @@ -35,11 +35,9 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) { if (MCID.mayStore()) Flags |= MachineMemOperand::MOStore; int64_t Offset = 0; - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo( - PseudoSourceValue::getFixedStack(FI), Offset), - Flags, MFFrame->getObjectSize(FI), - MFFrame->getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags, + MFFrame->getObjectSize(FI), MFFrame->getObjectAlignment(FI)); return MIB.addFrameIndex(FI).addImm(Offset).addReg(0).addMemOperand(MMO); } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td index 27fbd7d..0cb2672 100644 --- 
a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFP.td @@ -46,15 +46,28 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { defm LTDBR : LoadAndTestRRE<"ltdb", 0xB312, FP64>; defm LTXBR : LoadAndTestRRE<"ltxb", 0xB342, FP128>; } -// Note that the comparison against zero operation is not available if we -// have vector support, since load-and-test instructions will partially -// clobber the target (vector) register. +// Note that LTxBRCompare is not available if we have vector support, +// since load-and-test instructions will partially clobber the target +// (vector) register. let Predicates = [FeatureNoVector] in { defm : CompareZeroFP<LTEBRCompare, FP32>; defm : CompareZeroFP<LTDBRCompare, FP64>; defm : CompareZeroFP<LTXBRCompare, FP128>; } +// Use a normal load-and-test for compare against zero in case of +// vector support (via a pseudo to simplify instruction selection). +let Defs = [CC], usesCustomInserter = 1 in { + def LTEBRCompare_VecPseudo : Pseudo<(outs), (ins FP32:$R1, FP32:$R2), []>; + def LTDBRCompare_VecPseudo : Pseudo<(outs), (ins FP64:$R1, FP64:$R2), []>; + def LTXBRCompare_VecPseudo : Pseudo<(outs), (ins FP128:$R1, FP128:$R2), []>; +} +let Predicates = [FeatureVector] in { + defm : CompareZeroFP<LTEBRCompare_VecPseudo, FP32>; + defm : CompareZeroFP<LTDBRCompare_VecPseudo, FP64>; + defm : CompareZeroFP<LTXBRCompare_VecPseudo, FP128>; +} + // Moves between 64-bit integer and floating-point registers. def LGDR : UnaryRRE<"lgd", 0xB3CD, bitconvert, GR64, FP64>; def LDGR : UnaryRRE<"ldg", 0xB3C1, bitconvert, FP64, GR64>; @@ -238,26 +251,46 @@ let Predicates = [FeatureFPExtension] in { // Unary arithmetic //===----------------------------------------------------------------------===// +// We prefer generic instructions during isel, because they do not +// clobber CC and therefore give the scheduler more freedom. In cases +// the CC is actually useful, the SystemZElimCompare pass will try to +// convert generic instructions into opcodes that also set CC. Note +// that lcdf / lpdf / lndf only affect the sign bit, and can therefore +// be used with fp32 as well. This could be done for fp128, in which +// case the operands would have to be tied. + // Negation (Load Complement). let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { - def LCEBR : UnaryRRE<"lceb", 0xB303, fneg, FP32, FP32>; - def LCDBR : UnaryRRE<"lcdb", 0xB313, fneg, FP64, FP64>; + def LCEBR : UnaryRRE<"lceb", 0xB303, null_frag, FP32, FP32>; + def LCDBR : UnaryRRE<"lcdb", 0xB313, null_frag, FP64, FP64>; def LCXBR : UnaryRRE<"lcxb", 0xB343, fneg, FP128, FP128>; } +// Generic form, which does not set CC. +def LCDFR : UnaryRRE<"lcdf", 0xB373, fneg, FP64, FP64>; +let isCodeGenOnly = 1 in + def LCDFR_32 : UnaryRRE<"lcdf", 0xB373, fneg, FP32, FP32>; // Absolute value (Load Positive). let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { - def LPEBR : UnaryRRE<"lpeb", 0xB300, fabs, FP32, FP32>; - def LPDBR : UnaryRRE<"lpdb", 0xB310, fabs, FP64, FP64>; + def LPEBR : UnaryRRE<"lpeb", 0xB300, null_frag, FP32, FP32>; + def LPDBR : UnaryRRE<"lpdb", 0xB310, null_frag, FP64, FP64>; def LPXBR : UnaryRRE<"lpxb", 0xB340, fabs, FP128, FP128>; } +// Generic form, which does not set CC. +def LPDFR : UnaryRRE<"lpdf", 0xB370, fabs, FP64, FP64>; +let isCodeGenOnly = 1 in + def LPDFR_32 : UnaryRRE<"lpdf", 0xB370, fabs, FP32, FP32>; // Negative absolute value (Load Negative). 
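On the comment above about lcdf / lpdf / lndf only affecting the sign bit: IEEE negation, absolute value and negative absolute value are pure sign-bit operations, which is why the patch can use the 64-bit generic forms even for fp32 values and why those forms need not set CC. A small standalone illustration of that sign-bit view follows (plain C++ bit manipulation, not the SystemZ instructions themselves; the helper names are invented).

  // Negate, abs and negative-abs of an IEEE double touch only bit 63.
  #include <cassert>
  #include <cstdint>
  #include <cstring>

  static uint64_t bits(double d) { uint64_t u; std::memcpy(&u, &d, 8); return u; }
  static double   val(uint64_t u) { double d; std::memcpy(&d, &u, 8); return d; }

  static double fneg (double d) { return val(bits(d) ^ (1ULL << 63)); } // flip sign
  static double fabs_(double d) { return val(bits(d) & ~(1ULL << 63)); } // clear sign
  static double fnabs(double d) { return val(bits(d) | (1ULL << 63)); } // set sign

  int main() {
    assert(fneg(1.5) == -1.5);
    assert(fabs_(-2.25) == 2.25);
    assert(fnabs(3.0) == -3.0);
    return 0;
  }
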
let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in { - def LNEBR : UnaryRRE<"lneb", 0xB301, fnabs, FP32, FP32>; - def LNDBR : UnaryRRE<"lndb", 0xB311, fnabs, FP64, FP64>; + def LNEBR : UnaryRRE<"lneb", 0xB301, null_frag, FP32, FP32>; + def LNDBR : UnaryRRE<"lndb", 0xB311, null_frag, FP64, FP64>; def LNXBR : UnaryRRE<"lnxb", 0xB341, fnabs, FP128, FP128>; } +// Generic form, which does not set CC. +def LNDFR : UnaryRRE<"lndf", 0xB371, fnabs, FP64, FP64>; +let isCodeGenOnly = 1 in + def LNDFR_32 : UnaryRRE<"lndf", 0xB371, fnabs, FP32, FP32>; // Square root. def SQEBR : UnaryRRE<"sqeb", 0xB314, fsqrt, FP32, FP32>; @@ -414,6 +447,6 @@ let Defs = [CC], CCValues = 0xF in { // Peepholes //===----------------------------------------------------------------------===// -def : Pat<(f32 fpimmneg0), (LCEBR (LZER))>; -def : Pat<(f64 fpimmneg0), (LCDBR (LZDR))>; +def : Pat<(f32 fpimmneg0), (LCDFR_32 (LZER))>; +def : Pat<(f64 fpimmneg0), (LCDFR (LZDR))>; def : Pat<(f128 fpimmneg0), (LCXBR (LZXR))>; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td index 71eb998..01f4cde 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrFormats.td @@ -2381,6 +2381,7 @@ multiclass StringRRE<string mnemonic, bits<16> opcode, def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2), (ins GR64:$R1src, GR64:$R2src), mnemonic#"\t$R1, $R2", []> { + let Uses = [R0L]; let Constraints = "$R1 = $R1src, $R2 = $R2src"; let DisableEncoding = "$R1src, $R2src"; } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp index 5d4a34f..e6b5fc8 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -69,6 +69,11 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI, MachineOperand &LowOffsetOp = MI->getOperand(2); LowOffsetOp.setImm(LowOffsetOp.getImm() + 8); + // Clear the kill flags for the base and index registers in the first + // instruction. + EarlierMI->getOperand(1).setIsKill(false); + EarlierMI->getOperand(3).setIsKill(false); + // Set the opcodes. unsigned HighOpcode = getOpcodeForOffset(NewOpcode, HighOffsetOp.getImm()); unsigned LowOpcode = getOpcodeForOffset(NewOpcode, LowOffsetOp.getImm()); @@ -111,7 +116,7 @@ void SystemZInstrInfo::expandRIPseudo(MachineInstr *MI, unsigned LowOpcode, } // MI is a three-operand RIE-style pseudo instruction. Replace it with -// LowOpcode3 if the registers are both low GR32s, otherwise use a move +// LowOpcodeK if the registers are both low GR32s, otherwise use a move // followed by HighOpcode or LowOpcode, depending on whether the target // is a high or low GR32. void SystemZInstrInfo::expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode, @@ -129,6 +134,7 @@ void SystemZInstrInfo::expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode, MI->getOperand(1).isKill()); MI->setDesc(get(DestIsHigh ? 
HighOpcode : LowOpcode)); MI->getOperand(1).setReg(DestReg); + MI->tieOperands(0, 1); } } @@ -486,11 +492,8 @@ SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare, const MachineRegisterInfo *MRI) const { assert(!SrcReg2 && "Only optimizing constant comparisons so far"); bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0; - if (Value == 0 && - !IsLogical && - removeIPMBasedCompare(Compare, SrcReg, MRI, &RI)) - return true; - return false; + return Value == 0 && !IsLogical && + removeIPMBasedCompare(Compare, SrcReg, MRI, &RI); } // If Opcode is a move that has a conditional variant, return that variant, @@ -505,16 +508,13 @@ static unsigned getConditionalMove(unsigned Opcode) { bool SystemZInstrInfo::isPredicable(MachineInstr *MI) const { unsigned Opcode = MI->getOpcode(); - if (STI.hasLoadStoreOnCond() && - getConditionalMove(Opcode)) - return true; - return false; + return STI.hasLoadStoreOnCond() && getConditionalMove(Opcode); } bool SystemZInstrInfo:: isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const { + BranchProbability Probability) const { // For now only convert single instructions. return NumCycles == 1; } @@ -524,7 +524,7 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumCyclesT, unsigned ExtraPredCyclesT, MachineBasicBlock &FMBB, unsigned NumCyclesF, unsigned ExtraPredCyclesF, - const BranchProbability &Probability) const { + BranchProbability Probability) const { // For now avoid converting mutually-exclusive cases. return false; } @@ -548,11 +548,10 @@ PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const { return false; } -void -SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { +void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { // Split 128-bit GPR moves into two 64-bit moves. This handles ADDR128 too. if (SystemZ::GR128BitRegClass.contains(DestReg, SrcReg)) { copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_h64), @@ -590,13 +589,10 @@ SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(KillSrc)); } -void -SystemZInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned SrcReg, bool isKill, - int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { +void SystemZInstrInfo::storeRegToStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg, + bool isKill, int FrameIdx, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { DebugLoc DL = MBBI != MBB.end() ? 
MBBI->getDebugLoc() : DebugLoc(); // Callers may expect a single instruction, so keep 128-bit moves @@ -604,15 +600,14 @@ SystemZInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, unsigned LoadOpcode, StoreOpcode; getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode); addFrameReference(BuildMI(MBB, MBBI, DL, get(StoreOpcode)) - .addReg(SrcReg, getKillRegState(isKill)), FrameIdx); + .addReg(SrcReg, getKillRegState(isKill)), + FrameIdx); } -void -SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned DestReg, int FrameIdx, - const TargetRegisterClass *RC, - const TargetRegisterInfo *TRI) const { +void SystemZInstrInfo::loadRegFromStackSlot( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg, + int FrameIdx, const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc(); // Callers may expect a single instruction, so keep 128-bit moves @@ -681,7 +676,8 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, LiveVariables *LV) const { MachineInstr *MI = MBBI; MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned Opcode = MI->getOpcode(); unsigned NumOps = MI->getNumOperands(); @@ -708,14 +704,19 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, } int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode); if (ThreeOperandOpcode >= 0) { - MachineInstrBuilder MIB = - BuildMI(*MBB, MBBI, MI->getDebugLoc(), get(ThreeOperandOpcode)) - .addOperand(Dest); + // Create three address instruction without adding the implicit + // operands. Those will instead be copied over from the original + // instruction by the loop below. + MachineInstrBuilder MIB(*MF, + MF->CreateMachineInstr(get(ThreeOperandOpcode), + MI->getDebugLoc(), /*NoImplicit=*/true)); + MIB.addOperand(Dest); // Keep the kill state, but drop the tied flag. MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg()); // Keep the remaining operands as-is. for (unsigned I = 2; I < NumOps; ++I) MIB.addOperand(MI->getOperand(I)); + MBB->insert(MI, MIB); return finishConvertToThreeAddress(MI, MIB, LV); } } @@ -1191,6 +1192,12 @@ unsigned SystemZInstrInfo::getLoadAndTest(unsigned Opcode) const { case SystemZ::LER: return SystemZ::LTEBR; case SystemZ::LDR: return SystemZ::LTDBR; case SystemZ::LXR: return SystemZ::LTXBR; + case SystemZ::LCDFR: return SystemZ::LCDBR; + case SystemZ::LPDFR: return SystemZ::LPDBR; + case SystemZ::LNDFR: return SystemZ::LNDBR; + case SystemZ::LCDFR_32: return SystemZ::LCEBR; + case SystemZ::LPDFR_32: return SystemZ::LPEBR; + case SystemZ::LNDFR_32: return SystemZ::LNEBR; // On zEC12 we prefer to use RISBGN. But if there is a chance to // actually use the condition code, we may turn it back into RISGB. 
// Note that RISBG is not really a "load-and-test" instruction, diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h index 31c9db2..d9094ba 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.h @@ -159,12 +159,12 @@ public: bool isPredicable(MachineInstr *MI) const override; bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles, unsigned ExtraPredCycles, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumCyclesT, unsigned ExtraPredCyclesT, MachineBasicBlock &FMBB, unsigned NumCyclesF, unsigned ExtraPredCyclesF, - const BranchProbability &Probability) const override; + BranchProbability Probability) const override; bool PredicateInstruction(MachineInstr *MI, ArrayRef<MachineOperand> Pred) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index 820f30b..b9f2eb5 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -397,7 +397,7 @@ let mayLoad = 1, mayStore = 1 in defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>; // String moves. -let mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0L] in +let mayLoad = 1, mayStore = 1, Defs = [CC] in defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>; //===----------------------------------------------------------------------===// @@ -424,7 +424,7 @@ let hasSideEffects = 0 in { def LGFR : UnaryRRE<"lgf", 0xB914, sext32, GR64, GR32>; } let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in - def LTGFR : UnaryRRE<"ltgf", 0xB912, null_frag, GR64, GR64>; + def LTGFR : UnaryRRE<"ltgf", 0xB912, null_frag, GR64, GR32>; // Match 32-to-64-bit sign extensions in which the source is already // in a 64-bit register. @@ -490,7 +490,7 @@ def : Pat<(and GR64:$src, 0xffffffff), def LLCMux : UnaryRXYPseudo<"llc", azextloadi8, GRX32, 1>, Requires<[FeatureHighWord]>; def LLC : UnaryRXY<"llc", 0xE394, azextloadi8, GR32, 1>; -def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GR32, 1>, +def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GRH32, 1>, Requires<[FeatureHighWord]>; // 32-bit extensions from 16-bit memory. LLHMux expands to LLH or LLHH, @@ -498,7 +498,7 @@ def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GR32, 1>, def LLHMux : UnaryRXYPseudo<"llh", azextloadi16, GRX32, 2>, Requires<[FeatureHighWord]>; def LLH : UnaryRXY<"llh", 0xE395, azextloadi16, GR32, 2>; -def LLHH : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GR32, 2>, +def LLHH : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GRH32, 2>, Requires<[FeatureHighWord]>; def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_azextloadi16, GR32>; @@ -1147,7 +1147,7 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in { def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>, Requires<[FeatureHighWord]>; def CLFI : CompareRIL<"clfi", 0xC2F, z_ucmp, GR32, uimm32>; - def CLIH : CompareRIL<"clih", 0xCCF, z_ucmp, GR32, uimm32>, + def CLIH : CompareRIL<"clih", 0xCCF, z_ucmp, GRH32, uimm32>, Requires<[FeatureHighWord]>; def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>; @@ -1185,7 +1185,7 @@ let mayLoad = 1, Defs = [CC] in defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>; // String comparison. 
-let mayLoad = 1, Defs = [CC], Uses = [R0L] in +let mayLoad = 1, Defs = [CC] in defm CLST : StringRRE<"clst", 0xB25D, z_strcmp>; // Test under mask. @@ -1459,9 +1459,29 @@ let usesCustomInserter = 1 in { } // Search a block of memory for a character. -let mayLoad = 1, Defs = [CC], Uses = [R0L] in +let mayLoad = 1, Defs = [CC] in defm SRST : StringRRE<"srst", 0xb25e, z_search_string>; +// Other instructions for inline assembly +let hasSideEffects = 1, Defs = [CC], mayStore = 1 in + def STCK : InstS<0xB205, (outs), (ins bdaddr12only:$BD2), + "stck\t$BD2", + []>; +let hasSideEffects = 1, Defs = [CC], mayStore = 1 in + def STCKF : InstS<0xB27C, (outs), (ins bdaddr12only:$BD2), + "stckf\t$BD2", + []>; +let hasSideEffects = 1, Defs = [CC], mayStore = 1 in + def STCKE : InstS<0xB278, (outs), (ins bdaddr12only:$BD2), + "stcke\t$BD2", + []>; +let hasSideEffects = 1, Defs = [CC], mayStore = 1 in + def STFLE : InstS<0xB2B0, (outs), (ins bdaddr12only:$BD2), + "stfle\t$BD2", + []>; + + + //===----------------------------------------------------------------------===// // Peepholes. //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp index 00572d0..1a7c0d7 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//== SystemZMachineFuctionInfo.cpp - SystemZ machine function info-*- C++ -*-=// +//=== SystemZMachineFunctionInfo.cpp - SystemZ machine function info ------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h index 34fc36d..f4a517b 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h @@ -1,4 +1,4 @@ -//==- SystemZMachineFuctionInfo.h - SystemZ machine function info -*- C++ -*-=// +//=== SystemZMachineFunctionInfo.h - SystemZ machine function info -*- C++ -*-// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp index dc7bd25..6fd24e3 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp @@ -69,8 +69,8 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // Decompose the frame index into a base and offset. int FrameIndex = MI->getOperand(FIOperandNum).getIndex(); - unsigned BasePtr = getFrameRegister(MF); - int64_t Offset = (TFI->getFrameIndexOffset(MF, FrameIndex) + + unsigned BasePtr; + int64_t Offset = (TFI->getFrameIndexReference(MF, FrameIndex, BasePtr) + MI->getOperand(FIOperandNum + 1).getImm()); // Special handling of dbg_value instructions. diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td index 85aa0a6..0d8b08b 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/contrib/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -282,4 +282,5 @@ def v128any : TypedReg<untyped, VR128>; // The 2-bit condition code field of the PSW. Every register named in an // inline asm needs a class associated with it. 
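On the STCK/STCKF/STCKE/STFLE definitions added above for inline assembly: a plausible use from C is sketched below, assuming GCC/Clang-style extended asm on s390x and the usual SystemZ "Q" constraint for a base-plus-short-displacement memory operand. The snippet is illustrative, only builds for s390x targets, and is not part of the patch.

  // Read the TOD clock via STCKF from inline assembly.
  #include <cstdint>
  #include <cstdio>

  static uint64_t read_tod_clock() {
    uint64_t tod;
    // STCKF stores the 8-byte TOD-clock value and sets the condition code,
    // hence the "cc" clobber.
    asm volatile("stckf %0" : "=Q"(tod) : : "cc");
    return tod;
  }

  int main() {
    std::printf("TOD clock: %llu\n",
                static_cast<unsigned long long>(read_tod_clock()));
    return 0;
  }
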
def CC : SystemZReg<"cc">; -def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>; +let isAllocatable = 0 in + def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>; diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp index d1a17c5..846edd5 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -16,6 +16,8 @@ #include "SystemZTargetMachine.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/LivePhysRegs.h" +#include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; @@ -35,19 +37,16 @@ public: bool runOnMachineFunction(MachineFunction &F) override; private: - bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther, - unsigned LLIxL, unsigned LLIxH); + bool shortenIIF(MachineInstr &MI, unsigned LLIxL, unsigned LLIxH); bool shortenOn0(MachineInstr &MI, unsigned Opcode); bool shortenOn01(MachineInstr &MI, unsigned Opcode); bool shortenOn001(MachineInstr &MI, unsigned Opcode); + bool shortenOn001AddCC(MachineInstr &MI, unsigned Opcode); bool shortenFPConv(MachineInstr &MI, unsigned Opcode); const SystemZInstrInfo *TII; - - // LowGPRs[I] has bit N set if LLVM register I includes the low - // word of GPR N. HighGPRs is the same for the high word. - unsigned LowGPRs[SystemZ::NUM_TARGET_REGS]; - unsigned HighGPRs[SystemZ::NUM_TARGET_REGS]; + const TargetRegisterInfo *TRI; + LivePhysRegs LiveRegs; }; char SystemZShortenInst::ID = 0; @@ -58,33 +57,31 @@ FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) { } SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm) - : MachineFunctionPass(ID), TII(nullptr), LowGPRs(), HighGPRs() { - // Set up LowGPRs and HighGPRs. - for (unsigned I = 0; I < 16; ++I) { - LowGPRs[SystemZMC::GR32Regs[I]] |= 1 << I; - LowGPRs[SystemZMC::GR64Regs[I]] |= 1 << I; - HighGPRs[SystemZMC::GRH32Regs[I]] |= 1 << I; - HighGPRs[SystemZMC::GR64Regs[I]] |= 1 << I; - if (unsigned GR128 = SystemZMC::GR128Regs[I]) { - LowGPRs[GR128] |= 3 << I; - HighGPRs[GR128] |= 3 << I; - } - } + : MachineFunctionPass(ID), TII(nullptr) {} + +// Tie operands if MI has become a two-address instruction. +static void tieOpsIfNeeded(MachineInstr &MI) { + if (MI.getDesc().getOperandConstraint(0, MCOI::TIED_TO) && + !MI.getOperand(0).isTied()) + MI.tieOperands(0, 1); } // MI loads one word of a GPR using an IIxF instruction and LLIxL and LLIxH // are the halfword immediate loads for the same word. Try to use one of them -// instead of IIxF. If MI loads the high word, GPRMap[X] is the set of high -// words referenced by LLVM register X while LiveOther is the mask of low -// words that are currently live, and vice versa. -bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned *GPRMap, - unsigned LiveOther, unsigned LLIxL, - unsigned LLIxH) { +// instead of IIxF. +bool SystemZShortenInst::shortenIIF(MachineInstr &MI, + unsigned LLIxL, unsigned LLIxH) { unsigned Reg = MI.getOperand(0).getReg(); - assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number"); - unsigned GPRs = GPRMap[Reg]; - assert(GPRs != 0 && "Register must be a GPR"); - if (GPRs & LiveOther) + // The new opcode will clear the other half of the GR64 reg, so + // cancel if that is live. + unsigned thisSubRegIdx = (SystemZ::GRH32BitRegClass.contains(Reg) ? 
+ SystemZ::subreg_h32 : SystemZ::subreg_l32); + unsigned otherSubRegIdx = (thisSubRegIdx == SystemZ::subreg_l32 ? + SystemZ::subreg_h32 : SystemZ::subreg_l32); + unsigned GR64BitReg = TRI->getMatchingSuperReg(Reg, thisSubRegIdx, + &SystemZ::GR64BitRegClass); + unsigned OtherReg = TRI->getSubReg(GR64BitReg, otherSubRegIdx); + if (LiveRegs.contains(OtherReg)) return false; uint64_t Imm = MI.getOperand(1).getImm(); @@ -123,12 +120,26 @@ bool SystemZShortenInst::shortenOn01(MachineInstr &MI, unsigned Opcode) { } // Change MI's opcode to Opcode if register operands 0, 1 and 2 have a -// 4-bit encoding and if operands 0 and 1 are tied. +// 4-bit encoding and if operands 0 and 1 are tied. Also ties op 0 +// with op 1, if MI becomes 2-address. bool SystemZShortenInst::shortenOn001(MachineInstr &MI, unsigned Opcode) { if (SystemZMC::getFirstReg(MI.getOperand(0).getReg()) < 16 && MI.getOperand(1).getReg() == MI.getOperand(0).getReg() && SystemZMC::getFirstReg(MI.getOperand(2).getReg()) < 16) { MI.setDesc(TII->get(Opcode)); + tieOpsIfNeeded(MI); + return true; + } + return false; +} + +// Calls shortenOn001 if CCLive is false. CC def operand is added in +// case of success. +bool SystemZShortenInst::shortenOn001AddCC(MachineInstr &MI, + unsigned Opcode) { + if (!LiveRegs.contains(SystemZ::CC) && shortenOn001(MI, Opcode)) { + MachineInstrBuilder(*MI.getParent()->getParent(), &MI) + .addReg(SystemZ::CC, RegState::ImplicitDefine); return true; } return false; @@ -164,35 +175,24 @@ bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) { bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { bool Changed = false; - // Work out which words are live on exit from the block. - unsigned LiveLow = 0; - unsigned LiveHigh = 0; - for (auto SI = MBB.succ_begin(), SE = MBB.succ_end(); SI != SE; ++SI) { - for (auto LI = (*SI)->livein_begin(), LE = (*SI)->livein_end(); - LI != LE; ++LI) { - unsigned Reg = *LI; - assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number"); - LiveLow |= LowGPRs[Reg]; - LiveHigh |= HighGPRs[Reg]; - } - } + // Set up the set of live registers at the end of MBB (live out) + LiveRegs.clear(); + LiveRegs.addLiveOuts(&MBB); // Iterate backwards through the block looking for instructions to change. 
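On the rewritten shortenIIF above: IILF writes one 32-bit half of a 64-bit GPR, while the halfword-immediate forms LLILL/LLILH (and LLIHL/LLIHH) clear the rest of the 64-bit register, so the shortening is only legal when the other half is dead; the pass now derives that from a LivePhysRegs set seeded with the block's live-outs and stepped backwards one instruction at a time. A simplified standalone model of both pieces follows (plain C++; the register names r2l, r3l, r4l and the tiny Inst structure are invented for the sketch).

  // Simplified model of shortenIIF plus a backward liveness walk.
  #include <cassert>
  #include <cstdint>
  #include <set>
  #include <string>
  #include <vector>

  struct Inst { std::set<std::string> defs, uses; };

  // Backward liveness step, in the spirit of LivePhysRegs::stepBackward().
  static void stepBackward(std::set<std::string> &live, const Inst &I) {
    for (auto &d : I.defs) live.erase(d);
    for (auto &u : I.uses) live.insert(u);
  }

  enum class Shorten { No, LLIxL, LLIxH };

  static Shorten shortenIIF(uint32_t imm, bool otherHalfLive) {
    if (otherHalfLive)
      return Shorten::No;            // would clobber a live 32-bit half
    if ((imm & 0xffff0000u) == 0)
      return Shorten::LLIxL;         // immediate fits in the low halfword
    if ((imm & 0x0000ffffu) == 0)
      return Shorten::LLIxH;         // immediate fits in the high halfword
    return Shorten::No;
  }

  int main() {
    // Walk a two-instruction block backwards from its live-out set.
    std::set<std::string> live = {"r2l"};            // live out of the block
    std::vector<Inst> block = {{{"r2l"}, {"r3l"}},   // r2l defined from r3l
                               {{"r4l"}, {}}};       // dead def of r4l
    for (auto it = block.rbegin(); it != block.rend(); ++it)
      stepBackward(live, *it);
    assert(live.count("r3l") && !live.count("r2l"));

    assert(shortenIIF(0x0000beef, false) == Shorten::LLIxL);
    assert(shortenIIF(0xbeef0000, false) == Shorten::LLIxH);
    assert(shortenIIF(0x0000beef, true)  == Shorten::No);
    return 0;
  }
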
for (auto MBBI = MBB.rbegin(), MBBE = MBB.rend(); MBBI != MBBE; ++MBBI) { MachineInstr &MI = *MBBI; switch (MI.getOpcode()) { case SystemZ::IILF: - Changed |= shortenIIF(MI, LowGPRs, LiveHigh, SystemZ::LLILL, - SystemZ::LLILH); + Changed |= shortenIIF(MI, SystemZ::LLILL, SystemZ::LLILH); break; case SystemZ::IIHF: - Changed |= shortenIIF(MI, HighGPRs, LiveLow, SystemZ::LLIHL, - SystemZ::LLIHH); + Changed |= shortenIIF(MI, SystemZ::LLIHL, SystemZ::LLIHH); break; case SystemZ::WFADB: - Changed |= shortenOn001(MI, SystemZ::ADBR); + Changed |= shortenOn001AddCC(MI, SystemZ::ADBR); break; case SystemZ::WFDDB: @@ -216,15 +216,15 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { break; case SystemZ::WFLCDB: - Changed |= shortenOn01(MI, SystemZ::LCDBR); + Changed |= shortenOn01(MI, SystemZ::LCDFR); break; case SystemZ::WFLNDB: - Changed |= shortenOn01(MI, SystemZ::LNDBR); + Changed |= shortenOn01(MI, SystemZ::LNDFR); break; case SystemZ::WFLPDB: - Changed |= shortenOn01(MI, SystemZ::LPDBR); + Changed |= shortenOn01(MI, SystemZ::LPDFR); break; case SystemZ::WFSQDB: @@ -232,7 +232,7 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { break; case SystemZ::WFSDB: - Changed |= shortenOn001(MI, SystemZ::SDBR); + Changed |= shortenOn001AddCC(MI, SystemZ::SDBR); break; case SystemZ::WFCDB: @@ -257,33 +257,17 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { break; } - unsigned UsedLow = 0; - unsigned UsedHigh = 0; - for (auto MOI = MI.operands_begin(), MOE = MI.operands_end(); - MOI != MOE; ++MOI) { - MachineOperand &MO = *MOI; - if (MO.isReg()) { - if (unsigned Reg = MO.getReg()) { - assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number"); - if (MO.isDef()) { - LiveLow &= ~LowGPRs[Reg]; - LiveHigh &= ~HighGPRs[Reg]; - } else if (!MO.isUndef()) { - UsedLow |= LowGPRs[Reg]; - UsedHigh |= HighGPRs[Reg]; - } - } - } - } - LiveLow |= UsedLow; - LiveHigh |= UsedHigh; + LiveRegs.stepBackward(MI); } return Changed; } bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) { - TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo()); + const SystemZSubtarget &ST = F.getSubtarget<SystemZSubtarget>(); + TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + LiveRegs.init(TRI); bool Changed = false; for (auto &MBB : F) diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp index 00cbbd1..f305e85 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -16,6 +16,7 @@ using namespace llvm; +extern cl::opt<bool> MISchedPostRA; extern "C" void LLVMInitializeSystemZTarget() { // Register the target. RegisterTargetMachine<SystemZTargetMachine> X(TheSystemZTarget); @@ -32,7 +33,7 @@ static bool UsesVectorABI(StringRef CPU, StringRef FS) { VectorABI = false; SmallVector<StringRef, 3> Features; - FS.split(Features, ",", -1, false /* KeepEmpty */); + FS.split(Features, ',', -1, false /* KeepEmpty */); for (auto &Feature : Features) { if (Feature == "vector" || Feature == "+vector") VectorABI = true; @@ -130,6 +131,13 @@ void SystemZPassConfig::addPreSched2() { } void SystemZPassConfig::addPreEmitPass() { + + // Do instruction shortening before compare elimination because some + // vector instructions will be shortened into opcodes that compare + // elimination recognizes. 
+ if (getOptLevel() != CodeGenOpt::None) + addPass(createSystemZShortenInstPass(getSystemZTargetMachine()), false); + // We eliminate comparisons here rather than earlier because some // transformations can change the set of available CC values and we // generally want those transformations to have priority. This is @@ -155,9 +163,17 @@ void SystemZPassConfig::addPreEmitPass() { // preventing that would be a win or not. if (getOptLevel() != CodeGenOpt::None) addPass(createSystemZElimComparePass(getSystemZTargetMachine()), false); - if (getOptLevel() != CodeGenOpt::None) - addPass(createSystemZShortenInstPass(getSystemZTargetMachine()), false); addPass(createSystemZLongBranchPass(getSystemZTargetMachine())); + + // Do final scheduling after all other optimizations, to get an + // optimal input for the decoder (branch relaxation must happen + // after block placement). + if (getOptLevel() != CodeGenOpt::None) { + if (MISchedPostRA) + addPass(&PostMachineSchedulerID); + else + addPass(&PostRASchedulerID); + } } TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { @@ -165,7 +181,7 @@ TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) { } TargetIRAnalysis SystemZTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(SystemZTTIImpl(this, F)); }); } diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h index 0a81e1f..1a8f1f7 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetMachine.h @@ -43,6 +43,9 @@ public: TargetLoweringObjectFile *getObjFileLowering() const override { return TLOF.get(); } + + bool targetSchedulesPostRAScheduling() const override { return true; }; + }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 5a87df1..5ff5b21 100644 --- a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -31,7 +31,7 @@ using namespace llvm; // //===----------------------------------------------------------------------===// -unsigned SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -63,8 +63,8 @@ unsigned SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { return 4 * TTI::TCC_Basic; } -unsigned SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -181,8 +181,8 @@ unsigned SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, return SystemZTTIImpl::getIntImmCost(Imm, Ty); } -unsigned SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, + const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); diff --git a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 4b80973..9ae736d 100644 --- 
a/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -28,7 +28,7 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> { const SystemZTargetLowering *getTLI() const { return TLI; } public: - explicit SystemZTTIImpl(const SystemZTargetMachine *TM, Function &F) + explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -42,12 +42,11 @@ public: /// \name Scalar TTI Implementations /// @{ - unsigned getIntImmCost(const APInt &Imm, Type *Ty); + int getIntImmCost(const APInt &Imm, Type *Ty); - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); diff --git a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp index 19b5e2a..a0b0d8f 100644 --- a/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/contrib/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -43,7 +43,6 @@ using namespace llvm; void TargetLoweringObjectFile::Initialize(MCContext &ctx, const TargetMachine &TM) { Ctx = &ctx; - DL = TM.getDataLayout(); InitMCObjectFileInfo(TM.getTargetTriple(), TM.getRelocationModel(), TM.getCodeModel(), *Ctx); } @@ -107,7 +106,7 @@ MCSymbol *TargetLoweringObjectFile::getSymbolWithGlobalValueBase( assert(!Suffix.empty()); SmallString<60> NameStr; - NameStr += DL->getPrivateGlobalPrefix(); + NameStr += GV->getParent()->getDataLayout().getPrivateGlobalPrefix(); TM.getNameWithPrefix(NameStr, GV, Mang); NameStr.append(Suffix.begin(), Suffix.end()); return Ctx->getOrCreateSymbol(NameStr); @@ -120,7 +119,7 @@ MCSymbol *TargetLoweringObjectFile::getCFIPersonalitySymbol( } void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer, - const TargetMachine &TM, + const DataLayout &, const MCSymbol *Sym) const { } @@ -170,14 +169,13 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // If the initializer for the global contains something that requires a // relocation, then we may have to drop this into a writable data section // even though it is marked const. - switch (C->getRelocationInfo()) { - case Constant::NoRelocation: + if (!C->needsRelocation()) { // If the global is required to have a unique address, it can't be put // into a mergable section: just drop it into the general read-only // section instead. if (!GVar->hasUnnamedAddr()) return SectionKind::getReadOnly(); - + // If initializer is a null-terminated string, put it in a "cstring" // section of the right width. if (ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) { @@ -200,7 +198,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // Otherwise, just drop it into a mergable constant section. If we have // a section for this size, use it, otherwise use the arbitrary sized // mergable section. 
- switch (TM.getDataLayout()->getTypeAllocSize(C->getType())) { + switch (GV->getParent()->getDataLayout().getTypeAllocSize(C->getType())) { case 4: return SectionKind::getMergeableConst4(); case 8: return SectionKind::getMergeableConst8(); case 16: return SectionKind::getMergeableConst16(); @@ -208,20 +206,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, return SectionKind::getReadOnly(); } - case Constant::LocalRelocation: - // In static relocation model, the linker will resolve all addresses, so - // the relocation entries will actually be constants by the time the app - // starts up. However, we can't put this into a mergable section, because - // the linker doesn't take relocations into consideration when it tries to - // merge entries in the section. - if (ReloModel == Reloc::Static) - return SectionKind::getReadOnly(); - - // Otherwise, the dynamic linker needs to fix it up, put it in the - // writable data.rel.local section. - return SectionKind::getReadOnlyWithRelLocal(); - - case Constant::GlobalRelocations: + } else { // In static relocation model, the linker will resolve all addresses, so // the relocation entries will actually be constants by the time the app // starts up. However, we can't put this into a mergable section, because @@ -242,17 +227,11 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV, // globals together onto fewer pages, improving the locality of the dynamic // linker. if (ReloModel == Reloc::Static) - return SectionKind::getDataNoRel(); - - switch (C->getRelocationInfo()) { - case Constant::NoRelocation: - return SectionKind::getDataNoRel(); - case Constant::LocalRelocation: - return SectionKind::getDataRelLocal(); - case Constant::GlobalRelocations: - return SectionKind::getDataRel(); - } - llvm_unreachable("Invalid relocation"); + return SectionKind::getData(); + + if (C->needsRelocation()) + return SectionKind::getData(); + return SectionKind::getData(); } /// This method computes the appropriate section to emit the specified global @@ -273,7 +252,8 @@ TargetLoweringObjectFile::SectionForGlobal(const GlobalValue *GV, MCSection *TargetLoweringObjectFile::getSectionForJumpTable( const Function &F, Mangler &Mang, const TargetMachine &TM) const { - return getSectionForConstant(SectionKind::getReadOnly(), /*C=*/nullptr); + return getSectionForConstant(F.getParent()->getDataLayout(), + SectionKind::getReadOnly(), /*C=*/nullptr); } bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection( @@ -296,9 +276,8 @@ bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection( /// Given a mergable constant with the specified size and relocation /// information, return a section that it should be placed in. 
-MCSection * -TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *TargetLoweringObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { if (Kind.isReadOnly() && ReadOnlySection != nullptr) return ReadOnlySection; @@ -345,7 +324,7 @@ const MCExpr *TargetLoweringObjectFile::getDebugThreadLocalSymbol(const MCSymbol } void TargetLoweringObjectFile::getNameWithPrefix( - SmallVectorImpl<char> &OutName, const GlobalValue *GV, - bool CannotUsePrivateLabel, Mangler &Mang, const TargetMachine &TM) const { - Mang.getNameWithPrefix(OutName, GV, CannotUsePrivateLabel); + SmallVectorImpl<char> &OutName, const GlobalValue *GV, Mangler &Mang, + const TargetMachine &TM) const { + Mang.getNameWithPrefix(OutName, GV, /*CannotUsePrivateLabel=*/false); } diff --git a/contrib/llvm/lib/Target/TargetMachine.cpp b/contrib/llvm/lib/Target/TargetMachine.cpp index 83174c2..850c93c 100644 --- a/contrib/llvm/lib/Target/TargetMachine.cpp +++ b/contrib/llvm/lib/Target/TargetMachine.cpp @@ -150,24 +150,11 @@ void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const { } TargetIRAnalysis TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(F.getParent()->getDataLayout()); }); } -static bool canUsePrivateLabel(const MCAsmInfo &AsmInfo, - const MCSection &Section) { - if (!AsmInfo.isSectionAtomizableBySymbols(Section)) - return true; - - // If it is not dead stripped, it is safe to use private labels. - const MCSectionMachO &SMO = cast<MCSectionMachO>(Section); - if (SMO.hasAttribute(MachO::S_ATTR_NO_DEAD_STRIP)) - return true; - - return false; -} - void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name, const GlobalValue *GV, Mangler &Mang, bool MayAlwaysUsePrivate) const { @@ -177,11 +164,8 @@ void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name, Mang.getNameWithPrefix(Name, GV, false); return; } - SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, *this); const TargetLoweringObjectFile *TLOF = getObjFileLowering(); - const MCSection *TheSection = TLOF->SectionForGlobal(GV, GVKind, Mang, *this); - bool CannotUsePrivateLabel = !canUsePrivateLabel(*AsmInfo, *TheSection); - TLOF->getNameWithPrefix(Name, GV, CannotUsePrivateLabel, Mang, *this); + TLOF->getNameWithPrefix(Name, GV, Mang, *this); } MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV, Mangler &Mang) const { diff --git a/contrib/llvm/lib/Target/TargetMachineC.cpp b/contrib/llvm/lib/Target/TargetMachineC.cpp index 7199235..f82566c 100644 --- a/contrib/llvm/lib/Target/TargetMachineC.cpp +++ b/contrib/llvm/lib/Target/TargetMachineC.cpp @@ -32,17 +32,25 @@ using namespace llvm; -inline TargetMachine *unwrap(LLVMTargetMachineRef P) { - return reinterpret_cast<TargetMachine*>(P); +namespace llvm { +// Friend to the TargetMachine, access legacy API that are made private in C++ +struct C_API_PRIVATE_ACCESS { + static const DataLayout &getDataLayout(const TargetMachine &T) { + return T.getDataLayout(); + } +}; +} + +static TargetMachine *unwrap(LLVMTargetMachineRef P) { + return reinterpret_cast<TargetMachine *>(P); } -inline Target *unwrap(LLVMTargetRef P) { +static Target *unwrap(LLVMTargetRef P) { return reinterpret_cast<Target*>(P); } -inline LLVMTargetMachineRef wrap(const TargetMachine *P) { - return - reinterpret_cast<LLVMTargetMachineRef>(const_cast<TargetMachine*>(P)); +static LLVMTargetMachineRef 
wrap(const TargetMachine *P) { + return reinterpret_cast<LLVMTargetMachineRef>(const_cast<TargetMachine *>(P)); } -inline LLVMTargetRef wrap(const Target * P) { +static LLVMTargetRef wrap(const Target * P) { return reinterpret_cast<LLVMTargetRef>(const_cast<Target*>(P)); } @@ -69,16 +77,16 @@ LLVMTargetRef LLVMGetTargetFromName(const char *Name) { LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T, char **ErrorMessage) { std::string Error; - + *T = wrap(TargetRegistry::lookupTarget(TripleStr, Error)); - + if (!*T) { if (ErrorMessage) *ErrorMessage = strdup(Error.c_str()); return 1; } - + return 0; } @@ -145,10 +153,7 @@ LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T, CM, OL)); } - -void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { - delete unwrap(T); -} +void LLVMDisposeTargetMachine(LLVMTargetMachineRef T) { delete unwrap(T); } LLVMTargetRef LLVMGetTargetMachineTarget(LLVMTargetMachineRef T) { const Target* target = &(unwrap(T)->getTarget()); @@ -170,8 +175,9 @@ char* LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T) { return strdup(StringRep.c_str()); } +/** Deprecated: use LLVMGetDataLayout(LLVMModuleRef M) instead. */ LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) { - return wrap(unwrap(T)->getDataLayout()); + return wrap(&C_API_PRIVATE_ACCESS::getDataLayout(*unwrap(T))); } void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T, @@ -190,14 +196,7 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M, std::string error; - const DataLayout *td = TM->getDataLayout(); - - if (!td) { - error = "No DataLayout in TargetMachine"; - *ErrorMessage = strdup(error.c_str()); - return true; - } - Mod->setDataLayout(*td); + Mod->setDataLayout(TM->createDataLayout()); TargetMachine::CodeGenFileType ft; switch (codegen) { @@ -239,7 +238,6 @@ LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T, SmallString<0> CodeString; raw_svector_ostream OStream(CodeString); bool Result = LLVMTargetMachineEmit(T, M, OStream, codegen, ErrorMessage); - OStream.flush(); StringRef Data = OStream.str(); *OutMemBuf = diff --git a/contrib/llvm/lib/Target/TargetRecip.cpp b/contrib/llvm/lib/Target/TargetRecip.cpp index 42bc487..d41b643 100644 --- a/contrib/llvm/lib/Target/TargetRecip.cpp +++ b/contrib/llvm/lib/Target/TargetRecip.cpp @@ -26,7 +26,7 @@ using namespace llvm; // the key strings for queries and command-line inputs. // In addition, the command-line interface recognizes the global parameters // "all", "none", and "default". 
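The comment above summarizes how TargetRecip keys its map: each option is one of the RecipOps strings (or the globals "all", "none", "default"), optionally followed by ":<refinement steps>", which parseRefinementStep later in this file splits on ':'. Below is a stand-alone sketch of that key/steps split, assuming only the standard library; parseRecipOption and RecipSetting are illustrative names, not part of the patch.

// Hypothetical stand-alone version of the "op[:steps]" split performed by
// parseRefinementStep; std::string is used instead of llvm::StringRef.
#include <cstdint>
#include <iostream>
#include <string>

struct RecipSetting {
  std::string Key;   // e.g. "divf", "vec-sqrtd", "all"
  bool HasSteps = false;
  uint8_t Steps = 0; // refinement steps, if given
};

static bool parseRecipOption(const std::string &In, RecipSetting &Out) {
  const char RefStepToken = ':';
  std::string::size_type Pos = In.find(RefStepToken);
  Out.Key = In.substr(0, Pos);
  if (Pos == std::string::npos)
    return true; // no ":steps" suffix, keep defaults
  std::string StepStr = In.substr(Pos + 1);
  if (StepStr.empty() ||
      StepStr.find_first_not_of("0123456789") != std::string::npos)
    return false; // malformed step count
  Out.HasSteps = true;
  Out.Steps = static_cast<uint8_t>(std::stoul(StepStr));
  return true;
}

int main() {
  RecipSetting S;
  if (parseRecipOption("divf:2", S))
    std::cout << S.Key << " -> " << unsigned(S.Steps) << " steps\n";
  return 0;
}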
-static const char *RecipOps[] = { +static const char *const RecipOps[] = { "divd", "divf", "vec-divd", @@ -46,7 +46,7 @@ TargetRecip::TargetRecip() { RecipMap.insert(std::make_pair(RecipOps[i], RecipParams())); } -static bool parseRefinementStep(const StringRef &In, size_t &Position, +static bool parseRefinementStep(StringRef In, size_t &Position, uint8_t &Value) { const char RefStepToken = ':'; Position = In.find(RefStepToken); @@ -175,7 +175,7 @@ TargetRecip::TargetRecip(const std::vector<std::string> &Args) : parseIndividualParams(Args); } -bool TargetRecip::isEnabled(const StringRef &Key) const { +bool TargetRecip::isEnabled(StringRef Key) const { ConstRecipIter Iter = RecipMap.find(Key); assert(Iter != RecipMap.end() && "Unknown name for reciprocal map"); assert(Iter->second.Enabled != Uninitialized && @@ -183,7 +183,7 @@ bool TargetRecip::isEnabled(const StringRef &Key) const { return Iter->second.Enabled; } -unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const { +unsigned TargetRecip::getRefinementSteps(StringRef Key) const { ConstRecipIter Iter = RecipMap.find(Key); assert(Iter != RecipMap.end() && "Unknown name for reciprocal map"); assert(Iter->second.RefinementSteps != Uninitialized && @@ -192,7 +192,7 @@ unsigned TargetRecip::getRefinementSteps(const StringRef &Key) const { } /// Custom settings (previously initialized values) override target defaults. -void TargetRecip::setDefaults(const StringRef &Key, bool Enable, +void TargetRecip::setDefaults(StringRef Key, bool Enable, unsigned RefSteps) { if (Key == "all") { for (auto &KV : RecipMap) { @@ -213,7 +213,7 @@ void TargetRecip::setDefaults(const StringRef &Key, bool Enable, bool TargetRecip::operator==(const TargetRecip &Other) const { for (const auto &KV : RecipMap) { - const StringRef &Op = KV.first; + StringRef Op = KV.first; const RecipParams &RP = KV.second; const RecipParams &OtherRP = Other.RecipMap.find(Op)->second; if (RP.RefinementSteps != OtherRP.RefinementSteps) diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp index fbb985a..7ce3a00 100644 --- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp @@ -13,7 +13,9 @@ //===----------------------------------------------------------------------===// #include "InstPrinter/WebAssemblyInstPrinter.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "WebAssembly.h" +#include "WebAssemblyMachineFunctionInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" @@ -21,11 +23,13 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" -#include <cctype> +#include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; #define DEBUG_TYPE "asm-printer" +#include "WebAssemblyGenAsmWriter.inc" + WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) @@ -33,11 +37,93 @@ WebAssemblyInstPrinter::WebAssemblyInstPrinter(const MCAsmInfo &MAI, void WebAssemblyInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - llvm_unreachable("TODO: implement printRegName"); + assert(RegNo != WebAssemblyFunctionInfo::UnusedReg); + // Note that there's an implicit get_local/set_local here! 
+ OS << "$" << RegNo; } void WebAssemblyInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, - const MCSubtargetInfo &STI) { - llvm_unreachable("TODO: implement printInst"); + const MCSubtargetInfo & /*STI*/) { + // Print the instruction (this uses the AsmStrings from the .td files). + printInstruction(MI, OS); + + // Print any additional variadic operands. + const MCInstrDesc &Desc = MII.get(MI->getOpcode()); + if (Desc.isVariadic()) + for (auto i = Desc.getNumOperands(), e = MI->getNumOperands(); i < e; ++i) { + if (i != 0) + OS << ", "; + printOperand(MI, i, OS); + } + + // Print any added annotation. + printAnnotation(OS, Annot); +} + +static std::string toString(const APFloat &FP) { + static const size_t BufBytes = 128; + char buf[BufBytes]; + if (FP.isNaN()) + assert((FP.bitwiseIsEqual(APFloat::getQNaN(FP.getSemantics())) || + FP.bitwiseIsEqual( + APFloat::getQNaN(FP.getSemantics(), /*Negative=*/true))) && + "convertToHexString handles neither SNaN nor NaN payloads"); + // Use C99's hexadecimal floating-point representation. + auto Written = FP.convertToHexString( + buf, /*hexDigits=*/0, /*upperCase=*/false, APFloat::rmNearestTiesToEven); + (void)Written; + assert(Written != 0); + assert(Written < BufBytes); + return buf; +} + +void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + unsigned WAReg = Op.getReg(); + if (int(WAReg) >= 0) + printRegName(O, WAReg); + else if (OpNo >= MII.get(MI->getOpcode()).getNumDefs()) + O << "$pop" << (WAReg & INT32_MAX); + else if (WAReg != WebAssemblyFunctionInfo::UnusedReg) + O << "$push" << (WAReg & INT32_MAX); + else + O << "$discard"; + // Add a '=' suffix if this is a def. + if (OpNo < MII.get(MI->getOpcode()).getNumDefs()) + O << '='; + } else if (Op.isImm()) { + switch (MI->getOpcode()) { + case WebAssembly::PARAM: + case WebAssembly::RESULT: + case WebAssembly::LOCAL: + O << WebAssembly::TypeToString(MVT::SimpleValueType(Op.getImm())); + break; + default: + O << Op.getImm(); + break; + } + } else if (Op.isFPImm()) + O << toString(APFloat(Op.getFPImm())); + else { + assert(Op.isExpr() && "unknown operand kind in printOperand"); + Op.getExpr()->print(O, &MAI); + } +} + +const char *llvm::WebAssembly::TypeToString(MVT Ty) { + switch (Ty.SimpleTy) { + case MVT::i32: + return "i32"; + case MVT::i64: + return "i64"; + case MVT::f32: + return "f32"; + case MVT::f64: + return "f64"; + default: + llvm_unreachable("unsupported type"); + } } diff --git a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h index 70fcef2..39a16f5 100644 --- a/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h +++ b/contrib/llvm/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h @@ -16,14 +16,13 @@ #define LLVM_LIB_TARGET_WEBASSEMBLY_INSTPRINTER_WEBASSEMBLYINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/CodeGen/MachineValueType.h" namespace llvm { -class MCOperand; class MCSubtargetInfo; -class WebAssemblyInstPrinter : public MCInstPrinter { +class WebAssemblyInstPrinter final : public MCInstPrinter { public: WebAssemblyInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI); @@ -31,8 +30,21 @@ public: void printRegName(raw_ostream &OS, unsigned RegNo) const override; void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot, 
const MCSubtargetInfo &STI) override; + + // Used by tblegen code. + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + + // Autogenerated by tblgen. + void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); }; +namespace WebAssembly { + +const char *TypeToString(MVT Ty); + +} // end namespace WebAssembly + } // end namespace llvm #endif diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp new file mode 100644 index 0000000..b158ccb --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp @@ -0,0 +1,103 @@ +//===-- WebAssemblyAsmBackend.cpp - WebAssembly Assembler Backend ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements the WebAssemblyAsmBackend class. +/// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCDirectives.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +namespace { +class WebAssemblyAsmBackend final : public MCAsmBackend { + bool Is64Bit; + +public: + explicit WebAssemblyAsmBackend(bool Is64Bit) + : MCAsmBackend(), Is64Bit(Is64Bit) {} + ~WebAssemblyAsmBackend() override {} + + void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value, bool IsPCRel) const override; + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; + + // No instruction requires relaxation + bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCRelaxableFragment *DF, + const MCAsmLayout &Layout) const override { + return false; + } + + unsigned getNumFixupKinds() const override { + // We currently just use the generic fixups in MCFixup.h and don't have any + // target-specific fixups. + return 0; + } + + bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } + + void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {} + + bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; +}; + +bool WebAssemblyAsmBackend::writeNopData(uint64_t Count, + MCObjectWriter *OW) const { + if (Count == 0) + return true; + + // FIXME: Do something. + return false; +} + +void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value, + bool IsPCRel) const { + const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind()); + unsigned NumBytes = RoundUpToAlignment(Info.TargetSize, 8); + if (!Value) + return; // Doesn't change encoding. + + // Shift the value into position. + Value <<= Info.TargetOffset; + + unsigned Offset = Fixup.getOffset(); + assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + + // For each byte of the fragment that the fixup touches, mask in the + // bits from the fixup value. 
+ for (unsigned i = 0; i != NumBytes; ++i) + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); +} + +MCObjectWriter * +WebAssemblyAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const { + return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0); +} +} // end anonymous namespace + +MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Target &T, + const MCRegisterInfo &MRI, + const Triple &TT, + StringRef CPU) { + return new WebAssemblyAsmBackend(TT.isArch64Bit()); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp new file mode 100644 index 0000000..c47a3d9 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyELFObjectWriter.cpp @@ -0,0 +1,54 @@ +//===-- WebAssemblyELFObjectWriter.cpp - WebAssembly ELF Writer -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file handles ELF-specific object emission, converting LLVM's +/// internal fixups into the appropriate relocations. +/// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/Support/ErrorHandling.h" +using namespace llvm; + +namespace { +class WebAssemblyELFObjectWriter final : public MCELFObjectTargetWriter { +public: + WebAssemblyELFObjectWriter(bool Is64Bit, uint8_t OSABI); + +protected: + unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, + bool IsPCRel) const override; +}; +} // end anonymous namespace + +// FIXME: Use EM_NONE as a temporary hack. Should we decide to pursue ELF +// writing seriously, we should email generic-abi@googlegroups.com and ask +// for our own ELF code. +WebAssemblyELFObjectWriter::WebAssemblyELFObjectWriter(bool Is64Bit, + uint8_t OSABI) + : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_NONE, + /*HasRelocationAddend=*/true) {} + +unsigned WebAssemblyELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + // FIXME: Do we need our own relocs? + return Fixup.getKind(); +} + +MCObjectWriter *llvm::createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, + uint8_t OSABI) { + MCELFObjectTargetWriter *MOTW = + new WebAssemblyELFObjectWriter(Is64Bit, OSABI); + return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp index 55346f7..d261779 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp @@ -23,7 +23,7 @@ using namespace llvm; WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {} WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) { - PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit(); + PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4; // TODO: What should MaxInstLength be? 
@@ -41,9 +41,6 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) { COMMDirectiveAlignmentIsInBytes = false; LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment; - HasDotTypeDotSizeDirective = false; - HasSingleParameterDotFile = false; - SupportsDebugInformation = true; // For now, WebAssembly does not support exceptions. diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h index d2b8fb7..2dcf2cd 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h @@ -15,13 +15,13 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H #define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H -#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCAsmInfoELF.h" namespace llvm { class Triple; -class WebAssemblyMCAsmInfo final : public MCAsmInfo { +class WebAssemblyMCAsmInfo final : public MCAsmInfoELF { public: explicit WebAssemblyMCAsmInfo(const Triple &T); ~WebAssemblyMCAsmInfo() override; diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp new file mode 100644 index 0000000..7c6c79e --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp @@ -0,0 +1,100 @@ +//=- WebAssemblyMCCodeEmitter.cpp - Convert WebAssembly code to machine code -// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements the WebAssemblyMCCodeEmitter class. +/// +//===----------------------------------------------------------------------===// + +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "mccodeemitter" + +namespace { +class WebAssemblyMCCodeEmitter final : public MCCodeEmitter { + const MCRegisterInfo &MRI; + +public: + WebAssemblyMCCodeEmitter(const MCInstrInfo &, const MCRegisterInfo &mri, + MCContext &) + : MRI(mri) {} + + ~WebAssemblyMCCodeEmitter() override {} + + /// TableGen'erated function for getting the binary encoding for an + /// instruction. + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + /// Return binary encoding of operand. If the machine operand requires + /// relocation, record the relocation and return zero. 
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + uint64_t getMemoryOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const; + + void encodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const override; +}; +} // end anonymous namespace + +MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx) { + return new WebAssemblyMCCodeEmitter(MCII, MRI, Ctx); +} + +unsigned WebAssemblyMCCodeEmitter::getMachineOpValue( + const MCInst &MI, const MCOperand &MO, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + if (MO.isReg()) + return MRI.getEncodingValue(MO.getReg()); + if (MO.isImm()) + return static_cast<unsigned>(MO.getImm()); + + assert(MO.isExpr()); + + assert(MO.getExpr()->getKind() == MCExpr::SymbolRef); + + assert(false && "FIXME: not implemented yet"); + + return 0; +} + +void WebAssemblyMCCodeEmitter::encodeInstruction( + const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + assert(false && "FIXME: not implemented yet"); +} + +// Encode WebAssembly Memory Operand +uint64_t +WebAssemblyMCCodeEmitter::getMemoryOpValue(const MCInst &MI, unsigned Op, + SmallVectorImpl<MCFixup> &Fixups, + const MCSubtargetInfo &STI) const { + assert(false && "FIXME: not implemented yet"); + return 0; +} + +#include "WebAssemblyGenMCCodeEmitter.inc" diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp index 224aa77..14cd295 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp @@ -26,25 +26,40 @@ using namespace llvm; #define DEBUG_TYPE "wasm-mc-target-desc" +#define GET_INSTRINFO_MC_DESC +#include "WebAssemblyGenInstrInfo.inc" + #define GET_SUBTARGETINFO_MC_DESC #include "WebAssemblyGenSubtargetInfo.inc" #define GET_REGINFO_MC_DESC #include "WebAssemblyGenRegisterInfo.inc" -static MCAsmInfo *createWebAssemblyMCAsmInfo(const MCRegisterInfo &MRI, +static MCAsmInfo *createWebAssemblyMCAsmInfo(const MCRegisterInfo & /*MRI*/, const Triple &TT) { - MCAsmInfo *MAI = new WebAssemblyMCAsmInfo(TT); - return MAI; + return new WebAssemblyMCAsmInfo(TT); +} + +static MCInstrInfo *createWebAssemblyMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitWebAssemblyMCInstrInfo(X); + return X; +} + +static MCStreamer *createWebAssemblyMCStreamer(const Triple &T, MCContext &Ctx, + MCAsmBackend &MAB, + raw_pwrite_stream &OS, + MCCodeEmitter *Emitter, + bool RelaxAll) { + return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll); } static MCInstPrinter * -createWebAssemblyMCInstPrinter(const Triple &T, unsigned SyntaxVariant, +createWebAssemblyMCInstPrinter(const Triple & /*T*/, unsigned SyntaxVariant, const MCAsmInfo &MAI, const MCInstrInfo &MII, const MCRegisterInfo &MRI) { - if (SyntaxVariant == 0 || SyntaxVariant == 1) - return new WebAssemblyInstPrinter(MAI, MII, MRI); - return nullptr; + assert(SyntaxVariant == 0); + return new WebAssemblyInstPrinter(MAI, MII, MRI); } // Force static initialization. @@ -53,7 +68,19 @@ extern "C" void LLVMInitializeWebAssemblyTargetMC() { // Register the MC asm info. 
RegisterMCAsmInfoFn X(*T, createWebAssemblyMCAsmInfo); + // Register the MC instruction info. + TargetRegistry::RegisterMCInstrInfo(*T, createWebAssemblyMCInstrInfo); + + // Register the object streamer + TargetRegistry::RegisterELFStreamer(*T, createWebAssemblyMCStreamer); + // Register the MCInstPrinter. TargetRegistry::RegisterMCInstPrinter(*T, createWebAssemblyMCInstPrinter); + + // Register the MC code emitter + TargetRegistry::RegisterMCCodeEmitter(*T, createWebAssemblyMCCodeEmitter); + + // Register the ASM Backend + TargetRegistry::RegisterMCAsmBackend(*T, createWebAssemblyAsmBackend); } } diff --git a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index eebf5b7..e78f73e 100644 --- a/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/contrib/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -16,7 +16,6 @@ #define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCTARGETDESC_H #include "llvm/Support/DataTypes.h" -#include <string> namespace llvm { @@ -34,13 +33,21 @@ class StringRef; class Target; class Triple; class raw_ostream; +class raw_pwrite_stream; extern Target TheWebAssemblyTarget32; extern Target TheWebAssemblyTarget64; +MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + MCContext &Ctx); + MCAsmBackend *createWebAssemblyAsmBackend(const Target &T, const MCRegisterInfo &MRI, - StringRef TT, StringRef CPU); + const Triple &TT, StringRef CPU); + +MCObjectWriter *createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, uint8_t OSABI); } // end namespace llvm @@ -50,6 +57,11 @@ MCAsmBackend *createWebAssemblyAsmBackend(const Target &T, #define GET_REGINFO_ENUM #include "WebAssemblyGenRegisterInfo.inc" +// Defines symbolic names for the WebAssembly instructions. +// +#define GET_INSTRINFO_ENUM +#include "WebAssemblyGenInstrInfo.inc" + #define GET_SUBTARGETINFO_ENUM #include "WebAssemblyGenSubtargetInfo.inc" diff --git a/contrib/llvm/lib/Target/WebAssembly/README.txt b/contrib/llvm/lib/Target/WebAssembly/README.txt index 63e02c4..b97ea45 100644 --- a/contrib/llvm/lib/Target/WebAssembly/README.txt +++ b/contrib/llvm/lib/Target/WebAssembly/README.txt @@ -12,6 +12,16 @@ binary encoding of WebAssembly itself: * https://github.com/WebAssembly/design/blob/master/AstSemantics.md * https://github.com/WebAssembly/design/blob/master/BinaryEncoding.md +The backend is built, tested and archived on the following waterfall: + https://build.chromium.org/p/client.wasm.llvm/console + +The backend's bringup is done using the GCC torture test suite first since it +doesn't require C library support. Current known failures are in +known_gcc_test_failures.txt, all other tests should pass. The waterfall will +turn red if not. Once most of these pass, further testing will use LLVM's own +test suite. The tests can be run locally using: + github.com/WebAssembly/experimental/blob/master/buildbot/torture_test.py + Interesting work that remains to be done: * Write a pass to restructurize irreducible control flow. This needs to be done before register allocation to be efficient, because it may duplicate basic @@ -19,8 +29,60 @@ Interesting work that remains to be done: level. Note that LLVM's GPU code has such a pass, but it linearizes control flow (e.g. both sides of branches execute and are masked) which is undesirable for WebAssembly. -* Basic relooper to expose control flow as an AST. 
-* Figure out how to properly use MC for virtual ISAs. This may require some - refactoring of MC. + +//===---------------------------------------------------------------------===// + +set_local instructions have a return value. We should (a) model this, +and (b) write optimizations which take advantage of it. Keep in mind that +many set_local instructions are implicit! + +//===---------------------------------------------------------------------===// + +Br, br_if, and tableswitch instructions can support having a value on the +expression stack across the jump (sometimes). We should (a) model this, and +(b) extend the stackifier to utilize it. + +//===---------------------------------------------------------------------===// + +The min/max operators aren't exactly a<b?a:b because of NaN and negative zero +behavior. The ARM target has the same kind of min/max instructions and has +implemented optimizations for them; we should do similar optimizations for +WebAssembly. + +//===---------------------------------------------------------------------===// + +AArch64 runs SeparateConstOffsetFromGEPPass, followed by EarlyCSE and LICM. +Would these be useful to run for WebAssembly too? Also, it has an option to +run SimplifyCFG after running the AtomicExpand pass. Would this be useful for +us too? + +//===---------------------------------------------------------------------===// + +When is it profitable to set isAsCheapAsAMove on instructions in WebAssembly? + +//===---------------------------------------------------------------------===// + +Register stackification uses the EXPR_STACK physical register to impose +ordering dependencies on instructions with stack operands. This is pessimistic; +we should consider alternate ways to model stack dependencies. + +//===---------------------------------------------------------------------===// + +Lots of things could be done in WebAssemblyTargetTransformInfo.cpp. Similarly, +there are numerous optimization-related hooks that can be overridden in +WebAssemblyTargetLowering. + +//===---------------------------------------------------------------------===// + +Instead of the OptimizeReturned pass, which should consider preserving the +"returned" attribute through to MachineInstrs and extending the StoreResults +pass to do this optimization on calls too. That would also let the +WebAssemblyPeephole pass clean up dead defs for such calls, as it does for +stores. + +//===---------------------------------------------------------------------===// + +Memset/memcpy/memmove should be marked with the "returned" attribute somehow, +even when they are translated through intrinsics. //===---------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/WebAssembly/Relooper.cpp b/contrib/llvm/lib/Target/WebAssembly/Relooper.cpp new file mode 100644 index 0000000..9b718ef --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Relooper.cpp @@ -0,0 +1,984 @@ +//===-- Relooper.cpp - Top-level interface for WebAssembly ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// \brief This implements the Relooper algorithm. This implementation includes +/// optimizations added since the original academic paper [1] was published. +/// +/// [1] Alon Zakai. 2011. Emscripten: an LLVM-to-JavaScript compiler. 
In +/// Proceedings of the ACM international conference companion on Object +/// oriented programming systems languages and applications companion +/// (SPLASH '11). ACM, New York, NY, USA, 301-312. DOI=10.1145/2048147.2048224 +/// http://doi.acm.org/10.1145/2048147.2048224 +/// +//===-------------------------------------------------------------------===// + +#include "Relooper.h" +#include "WebAssembly.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include <cstring> +#include <cstdlib> +#include <functional> +#include <list> +#include <stack> +#include <string> + +#define DEBUG_TYPE "relooper" + +using namespace llvm; +using namespace Relooper; + +static cl::opt<int> RelooperSplittingFactor( + "relooper-splitting-factor", + cl::desc( + "How much to discount code size when deciding whether to split a node"), + cl::init(5)); + +static cl::opt<unsigned> RelooperMultipleSwitchThreshold( + "relooper-multiple-switch-threshold", + cl::desc( + "How many entries to allow in a multiple before we use a switch"), + cl::init(10)); + +static cl::opt<unsigned> RelooperNestingLimit( + "relooper-nesting-limit", + cl::desc( + "How much nesting is acceptable"), + cl::init(20)); + + +namespace { +/// +/// Implements the relooper algorithm for a function's blocks. +/// +/// Implementation details: The Relooper instance has +/// ownership of the blocks and shapes, and frees them when done. +/// +struct RelooperAlgorithm { + std::deque<Block *> Blocks; + std::deque<Shape *> Shapes; + Shape *Root; + bool MinSize; + int BlockIdCounter; + int ShapeIdCounter; + + RelooperAlgorithm(); + ~RelooperAlgorithm(); + + void AddBlock(Block *New, int Id = -1); + + // Calculates the shapes + void Calculate(Block *Entry); + + // Sets us to try to minimize size + void SetMinSize(bool MinSize_) { MinSize = MinSize_; } +}; + +struct RelooperAnalysis final : public FunctionPass { + static char ID; + RelooperAnalysis() : FunctionPass(ID) {} + const char *getPassName() const override { return "relooper"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + bool runOnFunction(Function &F) override; +}; +} + +// RelooperAnalysis + +char RelooperAnalysis::ID = 0; +FunctionPass *llvm::createWebAssemblyRelooper() { + return new RelooperAnalysis(); +} + +bool RelooperAnalysis::runOnFunction(Function &F) { + DEBUG(dbgs() << "Relooping function '" << F.getName() << "'\n"); + RelooperAlgorithm R; + // FIXME: remove duplication between relooper's and LLVM's BBs. + std::map<const BasicBlock *, Block *> BB2B; + std::map<const Block *, const BasicBlock *> B2BB; + for (const BasicBlock &BB : F) { + // FIXME: getName is wrong here, Code is meant to represent amount of code. + // FIXME: use BranchVarInit for switch. + Block *B = new Block(BB.getName().str().data(), /*BranchVarInit=*/nullptr); + R.AddBlock(B); + assert(BB2B.find(&BB) == BB2B.end() && "Inserting the same block twice"); + assert(B2BB.find(B) == B2BB.end() && "Inserting the same block twice"); + BB2B[&BB] = B; + B2BB[B] = &BB; + } + for (Block *B : R.Blocks) { + const BasicBlock *BB = B2BB[B]; + for (const BasicBlock *Successor : successors(BB)) + // FIXME: add branch's Condition and Code below. 
+ B->AddBranchTo(BB2B[Successor], /*Condition=*/nullptr, /*Code=*/nullptr); + } + R.Calculate(BB2B[&F.getEntryBlock()]); + return false; // Analysis passes don't modify anything. +} + +// Helpers + +typedef MapVector<Block *, BlockSet> BlockBlockSetMap; +typedef std::list<Block *> BlockList; + +template <class T, class U> +static bool contains(const T &container, const U &contained) { + return container.count(contained); +} + + +// Branch + +Branch::Branch(const char *ConditionInit, const char *CodeInit) + : Ancestor(nullptr), Labeled(true) { + // FIXME: move from char* to LLVM data structures + Condition = ConditionInit ? strdup(ConditionInit) : nullptr; + Code = CodeInit ? strdup(CodeInit) : nullptr; +} + +Branch::~Branch() { + // FIXME: move from char* to LLVM data structures + free(static_cast<void *>(const_cast<char *>(Condition))); + free(static_cast<void *>(const_cast<char *>(Code))); +} + +// Block + +Block::Block(const char *CodeInit, const char *BranchVarInit) + : Parent(nullptr), Id(-1), IsCheckedMultipleEntry(false) { + // FIXME: move from char* to LLVM data structures + Code = strdup(CodeInit); + BranchVar = BranchVarInit ? strdup(BranchVarInit) : nullptr; +} + +Block::~Block() { + // FIXME: move from char* to LLVM data structures + free(static_cast<void *>(const_cast<char *>(Code))); + free(static_cast<void *>(const_cast<char *>(BranchVar))); +} + +void Block::AddBranchTo(Block *Target, const char *Condition, + const char *Code) { + assert(!contains(BranchesOut, Target) && + "cannot add more than one branch to the same target"); + BranchesOut[Target] = make_unique<Branch>(Condition, Code); +} + +// Relooper + +RelooperAlgorithm::RelooperAlgorithm() + : Root(nullptr), MinSize(false), BlockIdCounter(1), + ShapeIdCounter(0) { // block ID 0 is reserved for clearings +} + +RelooperAlgorithm::~RelooperAlgorithm() { + for (auto Curr : Blocks) + delete Curr; + for (auto Curr : Shapes) + delete Curr; +} + +void RelooperAlgorithm::AddBlock(Block *New, int Id) { + New->Id = Id == -1 ? BlockIdCounter++ : Id; + Blocks.push_back(New); +} + +struct RelooperRecursor { + RelooperAlgorithm *Parent; + RelooperRecursor(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {} +}; + +void RelooperAlgorithm::Calculate(Block *Entry) { + // Scan and optimize the input + struct PreOptimizer : public RelooperRecursor { + PreOptimizer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {} + BlockSet Live; + + void FindLive(Block *Root) { + BlockList ToInvestigate; + ToInvestigate.push_back(Root); + while (!ToInvestigate.empty()) { + Block *Curr = ToInvestigate.front(); + ToInvestigate.pop_front(); + if (contains(Live, Curr)) + continue; + Live.insert(Curr); + for (const auto &iter : Curr->BranchesOut) + ToInvestigate.push_back(iter.first); + } + } + + // If a block has multiple entries but no exits, and it is small enough, it + // is useful to split it. A common example is a C++ function where + // everything ends up at a final exit block and does some RAII cleanup. 
+ // Without splitting, we will be forced to introduce labelled loops to + // allow reaching the final block + void SplitDeadEnds() { + unsigned TotalCodeSize = 0; + for (const auto &Curr : Live) { + TotalCodeSize += strlen(Curr->Code); + } + BlockSet Splits; + BlockSet Removed; + for (const auto &Original : Live) { + if (Original->BranchesIn.size() <= 1 || + !Original->BranchesOut.empty()) + continue; // only dead ends, for now + if (contains(Original->BranchesOut, Original)) + continue; // cannot split a looping node + if (strlen(Original->Code) * (Original->BranchesIn.size() - 1) > + TotalCodeSize / RelooperSplittingFactor) + continue; // if splitting increases raw code size by a significant + // amount, abort + // Split the node (for simplicity, we replace all the blocks, even + // though we could have reused the original) + DEBUG(dbgs() << " Splitting '" << Original->Code << "'\n"); + for (const auto &Prior : Original->BranchesIn) { + Block *Split = new Block(Original->Code, Original->BranchVar); + Parent->AddBlock(Split, Original->Id); + Split->BranchesIn.insert(Prior); + std::unique_ptr<Branch> Details; + Details.swap(Prior->BranchesOut[Original]); + Prior->BranchesOut[Split] = make_unique<Branch>(Details->Condition, + Details->Code); + for (const auto &iter : Original->BranchesOut) { + Block *Post = iter.first; + Branch *Details = iter.second.get(); + Split->BranchesOut[Post] = make_unique<Branch>(Details->Condition, + Details->Code); + Post->BranchesIn.insert(Split); + } + Splits.insert(Split); + Removed.insert(Original); + } + for (const auto &iter : Original->BranchesOut) { + Block *Post = iter.first; + Post->BranchesIn.remove(Original); + } + } + for (const auto &iter : Splits) + Live.insert(iter); + for (const auto &iter : Removed) + Live.remove(iter); + } + }; + PreOptimizer Pre(this); + Pre.FindLive(Entry); + + // Add incoming branches from live blocks, ignoring dead code + for (unsigned i = 0; i < Blocks.size(); i++) { + Block *Curr = Blocks[i]; + if (!contains(Pre.Live, Curr)) + continue; + for (const auto &iter : Curr->BranchesOut) + iter.first->BranchesIn.insert(Curr); + } + + if (!MinSize) + Pre.SplitDeadEnds(); + + // Recursively process the graph + + struct Analyzer : public RelooperRecursor { + Analyzer(RelooperAlgorithm *Parent) : RelooperRecursor(Parent) {} + + // Add a shape to the list of shapes in this Relooper calculation + void Notice(Shape *New) { + New->Id = Parent->ShapeIdCounter++; + Parent->Shapes.push_back(New); + } + + // Create a list of entries from a block. 
If LimitTo is provided, only + // results in that set will appear + void GetBlocksOut(Block *Source, BlockSet &Entries, + BlockSet *LimitTo = nullptr) { + for (const auto &iter : Source->BranchesOut) + if (!LimitTo || contains(*LimitTo, iter.first)) + Entries.insert(iter.first); + } + + // Converts/processes all branchings to a specific target + void Solipsize(Block *Target, Branch::FlowType Type, Shape *Ancestor, + BlockSet &From) { + DEBUG(dbgs() << " Solipsize '" << Target->Code << "' type " << Type + << "\n"); + for (auto iter = Target->BranchesIn.begin(); + iter != Target->BranchesIn.end();) { + Block *Prior = *iter; + if (!contains(From, Prior)) { + iter++; + continue; + } + std::unique_ptr<Branch> PriorOut; + PriorOut.swap(Prior->BranchesOut[Target]); + PriorOut->Ancestor = Ancestor; + PriorOut->Type = Type; + if (MultipleShape *Multiple = dyn_cast<MultipleShape>(Ancestor)) + Multiple->Breaks++; // We are breaking out of this Multiple, so need a + // loop + iter++; // carefully increment iter before erasing + Target->BranchesIn.remove(Prior); + Target->ProcessedBranchesIn.insert(Prior); + Prior->ProcessedBranchesOut[Target].swap(PriorOut); + } + } + + Shape *MakeSimple(BlockSet &Blocks, Block *Inner, BlockSet &NextEntries) { + DEBUG(dbgs() << " MakeSimple inner block '" << Inner->Code << "'\n"); + SimpleShape *Simple = new SimpleShape; + Notice(Simple); + Simple->Inner = Inner; + Inner->Parent = Simple; + if (Blocks.size() > 1) { + Blocks.remove(Inner); + GetBlocksOut(Inner, NextEntries, &Blocks); + BlockSet JustInner; + JustInner.insert(Inner); + for (const auto &iter : NextEntries) + Solipsize(iter, Branch::Direct, Simple, JustInner); + } + return Simple; + } + + Shape *MakeLoop(BlockSet &Blocks, BlockSet &Entries, + BlockSet &NextEntries) { + // Find the inner blocks in this loop. Proceed backwards from the entries + // until + // you reach a seen block, collecting as you go. + BlockSet InnerBlocks; + BlockSet Queue = Entries; + while (!Queue.empty()) { + Block *Curr = *(Queue.begin()); + Queue.remove(*Queue.begin()); + if (!contains(InnerBlocks, Curr)) { + // This element is new, mark it as inner and remove from outer + InnerBlocks.insert(Curr); + Blocks.remove(Curr); + // Add the elements prior to it + for (const auto &iter : Curr->BranchesIn) + Queue.insert(iter); + } + } + assert(!InnerBlocks.empty()); + + for (const auto &Curr : InnerBlocks) { + for (const auto &iter : Curr->BranchesOut) { + Block *Possible = iter.first; + if (!contains(InnerBlocks, Possible)) + NextEntries.insert(Possible); + } + } + + LoopShape *Loop = new LoopShape(); + Notice(Loop); + + // Solipsize the loop, replacing with break/continue and marking branches + // as Processed (will not affect later calculations) + // A. Branches to the loop entries become a continue to this shape + for (const auto &iter : Entries) + Solipsize(iter, Branch::Continue, Loop, InnerBlocks); + // B. Branches to outside the loop (a next entry) become breaks on this + // shape + for (const auto &iter : NextEntries) + Solipsize(iter, Branch::Break, Loop, InnerBlocks); + // Finish up + Shape *Inner = Process(InnerBlocks, Entries, nullptr); + Loop->Inner = Inner; + return Loop; + } + + // For each entry, find the independent group reachable by it. The + // independent group is the entry itself, plus all the blocks it can + // reach that cannot be directly reached by another entry. Note that we + // ignore directly reaching the entry itself by another entry. 
+ // @param Ignore - previous blocks that are irrelevant + void FindIndependentGroups(BlockSet &Entries, + BlockBlockSetMap &IndependentGroups, + BlockSet *Ignore = nullptr) { + typedef std::map<Block *, Block *> BlockBlockMap; + + struct HelperClass { + BlockBlockSetMap &IndependentGroups; + BlockBlockMap Ownership; // For each block, which entry it belongs to. + // We have reached it from there. + + HelperClass(BlockBlockSetMap &IndependentGroupsInit) + : IndependentGroups(IndependentGroupsInit) {} + void InvalidateWithChildren(Block *New) { + // Being in the list means you need to be invalidated + BlockList ToInvalidate; + ToInvalidate.push_back(New); + while (!ToInvalidate.empty()) { + Block *Invalidatee = ToInvalidate.front(); + ToInvalidate.pop_front(); + Block *Owner = Ownership[Invalidatee]; + // Owner may have been invalidated, do not add to + // IndependentGroups! + if (contains(IndependentGroups, Owner)) + IndependentGroups[Owner].remove(Invalidatee); + if (Ownership[Invalidatee]) { // may have been seen before and + // invalidated already + Ownership[Invalidatee] = nullptr; + for (const auto &iter : Invalidatee->BranchesOut) { + Block *Target = iter.first; + BlockBlockMap::iterator Known = Ownership.find(Target); + if (Known != Ownership.end()) { + Block *TargetOwner = Known->second; + if (TargetOwner) + ToInvalidate.push_back(Target); + } + } + } + } + } + }; + HelperClass Helper(IndependentGroups); + + // We flow out from each of the entries, simultaneously. + // When we reach a new block, we add it as belonging to the one we got to + // it from. + // If we reach a new block that is already marked as belonging to someone, + // it is reachable by two entries and is not valid for any of them. + // Remove it and all it can reach that have been visited. + + // Being in the queue means we just added this item, and + // we need to add its children + BlockList Queue; + for (const auto &Entry : Entries) { + Helper.Ownership[Entry] = Entry; + IndependentGroups[Entry].insert(Entry); + Queue.push_back(Entry); + } + while (!Queue.empty()) { + Block *Curr = Queue.front(); + Queue.pop_front(); + Block *Owner = Helper.Ownership[Curr]; // Curr must be in the ownership + // map if we are in the queue + if (!Owner) + continue; // we have been invalidated meanwhile after being reached + // from two entries + // Add all children + for (const auto &iter : Curr->BranchesOut) { + Block *New = iter.first; + BlockBlockMap::iterator Known = Helper.Ownership.find(New); + if (Known == Helper.Ownership.end()) { + // New node. Add it, and put it in the queue + Helper.Ownership[New] = Owner; + IndependentGroups[Owner].insert(New); + Queue.push_back(New); + continue; + } + Block *NewOwner = Known->second; + if (!NewOwner) + continue; // We reached an invalidated node + if (NewOwner != Owner) + // Invalidate this and all reachable that we have seen - we reached + // this from two locations + Helper.InvalidateWithChildren(New); + // otherwise, we have the same owner, so do nothing + } + } + + // Having processed all the interesting blocks, we remain with just one + // potential issue: + // If a->b, and a was invalidated, but then b was later reached by + // someone else, we must invalidate b. To check for this, we go over all + // elements in the independent groups, if an element has a parent which + // does *not* have the same owner, we/ must remove it and all its + // children. 
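// Illustrative sketch only, not part of this patch: the same "simultaneous
// flood fill with ownership" idea on a toy graph of ints. A node first reached
// from entry E is owned by E; a node reachable from two different entries is
// shared, so it (and anything already claimed through it) is dropped from
// every group. All names below are invented for the example, and some ordering
// subtleties of the real pass are glossed over.
#include <map>
#include <queue>
#include <set>
#include <vector>

using ToyGraph = std::map<int, std::vector<int>>; // node -> successors

static std::map<int, std::set<int>>
ToyIndependentGroups(const ToyGraph &Succs, const std::vector<int> &Entries) {
  std::map<int, int> Owner;            // node -> owning entry, -1 once shared
  std::map<int, std::set<int>> Groups; // entry -> its independent group

  // Drop a shared node and everything we had already claimed through it.
  auto Invalidate = [&](int Start) {
    std::queue<int> Work;
    Work.push(Start);
    while (!Work.empty()) {
      int N = Work.front();
      Work.pop();
      auto It = Owner.find(N);
      if (It == Owner.end() || It->second == -1)
        continue; // never reached, or already marked shared
      Groups[It->second].erase(N);
      It->second = -1;
      auto S = Succs.find(N);
      if (S != Succs.end())
        for (int Child : S->second)
          if (Owner.count(Child))
            Work.push(Child);
    }
  };

  std::queue<int> Queue;
  for (int E : Entries) {
    Owner[E] = E;
    Groups[E].insert(E);
    Queue.push(E);
  }
  while (!Queue.empty()) {
    int Curr = Queue.front();
    Queue.pop();
    int O = Owner[Curr];
    if (O == -1)
      continue; // invalidated while waiting in the queue
    auto S = Succs.find(Curr);
    if (S == Succs.end())
      continue;
    for (int Next : S->second) {
      auto Known = Owner.find(Next);
      if (Known == Owner.end()) { // new node: it inherits the current owner
        Owner[Next] = O;
        Groups[O].insert(Next);
        Queue.push(Next);
      } else if (Known->second != -1 && Known->second != O) {
        Invalidate(Next); // reachable from two entries
      }
    }
  }
  for (int E : Entries) // a group can collapse to nothing
    if (Groups[E].empty())
      Groups.erase(E);
  return Groups;
}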
+ + for (const auto &iter : Entries) { + BlockSet &CurrGroup = IndependentGroups[iter]; + BlockList ToInvalidate; + for (const auto &iter : CurrGroup) { + Block *Child = iter; + for (const auto &iter : Child->BranchesIn) { + Block *Parent = iter; + if (Ignore && contains(*Ignore, Parent)) + continue; + if (Helper.Ownership[Parent] != Helper.Ownership[Child]) + ToInvalidate.push_back(Child); + } + } + while (!ToInvalidate.empty()) { + Block *Invalidatee = ToInvalidate.front(); + ToInvalidate.pop_front(); + Helper.InvalidateWithChildren(Invalidatee); + } + } + + // Remove empty groups + for (const auto &iter : Entries) + if (IndependentGroups[iter].empty()) + IndependentGroups.erase(iter); + } + + Shape *MakeMultiple(BlockSet &Blocks, BlockSet &Entries, + BlockBlockSetMap &IndependentGroups, Shape *Prev, + BlockSet &NextEntries) { + bool Fused = isa<SimpleShape>(Prev); + MultipleShape *Multiple = new MultipleShape(); + Notice(Multiple); + BlockSet CurrEntries; + for (auto &iter : IndependentGroups) { + Block *CurrEntry = iter.first; + BlockSet &CurrBlocks = iter.second; + // Create inner block + CurrEntries.clear(); + CurrEntries.insert(CurrEntry); + for (const auto &CurrInner : CurrBlocks) { + // Remove the block from the remaining blocks + Blocks.remove(CurrInner); + // Find new next entries and fix branches to them + for (auto iter = CurrInner->BranchesOut.begin(); + iter != CurrInner->BranchesOut.end();) { + Block *CurrTarget = iter->first; + auto Next = iter; + Next++; + if (!contains(CurrBlocks, CurrTarget)) { + NextEntries.insert(CurrTarget); + Solipsize(CurrTarget, Branch::Break, Multiple, CurrBlocks); + } + iter = Next; // increment carefully because Solipsize can remove us + } + } + Multiple->InnerMap[CurrEntry->Id] = + Process(CurrBlocks, CurrEntries, nullptr); + // If we are not fused, then our entries will actually be checked + if (!Fused) + CurrEntry->IsCheckedMultipleEntry = true; + } + // Add entries not handled as next entries, they are deferred + for (const auto &Entry : Entries) + if (!contains(IndependentGroups, Entry)) + NextEntries.insert(Entry); + // The multiple has been created, we can decide how to implement it + if (Multiple->InnerMap.size() >= RelooperMultipleSwitchThreshold) { + Multiple->UseSwitch = true; + Multiple->Breaks++; // switch captures breaks + } + return Multiple; + } + + // Main function. + // Process a set of blocks with specified entries, returns a shape + // The Make* functions receive a NextEntries. If they fill it with data, + // those are the entries for the ->Next block on them, and the blocks + // are what remains in Blocks (which Make* modify). In this way + // we avoid recursing on Next (imagine a long chain of Simples, if we + // recursed we could blow the stack). 
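// Illustrative sketch, not part of this patch: the driver below links shapes
// through their Next pointers iteratively rather than recursing, so a long
// chain of Simple shapes cannot overflow the native stack. The types here are
// simplified stand-ins; a real Make* step would also compute NextEntries.
#include <memory>
#include <utility>
#include <vector>

struct ShapeStub {
  std::unique_ptr<ShapeStub> Next; // the shape emitted right after this one
};

static std::unique_ptr<ShapeStub> ProcessChain(std::vector<int> Entries) {
  std::unique_ptr<ShapeStub> First;
  ShapeStub *Prev = nullptr;
  while (!Entries.empty()) {
    std::vector<int> NextEntries;              // filled by the Make* step
    auto Made = std::make_unique<ShapeStub>(); // stands in for MakeSimple etc.
    ShapeStub *MadeRaw = Made.get();
    if (Prev)
      Prev->Next = std::move(Made);            // append to the chain
    else
      First = std::move(Made);
    Prev = MadeRaw;
    Entries = std::move(NextEntries);          // continue with the new entries
  }
  return First;
}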
+ Shape *Process(BlockSet &Blocks, BlockSet &InitialEntries, Shape *Prev) { + BlockSet *Entries = &InitialEntries; + BlockSet TempEntries[2]; + int CurrTempIndex = 0; + BlockSet *NextEntries; + Shape *Ret = nullptr; + + auto Make = [&](Shape *Temp) { + if (Prev) + Prev->Next = Temp; + if (!Ret) + Ret = Temp; + Prev = Temp; + Entries = NextEntries; + }; + + while (1) { + CurrTempIndex = 1 - CurrTempIndex; + NextEntries = &TempEntries[CurrTempIndex]; + NextEntries->clear(); + + if (Entries->empty()) + return Ret; + if (Entries->size() == 1) { + Block *Curr = *(Entries->begin()); + if (Curr->BranchesIn.empty()) { + // One entry, no looping ==> Simple + Make(MakeSimple(Blocks, Curr, *NextEntries)); + if (NextEntries->empty()) + return Ret; + continue; + } + // One entry, looping ==> Loop + Make(MakeLoop(Blocks, *Entries, *NextEntries)); + if (NextEntries->empty()) + return Ret; + continue; + } + + // More than one entry, try to eliminate through a Multiple groups of + // independent blocks from an entry/ies. It is important to remove + // through multiples as opposed to looping since the former is more + // performant. + BlockBlockSetMap IndependentGroups; + FindIndependentGroups(*Entries, IndependentGroups); + + if (!IndependentGroups.empty()) { + // We can handle a group in a multiple if its entry cannot be reached + // by another group. + // Note that it might be reachable by itself - a loop. But that is + // fine, we will create a loop inside the multiple block (which + // is the performant order to do it). + for (auto iter = IndependentGroups.begin(); + iter != IndependentGroups.end();) { + Block *Entry = iter->first; + BlockSet &Group = iter->second; + auto curr = iter++; // iterate carefully, we may delete + for (BlockSet::iterator iterBranch = Entry->BranchesIn.begin(); + iterBranch != Entry->BranchesIn.end(); iterBranch++) { + Block *Origin = *iterBranch; + if (!contains(Group, Origin)) { + // Reached from outside the group, so we cannot handle this + IndependentGroups.erase(curr); + break; + } + } + } + + // As an optimization, if we have 2 independent groups, and one is a + // small dead end, we can handle only that dead end. + // The other then becomes a Next - without nesting in the code and + // recursion in the analysis. + // TODO: if the larger is the only dead end, handle that too + // TODO: handle >2 groups + // TODO: handle not just dead ends, but also that do not branch to the + // NextEntries. However, must be careful there since we create a + // Next, and that Next can prevent eliminating a break (since we no + // longer naturally reach the same place), which may necessitate a + // one-time loop, which makes the unnesting pointless. + if (IndependentGroups.size() == 2) { + // Find the smaller one + auto iter = IndependentGroups.begin(); + Block *SmallEntry = iter->first; + auto SmallSize = iter->second.size(); + iter++; + Block *LargeEntry = iter->first; + auto LargeSize = iter->second.size(); + if (SmallSize != LargeSize) { // ignore the case where they are + // identical - keep things symmetrical + // there + if (SmallSize > LargeSize) { + Block *Temp = SmallEntry; + SmallEntry = LargeEntry; + LargeEntry = Temp; // Note: we did not flip the Sizes too, they + // are now invalid. TODO: use the smaller + // size as a limit? 
+ } + // Check if dead end + bool DeadEnd = true; + BlockSet &SmallGroup = IndependentGroups[SmallEntry]; + for (const auto &Curr : SmallGroup) { + for (const auto &iter : Curr->BranchesOut) { + Block *Target = iter.first; + if (!contains(SmallGroup, Target)) { + DeadEnd = false; + break; + } + } + if (!DeadEnd) + break; + } + if (DeadEnd) + IndependentGroups.erase(LargeEntry); + } + } + + if (!IndependentGroups.empty()) + // Some groups removable ==> Multiple + Make(MakeMultiple(Blocks, *Entries, IndependentGroups, Prev, + *NextEntries)); + if (NextEntries->empty()) + return Ret; + continue; + } + // No independent groups, must be loopable ==> Loop + Make(MakeLoop(Blocks, *Entries, *NextEntries)); + if (NextEntries->empty()) + return Ret; + continue; + } + } + }; + + // Main + + BlockSet AllBlocks; + for (const auto &Curr : Pre.Live) { + AllBlocks.insert(Curr); + } + + BlockSet Entries; + Entries.insert(Entry); + Root = Analyzer(this).Process(AllBlocks, Entries, nullptr); + assert(Root); + + /// + /// Relooper post-optimizer + /// + struct PostOptimizer { + RelooperAlgorithm *Parent; + std::stack<Shape *> LoopStack; + + PostOptimizer(RelooperAlgorithm *ParentInit) : Parent(ParentInit) {} + + void ShapeSwitch(Shape* var, + std::function<void (SimpleShape*)> simple, + std::function<void (MultipleShape*)> multiple, + std::function<void (LoopShape*)> loop) { + switch (var->getKind()) { + case Shape::SK_Simple: { + simple(cast<SimpleShape>(var)); + break; + } + case Shape::SK_Multiple: { + multiple(cast<MultipleShape>(var)); + break; + } + case Shape::SK_Loop: { + loop(cast<LoopShape>(var)); + break; + } + } + } + + // Find the blocks that natural control flow can get us directly to, or + // through a multiple that we ignore + void FollowNaturalFlow(Shape *S, BlockSet &Out) { + ShapeSwitch(S, [&](SimpleShape* Simple) { + Out.insert(Simple->Inner); + }, [&](MultipleShape* Multiple) { + for (const auto &iter : Multiple->InnerMap) { + FollowNaturalFlow(iter.second, Out); + } + FollowNaturalFlow(Multiple->Next, Out); + }, [&](LoopShape* Loop) { + FollowNaturalFlow(Loop->Inner, Out); + }); + } + + void FindNaturals(Shape *Root, Shape *Otherwise = nullptr) { + if (Root->Next) { + Root->Natural = Root->Next; + FindNaturals(Root->Next, Otherwise); + } else { + Root->Natural = Otherwise; + } + + ShapeSwitch(Root, [](SimpleShape* Simple) { + }, [&](MultipleShape* Multiple) { + for (const auto &iter : Multiple->InnerMap) { + FindNaturals(iter.second, Root->Natural); + } + }, [&](LoopShape* Loop){ + FindNaturals(Loop->Inner, Loop->Inner); + }); + } + + // Remove unneeded breaks and continues. + // A flow operation is trivially unneeded if the shape we naturally get to + // by normal code execution is the same as the flow forces us to. + void RemoveUnneededFlows(Shape *Root, Shape *Natural = nullptr, + LoopShape *LastLoop = nullptr, + unsigned Depth = 0) { + BlockSet NaturalBlocks; + FollowNaturalFlow(Natural, NaturalBlocks); + Shape *Next = Root; + while (Next) { + Root = Next; + Next = nullptr; + ShapeSwitch( + Root, + [&](SimpleShape* Simple) { + if (Simple->Inner->BranchVar) + LastLoop = + nullptr; // a switch clears out the loop (TODO: only for + // breaks, not continue) + + if (Simple->Next) { + if (!Simple->Inner->BranchVar && + Simple->Inner->ProcessedBranchesOut.size() == 2 && + Depth < RelooperNestingLimit) { + // If there is a next block, we already know at Simple + // creation time to make direct branches, and we can do + // nothing more in general. 
But, we try to optimize the + // case of a break and a direct: This would normally be + // if (break?) { break; } .. + // but if we make sure to nest the else, we can save the + // break, + // if (!break?) { .. } + // This is also better because the more canonical nested + // form is easier to further optimize later. The + // downside is more nesting, which adds to size in builds with + // whitespace. + // Note that we avoid switches, as it complicates control flow + // and is not relevant for the common case we optimize here. + bool Found = false; + bool Abort = false; + for (const auto &iter : Simple->Inner->ProcessedBranchesOut) { + Block *Target = iter.first; + Branch *Details = iter.second.get(); + if (Details->Type == Branch::Break) { + Found = true; + if (!contains(NaturalBlocks, Target)) + Abort = true; + } else if (Details->Type != Branch::Direct) + Abort = true; + } + if (Found && !Abort) { + for (const auto &iter : Simple->Inner->ProcessedBranchesOut) { + Branch *Details = iter.second.get(); + if (Details->Type == Branch::Break) { + Details->Type = Branch::Direct; + if (MultipleShape *Multiple = + dyn_cast<MultipleShape>(Details->Ancestor)) + Multiple->Breaks--; + } else { + assert(Details->Type == Branch::Direct); + Details->Type = Branch::Nested; + } + } + } + Depth++; // this optimization increases depth, for us and all + // our next chain (i.e., until this call returns) + } + Next = Simple->Next; + } else { + // If there is no next then Natural is where we will + // go to by doing nothing, so we can potentially optimize some + // branches to direct. + for (const auto &iter : Simple->Inner->ProcessedBranchesOut) { + Block *Target = iter.first; + Branch *Details = iter.second.get(); + if (Details->Type != Branch::Direct && + contains(NaturalBlocks, + Target)) { // note: cannot handle split blocks + Details->Type = Branch::Direct; + if (MultipleShape *Multiple = + dyn_cast<MultipleShape>(Details->Ancestor)) + Multiple->Breaks--; + } else if (Details->Type == Branch::Break && LastLoop && + LastLoop->Natural == Details->Ancestor->Natural) { + // it is important to simplify breaks, as simpler breaks + // enable other optimizations + Details->Labeled = false; + if (MultipleShape *Multiple = + dyn_cast<MultipleShape>(Details->Ancestor)) + Multiple->Breaks--; + } + } + } + }, [&](MultipleShape* Multiple) + { + for (const auto &iter : Multiple->InnerMap) { + RemoveUnneededFlows(iter.second, Multiple->Next, + Multiple->Breaks ? 
nullptr : LastLoop, + Depth + 1); + } + Next = Multiple->Next; + }, [&](LoopShape* Loop) + { + RemoveUnneededFlows(Loop->Inner, Loop->Inner, Loop, Depth + 1); + Next = Loop->Next; + }); + } + } + + // After we know which loops exist, we can calculate which need to be + // labeled + void FindLabeledLoops(Shape *Root) { + Shape *Next = Root; + while (Next) { + Root = Next; + Next = nullptr; + + ShapeSwitch( + Root, + [&](SimpleShape *Simple) { + MultipleShape *Fused = dyn_cast<MultipleShape>(Root->Next); + // If we are fusing a Multiple with a loop into this Simple, then + // visit it now + if (Fused && Fused->Breaks) + LoopStack.push(Fused); + if (Simple->Inner->BranchVar) + LoopStack.push(nullptr); // a switch means breaks are now useless, + // push a dummy + if (Fused) { + if (Fused->UseSwitch) + LoopStack.push(nullptr); // a switch means breaks are now + // useless, push a dummy + for (const auto &iter : Fused->InnerMap) { + FindLabeledLoops(iter.second); + } + } + for (const auto &iter : Simple->Inner->ProcessedBranchesOut) { + Branch *Details = iter.second.get(); + if (Details->Type == Branch::Break || + Details->Type == Branch::Continue) { + assert(!LoopStack.empty()); + if (Details->Ancestor != LoopStack.top() && Details->Labeled) { + if (MultipleShape *Multiple = + dyn_cast<MultipleShape>(Details->Ancestor)) { + Multiple->Labeled = true; + } else { + LoopShape *Loop = cast<LoopShape>(Details->Ancestor); + Loop->Labeled = true; + } + } else { + Details->Labeled = false; + } + } + if (Fused && Fused->UseSwitch) + LoopStack.pop(); + if (Simple->Inner->BranchVar) + LoopStack.pop(); + if (Fused && Fused->Breaks) + LoopStack.pop(); + if (Fused) + Next = Fused->Next; + else + Next = Root->Next; + } + } + , [&](MultipleShape* Multiple) { + if (Multiple->Breaks) + LoopStack.push(Multiple); + for (const auto &iter : Multiple->InnerMap) + FindLabeledLoops(iter.second); + if (Multiple->Breaks) + LoopStack.pop(); + Next = Root->Next; + } + , [&](LoopShape* Loop) { + LoopStack.push(Loop); + FindLabeledLoops(Loop->Inner); + LoopStack.pop(); + Next = Root->Next; + }); + } + } + + void Process(Shape * Root) { + FindNaturals(Root); + RemoveUnneededFlows(Root); + FindLabeledLoops(Root); + } + }; + + PostOptimizer(this).Process(Root); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/Relooper.h b/contrib/llvm/lib/Target/WebAssembly/Relooper.h new file mode 100644 index 0000000..7c564de --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/Relooper.h @@ -0,0 +1,186 @@ +//===-- Relooper.h - Top-level interface for WebAssembly ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===-------------------------------------------------------------------===// +/// +/// \file +/// \brief This defines an optimized C++ implemention of the Relooper +/// algorithm, originally developed as part of Emscripten, which +/// generates a structured AST from arbitrary control flow. 
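// Illustrative example, not part of the original header: for an acyclic
// diamond CFG
//     A -> B, A -> C, B -> D, C -> D
// the relooper emits a Simple shape for A, a Multiple covering {B, C}, and a
// Simple for D, corresponding to structured code roughly of the form
//     A(); if (cond) { B(); } else { C(); } D();
// while a back edge such as B -> A groups {A, B} into a Loop shape, roughly
//     while (1) { A(); B(); if (exit) break; }
// with breaks and continues inserted for branches that leave or re-enter it.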
+/// +//===-------------------------------------------------------------------===// + +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Support/Casting.h" + +#include <cassert> +#include <cstdarg> +#include <cstdio> +#include <deque> +#include <list> +#include <map> +#include <memory> +#include <set> + +namespace llvm { + +namespace Relooper { + +struct Block; +struct Shape; + +/// +/// Info about a branching from one block to another +/// +struct Branch { + enum FlowType { + Direct = 0, // We will directly reach the right location through other + // means, no need for continue or break + Break = 1, + Continue = 2, + Nested = 3 // This code is directly reached, but we must be careful to + // ensure it is nested in an if - it is not reached + // unconditionally, other code paths exist alongside it that we need to make + // sure do not intertwine + }; + Shape + *Ancestor; // If not nullptr, this shape is the relevant one for purposes + // of getting to the target block. We break or continue on it + Branch::FlowType + Type; // If Ancestor is not nullptr, this says whether to break or + // continue + bool Labeled; // If a break or continue, whether we need to use a label + const char *Condition; // The condition for which we branch. For example, + // "my_var == 1". Conditions are checked one by one. + // One of the conditions should have nullptr as the + // condition, in which case it is the default + // FIXME: move from char* to LLVM data structures + const char *Code; // If provided, code that is run right before the branch is + // taken. This is useful for phis + // FIXME: move from char* to LLVM data structures + + Branch(const char *ConditionInit, const char *CodeInit = nullptr); + ~Branch(); +}; + +typedef SetVector<Block *> BlockSet; +typedef MapVector<Block *, Branch *> BlockBranchMap; +typedef MapVector<Block *, std::unique_ptr<Branch>> OwningBlockBranchMap; + +/// +/// Represents a basic block of code - some instructions that end with a +/// control flow modifier (a branch, return or throw). +/// +struct Block { + // Branches become processed after we finish the shape relevant to them. For + // example, when we recreate a loop, branches to the loop start become + // continues and are now processed. When we calculate what shape to generate + // from a set of blocks, we ignore processed branches. Blocks own the Branch + // objects they use, and destroy them when done. + OwningBlockBranchMap BranchesOut; + BlockSet BranchesIn; + OwningBlockBranchMap ProcessedBranchesOut; + BlockSet ProcessedBranchesIn; + Shape *Parent; // The shape we are directly inside + int Id; // A unique identifier, defined when added to relooper. Note that this + // uniquely identifies a *logical* block - if we split it, the two + // instances have the same content *and* the same Id + const char *Code; // The string representation of the code in this block. 
+ // Owning pointer (we copy the input) + // FIXME: move from char* to LLVM data structures + const char *BranchVar; // A variable whose value determines where we go; if + // this is not nullptr, emit a switch on that variable + // FIXME: move from char* to LLVM data structures + bool IsCheckedMultipleEntry; // If true, we are a multiple entry, so reaching + // us requires setting the label variable + + Block(const char *CodeInit, const char *BranchVarInit); + ~Block(); + + void AddBranchTo(Block *Target, const char *Condition, + const char *Code = nullptr); +}; + +/// +/// Represents a structured control flow shape +/// +struct Shape { + int Id; // A unique identifier. Used to identify loops, labels are Lx where x + // is the Id. Defined when added to relooper + Shape *Next; // The shape that will appear in the code right after this one + Shape *Natural; // The shape that control flow gets to naturally (if there is + // Next, then this is Next) + + /// Discriminator for LLVM-style RTTI (dyn_cast<> et al.) + enum ShapeKind { SK_Simple, SK_Multiple, SK_Loop }; + +private: + ShapeKind Kind; + +public: + ShapeKind getKind() const { return Kind; } + + Shape(ShapeKind KindInit) : Id(-1), Next(nullptr), Kind(KindInit) {} +}; + +/// +/// Simple: No control flow at all, just instructions. +/// +struct SimpleShape : public Shape { + Block *Inner; + + SimpleShape() : Shape(SK_Simple), Inner(nullptr) {} + + static bool classof(const Shape *S) { return S->getKind() == SK_Simple; } +}; + +/// +/// A shape that may be implemented with a labeled loop. +/// +struct LabeledShape : public Shape { + bool Labeled; // If we have a loop, whether it needs to be labeled + + LabeledShape(ShapeKind KindInit) : Shape(KindInit), Labeled(false) {} +}; + +// Blocks with the same id were split and are identical, so we just care about +// ids in Multiple entries +typedef std::map<int, Shape *> IdShapeMap; + +/// +/// Multiple: A shape with more than one entry. If the next block to +/// be entered is among them, we run it and continue to +/// the next shape, otherwise we continue immediately to the +/// next shape. +/// +struct MultipleShape : public LabeledShape { + IdShapeMap InnerMap; // entry block ID -> shape + int Breaks; // If we have branches on us, we need a loop (or a switch). This + // is a counter of requirements, + // if we optimize it to 0, the loop is unneeded + bool UseSwitch; // Whether to switch on label as opposed to an if-else chain + + MultipleShape() : LabeledShape(SK_Multiple), Breaks(0), UseSwitch(false) {} + + static bool classof(const Shape *S) { return S->getKind() == SK_Multiple; } +}; + +/// +/// Loop: An infinite loop. 
+/// +struct LoopShape : public LabeledShape { + Shape *Inner; + + LoopShape() : LabeledShape(SK_Loop), Inner(nullptr) {} + + static bool classof(const Shape *S) { return S->getKind() == SK_Loop; } +}; + +} // namespace Relooper + +} // namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h index 3ff19d4..e972da5 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.h @@ -23,8 +23,22 @@ namespace llvm { class WebAssemblyTargetMachine; class FunctionPass; +FunctionPass *createWebAssemblyOptimizeReturned(); + FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM, CodeGenOpt::Level OptLevel); +FunctionPass *createWebAssemblyArgumentMove(); + +FunctionPass *createWebAssemblyStoreResults(); +FunctionPass *createWebAssemblyRegStackify(); +FunctionPass *createWebAssemblyRegColoring(); +FunctionPass *createWebAssemblyPEI(); +FunctionPass *createWebAssemblyCFGStackify(); +FunctionPass *createWebAssemblyLowerBrUnless(); +FunctionPass *createWebAssemblyRegNumbering(); +FunctionPass *createWebAssemblyPeephole(); + +FunctionPass *createWebAssemblyRelooper(); } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td index a123bf6..551ad93 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssembly.td @@ -6,10 +6,11 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This is a target description file for the WebAssembly architecture, which is -// also known as "wasm". -// +/// +/// \file +/// \brief This is a target description file for the WebAssembly architecture, +/// which is also known as "wasm". +/// //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -50,6 +51,9 @@ def WebAssemblyInstrInfo : InstrInfo; // Minimal Viable Product. def : ProcessorModel<"mvp", NoSchedModel, []>; +// Generic processor: latest stable version. +def : ProcessorModel<"generic", NoSchedModel, []>; + // Latest and greatest experimental version of WebAssembly. Bugs included! def : ProcessorModel<"bleeding-edge", NoSchedModel, [FeatureSIMD128]>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp new file mode 100644 index 0000000..3893c40 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyArgumentMove.cpp @@ -0,0 +1,110 @@ +//===-- WebAssemblyArgumentMove.cpp - Argument instruction moving ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file moves ARGUMENT instructions after ScheduleDAG scheduling. +/// +/// Arguments are really live-in registers, however, since we use virtual +/// registers and LLVM doesn't support live-in virtual registers, we're +/// currently making do with ARGUMENT instructions which are placed at the top +/// of the entry block. The trick is to get them to *stay* at the top of the +/// entry block. 
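// A minimal standalone sketch of the reordering this pass performs, using a
// std::list of integer tags in place of MachineInstrs (all names invented for
// the example): every element recognized as an "argument" is moved, in order,
// above the first non-argument element.
#include <iterator>
#include <list>

static void HoistArguments(std::list<int> &Block, bool (*IsArg)(int)) {
  // The insertion point is the first non-argument element.
  auto InsertPt = Block.begin();
  while (InsertPt != Block.end() && IsArg(*InsertPt))
    ++InsertPt;
  // Move any arguments found after that point up to the insertion point.
  // std::list::splice relocates nodes without invalidating iterators, which
  // mirrors how the pass re-inserts the MachineInstrs it removes.
  for (auto I = InsertPt; I != Block.end();) {
    auto Next = std::next(I);
    if (IsArg(*I))
      Block.splice(InsertPt, Block, I);
    I = Next;
  }
}
// e.g. HoistArguments(Blk, [](int V) { return V < 0; }); hoists negative tags.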
+/// +/// The ARGUMENTS physical register keeps these instructions pinned in place +/// during liveness-aware CodeGen passes, however one thing which does not +/// respect this is the ScheduleDAG scheduler. This pass is therefore run +/// immediately after that. +/// +/// This is all hopefully a temporary solution until we find a better solution +/// for describing the live-in nature of arguments. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-argument-move" + +namespace { +class WebAssemblyArgumentMove final : public MachineFunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyArgumentMove() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { + return "WebAssembly Argument Move"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; +} // end anonymous namespace + +char WebAssemblyArgumentMove::ID = 0; +FunctionPass *llvm::createWebAssemblyArgumentMove() { + return new WebAssemblyArgumentMove(); +} + +/// Test whether the given instruction is an ARGUMENT. +static bool IsArgument(const MachineInstr *MI) { + switch (MI->getOpcode()) { + case WebAssembly::ARGUMENT_I32: + case WebAssembly::ARGUMENT_I64: + case WebAssembly::ARGUMENT_F32: + case WebAssembly::ARGUMENT_F64: + return true; + default: + return false; + } +} + +bool WebAssemblyArgumentMove::runOnMachineFunction(MachineFunction &MF) { + DEBUG({ + dbgs() << "********** Argument Move **********\n" + << "********** Function: " << MF.getName() << '\n'; + }); + + bool Changed = false; + MachineBasicBlock &EntryMBB = MF.front(); + MachineBasicBlock::iterator InsertPt = EntryMBB.end(); + + // Look for the first NonArg instruction. + for (auto MII = EntryMBB.begin(), MIE = EntryMBB.end(); MII != MIE; ++MII) { + MachineInstr *MI = MII; + if (!IsArgument(MI)) { + InsertPt = MII; + break; + } + } + + // Now move any argument instructions later in the block + // to before our first NonArg instruction. + for (auto I = InsertPt, E = EntryMBB.end(); I != E; ++I) { + MachineInstr *MI = I; + if (IsArgument(MI)) { + EntryMBB.insert(InsertPt, MI->removeFromParent()); + Changed = true; + } + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp new file mode 100644 index 0000000..0d2b4d9 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -0,0 +1,285 @@ +//===-- WebAssemblyAsmPrinter.cpp - WebAssembly LLVM assembly writer ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file contains a printer that converts from our internal +/// representation of machine-dependent LLVM code to the WebAssembly assembly +/// language. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "InstPrinter/WebAssemblyInstPrinter.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMCInstLower.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblyRegisterInfo.h" +#include "WebAssemblySubtarget.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "asm-printer" + +namespace { + +class WebAssemblyAsmPrinter final : public AsmPrinter { + const MachineRegisterInfo *MRI; + const WebAssemblyFunctionInfo *MFI; + +public: + WebAssemblyAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), MRI(nullptr), MFI(nullptr) {} + +private: + const char *getPassName() const override { + return "WebAssembly Assembly Printer"; + } + + //===------------------------------------------------------------------===// + // MachineFunctionPass Implementation. + //===------------------------------------------------------------------===// + + bool runOnMachineFunction(MachineFunction &MF) override { + MRI = &MF.getRegInfo(); + MFI = MF.getInfo<WebAssemblyFunctionInfo>(); + return AsmPrinter::runOnMachineFunction(MF); + } + + //===------------------------------------------------------------------===// + // AsmPrinter Implementation. + //===------------------------------------------------------------------===// + + void EmitJumpTableInfo() override; + void EmitConstantPool() override; + void EmitFunctionBodyStart() override; + void EmitInstruction(const MachineInstr *MI) override; + bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS) override; + bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo, + unsigned AsmVariant, const char *ExtraCode, + raw_ostream &OS) override; + + MVT getRegType(unsigned RegNo) const; + const char *toString(MVT VT) const; + std::string regToString(const MachineOperand &MO); +}; + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// Helpers. +//===----------------------------------------------------------------------===// + +MVT WebAssemblyAsmPrinter::getRegType(unsigned RegNo) const { + const TargetRegisterClass *TRC = + TargetRegisterInfo::isVirtualRegister(RegNo) ? 
+ MRI->getRegClass(RegNo) : + MRI->getTargetRegisterInfo()->getMinimalPhysRegClass(RegNo); + for (MVT T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64}) + if (TRC->hasType(T)) + return T; + DEBUG(errs() << "Unknown type for register number: " << RegNo); + llvm_unreachable("Unknown register type"); + return MVT::Other; +} + +std::string WebAssemblyAsmPrinter::regToString(const MachineOperand &MO) { + unsigned RegNo = MO.getReg(); + assert(TargetRegisterInfo::isVirtualRegister(RegNo) && + "Unlowered physical register encountered during assembly printing"); + assert(!MFI->isVRegStackified(RegNo)); + unsigned WAReg = MFI->getWAReg(RegNo); + assert(WAReg != WebAssemblyFunctionInfo::UnusedReg); + return '$' + utostr(WAReg); +} + +const char *WebAssemblyAsmPrinter::toString(MVT VT) const { + return WebAssembly::TypeToString(VT); +} + +//===----------------------------------------------------------------------===// +// WebAssemblyAsmPrinter Implementation. +//===----------------------------------------------------------------------===// + +void WebAssemblyAsmPrinter::EmitConstantPool() { + assert(MF->getConstantPool()->getConstants().empty() && + "WebAssembly disables constant pools"); +} + +void WebAssemblyAsmPrinter::EmitJumpTableInfo() { + // Nothing to do; jump tables are incorporated into the instruction stream. +} + +static void ComputeLegalValueVTs(const Function &F, const TargetMachine &TM, + Type *Ty, SmallVectorImpl<MVT> &ValueVTs) { + const DataLayout &DL(F.getParent()->getDataLayout()); + const WebAssemblyTargetLowering &TLI = + *TM.getSubtarget<WebAssemblySubtarget>(F).getTargetLowering(); + SmallVector<EVT, 4> VTs; + ComputeValueVTs(TLI, DL, Ty, VTs); + + for (EVT VT : VTs) { + unsigned NumRegs = TLI.getNumRegisters(F.getContext(), VT); + MVT RegisterVT = TLI.getRegisterType(F.getContext(), VT); + for (unsigned i = 0; i != NumRegs; ++i) + ValueVTs.push_back(RegisterVT); + } +} + +void WebAssemblyAsmPrinter::EmitFunctionBodyStart() { + if (!MFI->getParams().empty()) { + MCInst Param; + Param.setOpcode(WebAssembly::PARAM); + for (MVT VT : MFI->getParams()) + Param.addOperand(MCOperand::createImm(VT.SimpleTy)); + EmitToStreamer(*OutStreamer, Param); + } + + SmallVector<MVT, 4> ResultVTs; + const Function &F(*MF->getFunction()); + ComputeLegalValueVTs(F, TM, F.getReturnType(), ResultVTs); + // If the return type needs to be legalized it will get converted into + // passing a pointer. + if (ResultVTs.size() == 1) { + MCInst Result; + Result.setOpcode(WebAssembly::RESULT); + Result.addOperand(MCOperand::createImm(ResultVTs.front().SimpleTy)); + EmitToStreamer(*OutStreamer, Result); + } + + bool AnyWARegs = false; + MCInst Local; + Local.setOpcode(WebAssembly::LOCAL); + for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) { + unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx); + unsigned WAReg = MFI->getWAReg(VReg); + // Don't declare unused registers. + if (WAReg == WebAssemblyFunctionInfo::UnusedReg) + continue; + // Don't redeclare parameters. + if (WAReg < MFI->getParams().size()) + continue; + // Don't declare stackified registers. 
+ if (int(WAReg) < 0) + continue; + Local.addOperand(MCOperand::createImm(getRegType(VReg).SimpleTy)); + AnyWARegs = true; + } + auto &PhysRegs = MFI->getPhysRegs(); + for (unsigned PReg = 0; PReg < PhysRegs.size(); ++PReg) { + if (PhysRegs[PReg] == -1U) + continue; + Local.addOperand(MCOperand::createImm(getRegType(PReg).SimpleTy)); + AnyWARegs = true; + } + if (AnyWARegs) + EmitToStreamer(*OutStreamer, Local); + + AsmPrinter::EmitFunctionBodyStart(); +} + +void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { + DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n'); + + switch (MI->getOpcode()) { + case WebAssembly::ARGUMENT_I32: + case WebAssembly::ARGUMENT_I64: + case WebAssembly::ARGUMENT_F32: + case WebAssembly::ARGUMENT_F64: + // These represent values which are live into the function entry, so there's + // no instruction to emit. + break; + case WebAssembly::LOOP_END: + // This is a no-op which just exists to tell AsmPrinter.cpp that there's a + // fallthrough which nevertheless requires a label for the destination here. + break; + default: { + WebAssemblyMCInstLower MCInstLowering(OutContext, *this); + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + EmitToStreamer(*OutStreamer, TmpInst); + break; + } + } +} + +bool WebAssemblyAsmPrinter::PrintAsmOperand(const MachineInstr *MI, + unsigned OpNo, unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &OS) { + if (AsmVariant != 0) + report_fatal_error("There are no defined alternate asm variants"); + + // First try the generic code, which knows about modifiers like 'c' and 'n'. + if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, OS)) + return false; + + if (!ExtraCode) { + const MachineOperand &MO = MI->getOperand(OpNo); + switch (MO.getType()) { + case MachineOperand::MO_Immediate: + OS << MO.getImm(); + return false; + case MachineOperand::MO_Register: + OS << regToString(MO); + return false; + case MachineOperand::MO_GlobalAddress: + getSymbol(MO.getGlobal())->print(OS, MAI); + printOffset(MO.getOffset(), OS); + return false; + case MachineOperand::MO_ExternalSymbol: + GetExternalSymbolSymbol(MO.getSymbolName())->print(OS, MAI); + printOffset(MO.getOffset(), OS); + return false; + case MachineOperand::MO_MachineBasicBlock: + MO.getMBB()->getSymbol()->print(OS, MAI); + return false; + default: + break; + } + } + + return true; +} + +bool WebAssemblyAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, + unsigned OpNo, + unsigned AsmVariant, + const char *ExtraCode, + raw_ostream &OS) { + if (AsmVariant != 0) + report_fatal_error("There are no defined alternate asm variants"); + + if (!ExtraCode) { + // TODO: For now, we just hard-code 0 as the constant offset; teach + // SelectInlineAsmMemoryOperand how to do address mode matching. + OS << "0(" + regToString(MI->getOperand(OpNo)) + ')'; + return false; + } + + return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, AsmVariant, ExtraCode, OS); +} + +// Force static initialization. 
+extern "C" void LLVMInitializeWebAssemblyAsmPrinter() { + RegisterAsmPrinter<WebAssemblyAsmPrinter> X(TheWebAssemblyTarget32); + RegisterAsmPrinter<WebAssemblyAsmPrinter> Y(TheWebAssemblyTarget64); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp new file mode 100644 index 0000000..e9671ee --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp @@ -0,0 +1,468 @@ +//===-- WebAssemblyCFGStackify.cpp - CFG Stackification -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a CFG stacking pass. +/// +/// This pass reorders the blocks in a function to put them into a reverse +/// post-order [0], with special care to keep the order as similar as possible +/// to the original order, and to keep loops contiguous even in the case of +/// split backedges. +/// +/// Then, it inserts BLOCK and LOOP markers to mark the start of scopes, since +/// scope boundaries serve as the labels for WebAssembly's control transfers. +/// +/// This is sufficient to convert arbitrary CFGs into a form that works on +/// WebAssembly, provided that all loops are single-entry. +/// +/// [0] https://en.wikipedia.org/wiki/Depth-first_search#Vertex_orderings +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblySubtarget.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-cfg-stackify" + +namespace { +class WebAssemblyCFGStackify final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly CFG Stackify"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyCFGStackify() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyCFGStackify::ID = 0; +FunctionPass *llvm::createWebAssemblyCFGStackify() { + return new WebAssemblyCFGStackify(); +} + +static void EliminateMultipleEntryLoops(MachineFunction &MF, + const MachineLoopInfo &MLI) { + SmallPtrSet<MachineBasicBlock *, 8> InSet; + for (scc_iterator<MachineFunction *> I = scc_begin(&MF), E = scc_end(&MF); + I != E; ++I) { + const std::vector<MachineBasicBlock *> &CurrentSCC = *I; + + // Skip trivial SCCs. 
+ if (CurrentSCC.size() == 1) + continue; + + InSet.insert(CurrentSCC.begin(), CurrentSCC.end()); + MachineBasicBlock *Header = nullptr; + for (MachineBasicBlock *MBB : CurrentSCC) { + for (MachineBasicBlock *Pred : MBB->predecessors()) { + if (InSet.count(Pred)) + continue; + if (!Header) { + Header = MBB; + break; + } + // TODO: Implement multiple-entry loops. + report_fatal_error("multiple-entry loops are not supported yet"); + } + } + assert(MLI.isLoopHeader(Header)); + + InSet.clear(); + } +} + +namespace { +/// Post-order traversal stack entry. +struct POStackEntry { + MachineBasicBlock *MBB; + SmallVector<MachineBasicBlock *, 0> Succs; + + POStackEntry(MachineBasicBlock *MBB, MachineFunction &MF, + const MachineLoopInfo &MLI); +}; +} // end anonymous namespace + +static bool LoopContains(const MachineLoop *Loop, + const MachineBasicBlock *MBB) { + return Loop ? Loop->contains(MBB) : true; +} + +POStackEntry::POStackEntry(MachineBasicBlock *MBB, MachineFunction &MF, + const MachineLoopInfo &MLI) + : MBB(MBB), Succs(MBB->successors()) { + // RPO is not a unique form, since at every basic block with multiple + // successors, the DFS has to pick which order to visit the successors in. + // Sort them strategically (see below). + MachineLoop *Loop = MLI.getLoopFor(MBB); + MachineFunction::iterator Next = next(MachineFunction::iterator(MBB)); + MachineBasicBlock *LayoutSucc = Next == MF.end() ? nullptr : &*Next; + std::stable_sort( + Succs.begin(), Succs.end(), + [=, &MLI](const MachineBasicBlock *A, const MachineBasicBlock *B) { + if (A == B) + return false; + + // Keep loops contiguous by preferring the block that's in the same + // loop. + bool LoopContainsA = LoopContains(Loop, A); + bool LoopContainsB = LoopContains(Loop, B); + if (LoopContainsA && !LoopContainsB) + return true; + if (!LoopContainsA && LoopContainsB) + return false; + + // Minimize perturbation by preferring the block which is the immediate + // layout successor. + if (A == LayoutSucc) + return true; + if (B == LayoutSucc) + return false; + + // TODO: More sophisticated orderings may be profitable here. + + return false; + }); +} + +/// Return the "bottom" block of a loop. This differs from +/// MachineLoop::getBottomBlock in that it works even if the loop is +/// discontiguous. +static MachineBasicBlock *LoopBottom(const MachineLoop *Loop) { + MachineBasicBlock *Bottom = Loop->getHeader(); + for (MachineBasicBlock *MBB : Loop->blocks()) + if (MBB->getNumber() > Bottom->getNumber()) + Bottom = MBB; + return Bottom; +} + +/// Sort the blocks in RPO, taking special care to make sure that loops are +/// contiguous even in the case of split backedges. +/// +/// TODO: Determine whether RPO is actually worthwhile, or whether we should +/// move to just a stable-topological-sort-based approach that would preserve +/// more of the original order. +static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI) { + // Note that we do our own RPO rather than using + // "llvm/ADT/PostOrderIterator.h" because we want control over the order that + // successors are visited in (see above). Also, we can sort the blocks in the + // MachineFunction as we go. 
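// Standalone sketch, not part of this patch, of the same idea on an invented
// adjacency-list graph: an explicit-stack DFS whose successor order we control,
// producing a post-order that is then reversed into an RPO.
#include <algorithm>
#include <utility>
#include <vector>

static std::vector<int> CustomRPO(const std::vector<std::vector<int>> &Succs,
                                  int Entry) {
  std::vector<bool> Visited(Succs.size(), false);
  // Each stack entry carries a block and its not-yet-visited successors.
  std::vector<std::pair<int, std::vector<int>>> Stack;
  std::vector<int> PostOrder;

  auto Push = [&](int BB) {
    Visited[BB] = true;
    std::vector<int> Pending = Succs[BB];
    // A custom priority goes here (e.g. prefer same-loop or layout-successor
    // blocks); a stable numeric sort is just a placeholder.
    std::stable_sort(Pending.begin(), Pending.end());
    Stack.emplace_back(BB, std::move(Pending));
  };

  Push(Entry);
  while (!Stack.empty()) {
    auto &Top = Stack.back();
    if (!Top.second.empty()) {
      int S = Top.second.back();
      Top.second.pop_back();
      if (!Visited[S])
        Push(S); // may grow the stack, so do not touch Top afterwards
      continue;
    }
    PostOrder.push_back(Top.first);
    Stack.pop_back();
  }
  std::reverse(PostOrder.begin(), PostOrder.end());
  return PostOrder;
}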
+ SmallPtrSet<MachineBasicBlock *, 16> Visited; + SmallVector<POStackEntry, 16> Stack; + + MachineBasicBlock *EntryBlock = &*MF.begin(); + Visited.insert(EntryBlock); + Stack.push_back(POStackEntry(EntryBlock, MF, MLI)); + + for (;;) { + POStackEntry &Entry = Stack.back(); + SmallVectorImpl<MachineBasicBlock *> &Succs = Entry.Succs; + if (!Succs.empty()) { + MachineBasicBlock *Succ = Succs.pop_back_val(); + if (Visited.insert(Succ).second) + Stack.push_back(POStackEntry(Succ, MF, MLI)); + continue; + } + + // Put the block in its position in the MachineFunction. + MachineBasicBlock &MBB = *Entry.MBB; + MBB.moveBefore(&*MF.begin()); + + // Branch instructions may utilize a fallthrough, so update them if a + // fallthrough has been added or removed. + if (!MBB.empty() && MBB.back().isTerminator() && !MBB.back().isBranch() && + !MBB.back().isBarrier()) + report_fatal_error( + "Non-branch terminator with fallthrough cannot yet be rewritten"); + if (MBB.empty() || !MBB.back().isTerminator() || MBB.back().isBranch()) + MBB.updateTerminator(); + + Stack.pop_back(); + if (Stack.empty()) + break; + } + + // Now that we've sorted the blocks in RPO, renumber them. + MF.RenumberBlocks(); + +#ifndef NDEBUG + SmallSetVector<MachineLoop *, 8> OnStack; + + // Insert a sentinel representing the degenerate loop that starts at the + // function entry block and includes the entire function as a "loop" that + // executes once. + OnStack.insert(nullptr); + + for (auto &MBB : MF) { + assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative."); + + MachineLoop *Loop = MLI.getLoopFor(&MBB); + if (Loop && &MBB == Loop->getHeader()) { + // Loop header. The loop predecessor should be sorted above, and the other + // predecessors should be backedges below. + for (auto Pred : MBB.predecessors()) + assert( + (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) && + "Loop header predecessors must be loop predecessors or backedges"); + assert(OnStack.insert(Loop) && "Loops should be declared at most once."); + } else { + // Not a loop header. All predecessors should be sorted above. + for (auto Pred : MBB.predecessors()) + assert(Pred->getNumber() < MBB.getNumber() && + "Non-loop-header predecessors should be topologically sorted"); + assert(OnStack.count(MLI.getLoopFor(&MBB)) && + "Blocks must be nested in their loops"); + } + while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back())) + OnStack.pop_back(); + } + assert(OnStack.pop_back_val() == nullptr && + "The function entry block shouldn't actually be a loop header"); + assert(OnStack.empty() && + "Control flow stack pushes and pops should be balanced."); +#endif +} + +/// Test whether Pred has any terminators explicitly branching to MBB, as +/// opposed to falling through. Note that it's possible (eg. in unoptimized +/// code) for a branch instruction to both branch to a block and fallthrough +/// to it, so we check the actual branch operands to see if there are any +/// explicit mentions. +static bool ExplicitlyBranchesTo(MachineBasicBlock *Pred, MachineBasicBlock *MBB) { + for (MachineInstr &MI : Pred->terminators()) + for (MachineOperand &MO : MI.explicit_operands()) + if (MO.isMBB() && MO.getMBB() == MBB) + return true; + return false; +} + +/// Insert a BLOCK marker for branches to MBB (if needed). 
+static void PlaceBlockMarker(MachineBasicBlock &MBB, MachineFunction &MF, + SmallVectorImpl<MachineBasicBlock *> &ScopeTops, + const WebAssemblyInstrInfo &TII, + const MachineLoopInfo &MLI, + MachineDominatorTree &MDT) { + // First compute the nearest common dominator of all forward non-fallthrough + // predecessors so that we minimize the time that the BLOCK is on the stack, + // which reduces overall stack height. + MachineBasicBlock *Header = nullptr; + bool IsBranchedTo = false; + int MBBNumber = MBB.getNumber(); + for (MachineBasicBlock *Pred : MBB.predecessors()) + if (Pred->getNumber() < MBBNumber) { + Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred; + if (ExplicitlyBranchesTo(Pred, &MBB)) + IsBranchedTo = true; + } + if (!Header) + return; + if (!IsBranchedTo) + return; + + assert(&MBB != &MF.front() && "Header blocks shouldn't have predecessors"); + MachineBasicBlock *LayoutPred = &*prev(MachineFunction::iterator(&MBB)); + + // If the nearest common dominator is inside a more deeply nested context, + // walk out to the nearest scope which isn't more deeply nested. + for (MachineFunction::iterator I(LayoutPred), E(Header); I != E; --I) { + if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) { + if (ScopeTop->getNumber() > Header->getNumber()) { + // Skip over an intervening scope. + I = next(MachineFunction::iterator(ScopeTop)); + } else { + // We found a scope level at an appropriate depth. + Header = ScopeTop; + break; + } + } + } + + // If there's a loop which ends just before MBB which contains Header, we can + // reuse its label instead of inserting a new BLOCK. + for (MachineLoop *Loop = MLI.getLoopFor(LayoutPred); + Loop && Loop->contains(LayoutPred); Loop = Loop->getParentLoop()) + if (Loop && LoopBottom(Loop) == LayoutPred && Loop->contains(Header)) + return; + + // Decide where in Header to put the BLOCK. + MachineBasicBlock::iterator InsertPos; + MachineLoop *HeaderLoop = MLI.getLoopFor(Header); + if (HeaderLoop && MBB.getNumber() > LoopBottom(HeaderLoop)->getNumber()) { + // Header is the header of a loop that does not lexically contain MBB, so + // the BLOCK needs to be above the LOOP. + InsertPos = Header->begin(); + } else { + // Otherwise, insert the BLOCK as late in Header as we can, but before the + // beginning of the local expression tree and any nested BLOCKs. + InsertPos = Header->getFirstTerminator(); + while (InsertPos != Header->begin() && + prev(InsertPos)->definesRegister(WebAssembly::EXPR_STACK) && + prev(InsertPos)->getOpcode() != WebAssembly::LOOP) + --InsertPos; + } + + // Add the BLOCK. + BuildMI(*Header, InsertPos, DebugLoc(), TII.get(WebAssembly::BLOCK)) + .addMBB(&MBB); + + // Track the farthest-spanning scope that ends at this point. + int Number = MBB.getNumber(); + if (!ScopeTops[Number] || + ScopeTops[Number]->getNumber() > Header->getNumber()) + ScopeTops[Number] = Header; +} + +/// Insert a LOOP marker for a loop starting at MBB (if it's a loop header). +static void PlaceLoopMarker(MachineBasicBlock &MBB, MachineFunction &MF, + SmallVectorImpl<MachineBasicBlock *> &ScopeTops, + const WebAssemblyInstrInfo &TII, + const MachineLoopInfo &MLI) { + MachineLoop *Loop = MLI.getLoopFor(&MBB); + if (!Loop || Loop->getHeader() != &MBB) + return; + + // The operand of a LOOP is the first block after the loop. If the loop is the + // bottom of the function, insert a dummy block at the end. 
+ MachineBasicBlock *Bottom = LoopBottom(Loop); + auto Iter = next(MachineFunction::iterator(Bottom)); + if (Iter == MF.end()) { + MachineBasicBlock *Label = MF.CreateMachineBasicBlock(); + // Give it a fake predecessor so that AsmPrinter prints its label. + Label->addSuccessor(Label); + MF.push_back(Label); + Iter = next(MachineFunction::iterator(Bottom)); + } + MachineBasicBlock *AfterLoop = &*Iter; + BuildMI(MBB, MBB.begin(), DebugLoc(), TII.get(WebAssembly::LOOP)) + .addMBB(AfterLoop); + + // Emit a special no-op telling the asm printer that we need a label to close + // the loop scope, even though the destination is only reachable by + // fallthrough. + if (!Bottom->back().isBarrier()) + BuildMI(*Bottom, Bottom->end(), DebugLoc(), TII.get(WebAssembly::LOOP_END)); + + assert((!ScopeTops[AfterLoop->getNumber()] || + ScopeTops[AfterLoop->getNumber()]->getNumber() < MBB.getNumber()) && + "With RPO we should visit the outer-most loop for a block first."); + if (!ScopeTops[AfterLoop->getNumber()]) + ScopeTops[AfterLoop->getNumber()] = &MBB; +} + +/// Insert LOOP and BLOCK markers at appropriate places. +static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI, + const WebAssemblyInstrInfo &TII, + MachineDominatorTree &MDT) { + // For each block whose label represents the end of a scope, record the block + // which holds the beginning of the scope. This will allow us to quickly skip + // over scoped regions when walking blocks. We allocate one more than the + // number of blocks in the function to accommodate for the possible fake block + // we may insert at the end. + SmallVector<MachineBasicBlock *, 8> ScopeTops(MF.getNumBlockIDs() + 1); + + for (auto &MBB : MF) { + // Place the LOOP for MBB if MBB is the header of a loop. + PlaceLoopMarker(MBB, MF, ScopeTops, TII, MLI); + + // Place the BLOCK for MBB if MBB is branched to from above. + PlaceBlockMarker(MBB, MF, ScopeTops, TII, MLI, MDT); + } +} + +#ifndef NDEBUG +static bool +IsOnStack(const SmallVectorImpl<std::pair<MachineBasicBlock *, bool>> &Stack, + const MachineBasicBlock *MBB) { + for (const auto &Pair : Stack) + if (Pair.first == MBB) + return true; + return false; +} +#endif + +bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** CFG Stackifying **********\n" + "********** Function: " + << MF.getName() << '\n'); + + const auto &MLI = getAnalysis<MachineLoopInfo>(); + auto &MDT = getAnalysis<MachineDominatorTree>(); + const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + + // RPO sorting needs all loops to be single-entry. + EliminateMultipleEntryLoops(MF, MLI); + + // Sort the blocks in RPO, with contiguous loops. + SortBlocks(MF, MLI); + + // Place the BLOCK and LOOP markers to indicate the beginnings of scopes. + PlaceMarkers(MF, MLI, TII, MDT); + +#ifndef NDEBUG + // Verify that block and loop beginnings and endings are in LIFO order, and + // that all references to blocks are to blocks on the stack at the point of + // the reference. 
+ SmallVector<std::pair<MachineBasicBlock *, bool>, 0> Stack; + for (auto &MBB : MF) { + while (!Stack.empty() && Stack.back().first == &MBB) + if (Stack.back().second) { + assert(Stack.size() >= 2); + Stack.pop_back(); + Stack.pop_back(); + } else { + assert(Stack.size() >= 1); + Stack.pop_back(); + } + for (auto &MI : MBB) + switch (MI.getOpcode()) { + case WebAssembly::LOOP: + Stack.push_back(std::make_pair(&MBB, false)); + Stack.push_back(std::make_pair(MI.getOperand(0).getMBB(), true)); + break; + case WebAssembly::BLOCK: + Stack.push_back(std::make_pair(MI.getOperand(0).getMBB(), false)); + break; + default: + // Verify that all referenced blocks are in scope. A reference to a + // block with a negative number is invalid, but can happen with inline + // asm, so we shouldn't assert on it, but instead let CodeGen properly + // fail on it. + for (const MachineOperand &MO : MI.explicit_operands()) + if (MO.isMBB() && MO.getMBB()->getNumber() >= 0) + assert(IsOnStack(Stack, MO.getMBB())); + break; + } + } + assert(Stack.empty()); +#endif + + return true; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp new file mode 100644 index 0000000..1b761b1 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -0,0 +1,81 @@ +//===-- WebAssemblyFastISel.cpp - WebAssembly FastISel implementation -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines the WebAssembly-specific support for the FastISel +/// class. Some of the target-specific code is generated by tablegen in the file +/// WebAssemblyGenFastISel.inc, which is #included here. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblySubtarget.h" +#include "WebAssemblyTargetMachine.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" +#include "llvm/CodeGen/FastISel.h" +#include "llvm/CodeGen/FunctionLoweringInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Operator.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-fastisel" + +namespace { + +class WebAssemblyFastISel final : public FastISel { + /// Keep a pointer to the WebAssemblySubtarget around so that we can make the + /// right decision when generating code for different targets. + const WebAssemblySubtarget *Subtarget; + LLVMContext *Context; + + // Call handling routines. +private: +public: + // Backend specific FastISel code. 
+ WebAssemblyFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) + : FastISel(FuncInfo, LibInfo, /*SkipTargetIndependentISel=*/true) { + Subtarget = &FuncInfo.MF->getSubtarget<WebAssemblySubtarget>(); + Context = &FuncInfo.Fn->getContext(); + } + + bool fastSelectInstruction(const Instruction *I) override; + +#include "WebAssemblyGenFastISel.inc" +}; + +} // end anonymous namespace + +bool WebAssemblyFastISel::fastSelectInstruction(const Instruction *I) { + switch (I->getOpcode()) { + default: + break; + // TODO: add fast-isel selection cases here... + } + + // Fall back to target-independent instruction selection. + return selectOperator(I, I->getOpcode()); +} + +FastISel *WebAssembly::createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) { + return new WebAssemblyFastISel(FuncInfo, LibInfo); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index e4ca82e..0eefd57 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -35,11 +35,20 @@ using namespace llvm; #define DEBUG_TYPE "wasm-frame-info" // TODO: Implement a red zone? +// TODO: wasm64 +// TODO: Prolog/epilog should be stackified too. This pass runs after register +// stackification, so we'll have to do it manually. +// TODO: Emit TargetOpcode::CFI_INSTRUCTION instructions /// Return true if the specified function should have a dedicated frame pointer /// register. bool WebAssemblyFrameLowering::hasFP(const MachineFunction &MF) const { - llvm_unreachable("TODO: implement hasFP"); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const auto *RegInfo = + MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo(); + return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || + MFI->hasStackMap() || MFI->hasPatchPoint() || + RegInfo->needsStackRealignment(MF); } /// Under normal circumstances, when a frame pointer is not required, we reserve @@ -52,23 +61,115 @@ bool WebAssemblyFrameLowering::hasReservedCallFrame( return !MF.getFrameInfo()->hasVarSizedObjects(); } + +/// Adjust the stack pointer by a constant amount. +static void adjustStackPointer(unsigned StackSize, + bool AdjustUp, + MachineFunction& MF, + MachineBasicBlock& MBB, + const TargetInstrInfo* TII, + MachineBasicBlock::iterator InsertPt, + const DebugLoc& DL) { + auto &MRI = MF.getRegInfo(); + unsigned SPReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer"); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), SPReg) + .addExternalSymbol(SPSymbol); + // This MachinePointerInfo should reference __stack_pointer as well but + // doesn't because MachinePointerInfo() takes a GV which we don't have for + // __stack_pointer. TODO: check if PseudoSourceValue::ExternalSymbolCallEntry + // is appropriate instead. (likewise for EmitEpologue below) + auto *LoadMMO = new MachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOLoad, 4, 4); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg) + .addImm(0) + .addReg(SPReg) + .addMemOperand(LoadMMO); + // Add/Subtract the frame size + unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) + .addImm(StackSize); + BuildMI(MBB, InsertPt, DL, + TII->get(AdjustUp ? 
WebAssembly::ADD_I32 : WebAssembly::SUB_I32), + WebAssembly::SP32) + .addReg(SPReg) + .addReg(OffsetReg); + // The SP32 register now has the new stacktop. Also write it back to memory. + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) + .addExternalSymbol(SPSymbol); + auto *MMO = new MachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, 4, 4); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::STORE_I32), WebAssembly::SP32) + .addImm(0) + .addReg(OffsetReg) + .addReg(WebAssembly::SP32) + .addMemOperand(MMO); +} + void WebAssemblyFrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { - llvm_unreachable("TODO: implement eliminateCallFramePseudoInstr"); + const auto *TII = + static_cast<const WebAssemblyInstrInfo*>(MF.getSubtarget().getInstrInfo()); + DebugLoc DL = I->getDebugLoc(); + unsigned Opc = I->getOpcode(); + bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode(); + unsigned Amount = I->getOperand(0).getImm(); + if (Amount) + adjustStackPointer(Amount, IsDestroy, MF, MBB, + TII, I, DL); + MBB.erase(I); } void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { - llvm_unreachable("TODO: implement emitPrologue"); + // TODO: Do ".setMIFlag(MachineInstr::FrameSetup)" on emitted instructions + auto *MFI = MF.getFrameInfo(); + assert(MFI->getCalleeSavedInfo().empty() && + "WebAssembly should not have callee-saved registers"); + assert(!hasFP(MF) && "Functions needing frame pointers not yet supported"); + uint64_t StackSize = MFI->getStackSize(); + if (!StackSize && (!MFI->adjustsStack() || MFI->getMaxCallFrameSize() == 0)) + return; + + const auto *TII = MF.getSubtarget().getInstrInfo(); + + auto InsertPt = MBB.begin(); + DebugLoc DL; + + adjustStackPointer(StackSize, false, MF, MBB, TII, InsertPt, DL); } void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { - llvm_unreachable("TODO: implement emitEpilogue"); -} + uint64_t StackSize = MF.getFrameInfo()->getStackSize(); + if (!StackSize) + return; + const auto *TII = MF.getSubtarget().getInstrInfo(); + auto &MRI = MF.getRegInfo(); + unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + auto InsertPt = MBB.getFirstTerminator(); + DebugLoc DL; + + if (InsertPt != MBB.end()) { + DL = InsertPt->getDebugLoc(); + } -void WebAssemblyFrameLowering::processFunctionBeforeCalleeSavedScan( - MachineFunction &MF, RegScavenger *RS) const { - llvm_unreachable("TODO: implement processFunctionBeforeCalleeSavedScan"); + // Restore the stack pointer. 
Without FP its value is just SP32 - stacksize + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) + .addImm(StackSize); + auto *SPSymbol = MF.createExternalSymbolName("__stack_pointer"); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::ADD_I32), WebAssembly::SP32) + .addReg(WebAssembly::SP32) + .addReg(OffsetReg); + // Re-use OffsetReg to hold the address of the stacktop + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg) + .addExternalSymbol(SPSymbol); + auto *MMO = new MachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, 4, 4); + BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::STORE_I32), WebAssembly::SP32) + .addImm(0) + .addReg(OffsetReg) + .addReg(WebAssembly::SP32) + .addMemOperand(MMO); } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h index 0b112d0..5f4708f 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h @@ -38,9 +38,6 @@ public: bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; - - void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, - RegScavenger *RS) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def new file mode 100644 index 0000000..3a03fa5 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -0,0 +1,25 @@ +//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file describes the various WebAssembly ISD node types. +/// +//===----------------------------------------------------------------------===// + +// NOTE: NO INCLUDE GUARD DESIRED! + +HANDLE_NODETYPE(CALL1) +HANDLE_NODETYPE(CALL0) +HANDLE_NODETYPE(RETURN) +HANDLE_NODETYPE(ARGUMENT) +HANDLE_NODETYPE(Wrapper) +HANDLE_NODETYPE(BR_IF) +HANDLE_NODETYPE(TABLESWITCH) + +// add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here... diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index 518ef33..8390f79 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -56,13 +56,68 @@ public: SDNode *Select(SDNode *Node) override; + bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, + std::vector<SDValue> &OutOps) override; + +// Include the pieces autogenerated from the target description. +#include "WebAssemblyGenDAGISel.inc" + private: // add select functions here... }; } // end anonymous namespace SDNode *WebAssemblyDAGToDAGISel::Select(SDNode *Node) { - llvm_unreachable("TODO: implement Select"); + // Dump information about the Node being selected. + DEBUG(errs() << "Selecting: "); + DEBUG(Node->dump(CurDAG)); + DEBUG(errs() << "\n"); + + // If we have a custom node, we already have selected! 
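// A standalone model of the prologue/epilogue logic above, under the assumption
// that the stack pointer lives in linear memory at a "__stack_pointer" slot:
// adjusting it means load, add or subtract the frame size, and store back. This
// is plain C++ arithmetic, not the MachineInstr-building code.
#include <cassert>
#include <cstdint>

static uint32_t StackPointerMemory = 0x10000; // stands in for __stack_pointer

static uint32_t adjustStackPointer(uint32_t FrameSize, bool AdjustUp) {
  uint32_t SP = StackPointerMemory;                // i32.load  __stack_pointer
  SP = AdjustUp ? SP + FrameSize : SP - FrameSize; // i32.add / i32.sub
  StackPointerMemory = SP;                         // i32.store __stack_pointer
  return SP;                                       // new stack top, kept in SP32
}

int main() {
  uint32_t SP = adjustStackPointer(64, /*AdjustUp=*/false); // prologue: grow down
  assert(SP == 0x10000 - 64);
  SP = adjustStackPointer(64, /*AdjustUp=*/true);           // epilogue: restore
  assert(SP == 0x10000);
}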
+ if (Node->isMachineOpcode()) { + DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); + Node->setNodeId(-1); + return nullptr; + } + + // Few custom selection stuff. + SDNode *ResNode = nullptr; + EVT VT = Node->getValueType(0); + + switch (Node->getOpcode()) { + default: + break; + // If we need WebAssembly-specific selection, it would go here. + (void)VT; + } + + // Select the default instruction. + ResNode = SelectCode(Node); + + DEBUG(errs() << "=> "); + if (ResNode == nullptr || ResNode == Node) + DEBUG(Node->dump(CurDAG)); + else + DEBUG(ResNode->dump(CurDAG)); + DEBUG(errs() << "\n"); + + return ResNode; +} + +bool WebAssemblyDAGToDAGISel::SelectInlineAsmMemoryOperand( + const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) { + switch (ConstraintID) { + case InlineAsm::Constraint_i: + case InlineAsm::Constraint_m: + // We just support simple memory operands that just have a single address + // operand and need no special handling. + OutOps.push_back(Op); + return false; + default: + break; + } + + return true; } /// This pass converts a legalized DAG into a WebAssembly-specific DAG, ready diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4184eb6..7a89f78 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -17,10 +17,13 @@ #include "WebAssemblyMachineFunctionInfo.h" #include "WebAssemblySubtarget.h" #include "WebAssemblyTargetMachine.h" -#include "WebAssemblyTargetObjectFile.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/CommandLine.h" @@ -32,14 +35,254 @@ using namespace llvm; #define DEBUG_TYPE "wasm-lower" +namespace { +// Diagnostic information for unimplemented or unsupported feature reporting. +// TODO: This code is copied from BPF and AMDGPU; consider factoring it out +// and sharing code. +class DiagnosticInfoUnsupported final : public DiagnosticInfo { +private: + // Debug location where this diagnostic is triggered. 
+ DebugLoc DLoc; + const Twine &Description; + const Function &Fn; + SDValue Value; + + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoUnsupported(SDLoc DLoc, const Function &Fn, const Twine &Desc, + SDValue Value) + : DiagnosticInfo(getKindID(), DS_Error), DLoc(DLoc.getDebugLoc()), + Description(Desc), Fn(Fn), Value(Value) {} + + void print(DiagnosticPrinter &DP) const override { + std::string Str; + raw_string_ostream OS(Str); + + if (DLoc) { + auto DIL = DLoc.get(); + StringRef Filename = DIL->getFilename(); + unsigned Line = DIL->getLine(); + unsigned Column = DIL->getColumn(); + OS << Filename << ':' << Line << ':' << Column << ' '; + } + + OS << "in function " << Fn.getName() << ' ' << *Fn.getFunctionType() << '\n' + << Description; + if (Value) + Value->print(OS); + OS << '\n'; + OS.flush(); + DP << Str; + } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +int DiagnosticInfoUnsupported::KindID = 0; +} // end anonymous namespace + WebAssemblyTargetLowering::WebAssemblyTargetLowering( const TargetMachine &TM, const WebAssemblySubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { + auto MVTPtr = Subtarget->hasAddr64() ? MVT::i64 : MVT::i32; + + // Booleans always contain 0 or 1. + setBooleanContents(ZeroOrOneBooleanContent); // WebAssembly does not produce floating-point exceptions on normal floating // point operations. setHasFloatingPointExceptions(false); // We don't know the microarchitecture here, so just reduce register pressure. setSchedulingPreference(Sched::RegPressure); + // Tell ISel that we have a stack pointer. + setStackPointerRegisterToSaveRestore( + Subtarget->hasAddr64() ? WebAssembly::SP64 : WebAssembly::SP32); + // Set up the register classes. + addRegisterClass(MVT::i32, &WebAssembly::I32RegClass); + addRegisterClass(MVT::i64, &WebAssembly::I64RegClass); + addRegisterClass(MVT::f32, &WebAssembly::F32RegClass); + addRegisterClass(MVT::f64, &WebAssembly::F64RegClass); + // Compute derived properties from the register classes. + computeRegisterProperties(Subtarget->getRegisterInfo()); + + setOperationAction(ISD::GlobalAddress, MVTPtr, Custom); + setOperationAction(ISD::ExternalSymbol, MVTPtr, Custom); + setOperationAction(ISD::JumpTable, MVTPtr, Custom); + + // Take the default expansion for va_arg, va_copy, and va_end. There is no + // default action for va_start, so we do that custom. + setOperationAction(ISD::VASTART, MVT::Other, Custom); + setOperationAction(ISD::VAARG, MVT::Other, Expand); + setOperationAction(ISD::VACOPY, MVT::Other, Expand); + setOperationAction(ISD::VAEND, MVT::Other, Expand); + + for (auto T : {MVT::f32, MVT::f64}) { + // Don't expand the floating-point types to constant pools. + setOperationAction(ISD::ConstantFP, T, Legal); + // Expand floating-point comparisons. + for (auto CC : {ISD::SETO, ISD::SETUO, ISD::SETUEQ, ISD::SETONE, + ISD::SETULT, ISD::SETULE, ISD::SETUGT, ISD::SETUGE}) + setCondCodeAction(CC, T, Expand); + // Expand floating-point library function operators. + for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOWI, ISD::FPOW, + ISD::FREM, ISD::FMA}) + setOperationAction(Op, T, Expand); + // Note supported floating-point library function operators that otherwise + // default to expand. 
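// Conceptually, the constructor calls above fill a (operation, type) -> action
// table that the legalizer consults later. A toy standalone sketch of that
// table; the enum values are invented for illustration and the action names
// only mirror the Legal/Expand/Custom meanings used above.
#include <cstdio>
#include <map>
#include <utility>

enum Op { FSIN, FMINNAN, SIGN_EXTEND_INREG };
enum Ty { i32, i64, f32, f64 };
enum Action { Legal, Expand, Custom };

int main() {
  std::map<std::pair<Op, Ty>, Action> Actions;
  for (Ty T : {f32, f64}) {
    Actions[{FSIN, T}] = Expand;   // no native sin: expanded (ends up a libcall)
    Actions[{FMINNAN, T}] = Legal; // wasm min/max propagate NaN, so keep them
  }
  Actions[{SIGN_EXTEND_INREG, i32}] = Expand;
  std::printf("FSIN on f64 -> %s\n",
              Actions[{FSIN, f64}] == Expand ? "Expand" : "other");
}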
+ for (auto Op : + {ISD::FCEIL, ISD::FFLOOR, ISD::FTRUNC, ISD::FNEARBYINT, ISD::FRINT}) + setOperationAction(Op, T, Legal); + // Support minnan and maxnan, which otherwise default to expand. + setOperationAction(ISD::FMINNAN, T, Legal); + setOperationAction(ISD::FMAXNAN, T, Legal); + } + + for (auto T : {MVT::i32, MVT::i64}) { + // Expand unavailable integer operations. + for (auto Op : + {ISD::BSWAP, ISD::ROTL, ISD::ROTR, ISD::SMUL_LOHI, ISD::UMUL_LOHI, + ISD::MULHS, ISD::MULHU, ISD::SDIVREM, ISD::UDIVREM, ISD::SHL_PARTS, + ISD::SRA_PARTS, ISD::SRL_PARTS, ISD::ADDC, ISD::ADDE, ISD::SUBC, + ISD::SUBE}) { + setOperationAction(Op, T, Expand); + } + } + + // As a special case, these operators use the type to mean the type to + // sign-extend from. + for (auto T : {MVT::i1, MVT::i8, MVT::i16, MVT::i32}) + setOperationAction(ISD::SIGN_EXTEND_INREG, T, Expand); + + // Dynamic stack allocation: use the default expansion. + setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVTPtr, Expand); + + setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + + // Expand these forms; we pattern-match the forms that we can handle in isel. + for (auto T : {MVT::i32, MVT::i64, MVT::f32, MVT::f64}) + for (auto Op : {ISD::BR_CC, ISD::SELECT_CC}) + setOperationAction(Op, T, Expand); + + // We have custom switch handling. + setOperationAction(ISD::BR_JT, MVT::Other, Custom); + + // WebAssembly doesn't have: + // - Floating-point extending loads. + // - Floating-point truncating stores. + // - i1 extending loads. + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f32, Expand); + for (auto T : MVT::integer_valuetypes()) + for (auto Ext : {ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD}) + setLoadExtAction(Ext, T, MVT::i1, Promote); + + // Trap lowers to wasm unreachable + setOperationAction(ISD::TRAP, MVT::Other, Legal); +} + +FastISel *WebAssemblyTargetLowering::createFastISel( + FunctionLoweringInfo &FuncInfo, const TargetLibraryInfo *LibInfo) const { + return WebAssembly::createFastISel(FuncInfo, LibInfo); +} + +bool WebAssemblyTargetLowering::isOffsetFoldingLegal( + const GlobalAddressSDNode * /*GA*/) const { + // All offsets can be folded. + return true; +} + +MVT WebAssemblyTargetLowering::getScalarShiftAmountTy(const DataLayout & /*DL*/, + EVT VT) const { + unsigned BitWidth = NextPowerOf2(VT.getSizeInBits() - 1); + if (BitWidth > 1 && BitWidth < 8) + BitWidth = 8; + + if (BitWidth > 64) { + BitWidth = 64; + assert(BitWidth >= Log2_32_Ceil(VT.getSizeInBits()) && + "64-bit shift counts ought to be enough for anyone"); + } + + MVT Result = MVT::getIntegerVT(BitWidth); + assert(Result != MVT::INVALID_SIMPLE_VALUE_TYPE && + "Unable to represent scalar shift amount type"); + return Result; +} + +const char * +WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { + case WebAssemblyISD::FIRST_NUMBER: + break; +#define HANDLE_NODETYPE(NODE) \ + case WebAssemblyISD::NODE: \ + return "WebAssemblyISD::" #NODE; +#include "WebAssemblyISD.def" +#undef HANDLE_NODETYPE + } + return nullptr; +} + +std::pair<unsigned, const TargetRegisterClass *> +WebAssemblyTargetLowering::getRegForInlineAsmConstraint( + const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { + // First, see if this is a constraint that directly corresponds to a + // WebAssembly register class. 
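// A standalone restatement of the shift-amount-type arithmetic above: round the
// value type's width up to a power of two, clamp tiny widths to 8 bits and
// anything above 64 down to 64. The helper mimics the behavior of LLVM's
// NextPowerOf2 (strictly greater power of two) without using the real utility.
#include <cassert>

static unsigned nextPowerOf2(unsigned A) {
  unsigned P = 1;
  while (P <= A)
    P <<= 1;
  return P; // smallest power of two strictly greater than A (0 -> 1)
}

static unsigned shiftAmountBits(unsigned ValueBits) {
  unsigned BitWidth = nextPowerOf2(ValueBits - 1);
  if (BitWidth > 1 && BitWidth < 8)
    BitWidth = 8;
  if (BitWidth > 64)
    BitWidth = 64;
  return BitWidth;
}

int main() {
  assert(shiftAmountBits(32) == 32); // i32 shifts take an i32 amount
  assert(shiftAmountBits(64) == 64); // i64 shifts take an i64 amount
  assert(shiftAmountBits(7) == 8);   // odd widths round up to at least 8
}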
+ if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'r': + assert(VT != MVT::iPTR && "Pointer MVT not expected here"); + if (VT.isInteger() && !VT.isVector()) { + if (VT.getSizeInBits() <= 32) + return std::make_pair(0U, &WebAssembly::I32RegClass); + if (VT.getSizeInBits() <= 64) + return std::make_pair(0U, &WebAssembly::I64RegClass); + } + break; + default: + break; + } + } + + return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); +} + +bool WebAssemblyTargetLowering::isCheapToSpeculateCttz() const { + // Assume ctz is a relatively cheap operation. + return true; +} + +bool WebAssemblyTargetLowering::isCheapToSpeculateCtlz() const { + // Assume clz is a relatively cheap operation. + return true; +} + +bool WebAssemblyTargetLowering::isLegalAddressingMode(const DataLayout &DL, + const AddrMode &AM, + Type *Ty, + unsigned AS) const { + // WebAssembly offsets are added as unsigned without wrapping. The + // isLegalAddressingMode gives us no way to determine if wrapping could be + // happening, so we approximate this by accepting only non-negative offsets. + if (AM.BaseOffs < 0) + return false; + + // WebAssembly has no scale register operands. + if (AM.Scale != 0) + return false; + + // Everything else is legal. + return true; } //===----------------------------------------------------------------------===// @@ -50,16 +293,359 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // Lowering Code //===----------------------------------------------------------------------===// +static void fail(SDLoc DL, SelectionDAG &DAG, const char *msg) { + MachineFunction &MF = DAG.getMachineFunction(); + DAG.getContext()->diagnose( + DiagnosticInfoUnsupported(DL, *MF.getFunction(), msg, SDValue())); +} + +// Test whether the given calling convention is supported. +static bool CallingConvSupported(CallingConv::ID CallConv) { + // We currently support the language-independent target-independent + // conventions. We don't yet have a way to annotate calls with properties like + // "cold", and we don't have any call-clobbered registers, so these are mostly + // all handled the same. + return CallConv == CallingConv::C || CallConv == CallingConv::Fast || + CallConv == CallingConv::Cold || + CallConv == CallingConv::PreserveMost || + CallConv == CallingConv::PreserveAll || + CallConv == CallingConv::CXX_FAST_TLS; +} + +SDValue +WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const { + SelectionDAG &DAG = CLI.DAG; + SDLoc DL = CLI.DL; + SDValue Chain = CLI.Chain; + SDValue Callee = CLI.Callee; + MachineFunction &MF = DAG.getMachineFunction(); + + CallingConv::ID CallConv = CLI.CallConv; + if (!CallingConvSupported(CallConv)) + fail(DL, DAG, + "WebAssembly doesn't support language-specific or target-specific " + "calling conventions yet"); + if (CLI.IsPatchPoint) + fail(DL, DAG, "WebAssembly doesn't support patch point yet"); + + // WebAssembly doesn't currently support explicit tail calls. If they are + // required, fail. Otherwise, just disable them. 
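// A standalone sketch of the addressing-mode filter above: WebAssembly
// addresses are "base + unsigned offset" with no scaled index register, so only
// non-negative offsets with Scale == 0 are accepted. The struct is a simplified
// stand-in for TargetLowering::AddrMode.
#include <cassert>
#include <cstdint>

struct AddrMode {
  int64_t BaseOffs;
  int64_t Scale; // multiplier for an index register; wasm has none
};

static bool isLegalWasmAddrMode(const AddrMode &AM) {
  if (AM.BaseOffs < 0)
    return false; // offsets are added without wrapping, so keep them >= 0
  if (AM.Scale != 0)
    return false; // no scale register operands
  return true;
}

int main() {
  assert(isLegalWasmAddrMode({16, 0}));
  assert(!isLegalWasmAddrMode({-8, 0}));
  assert(!isLegalWasmAddrMode({0, 4}));
}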
+ if ((CallConv == CallingConv::Fast && CLI.IsTailCall && + MF.getTarget().Options.GuaranteedTailCallOpt) || + (CLI.CS && CLI.CS->isMustTailCall())) + fail(DL, DAG, "WebAssembly doesn't support tail call yet"); + CLI.IsTailCall = false; + + SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; + + SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; + if (Ins.size() > 1) + fail(DL, DAG, "WebAssembly doesn't support more than 1 returned value yet"); + + SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; + for (const ISD::OutputArg &Out : Outs) { + if (Out.Flags.isByVal()) + fail(DL, DAG, "WebAssembly hasn't implemented byval arguments"); + if (Out.Flags.isNest()) + fail(DL, DAG, "WebAssembly hasn't implemented nest arguments"); + if (Out.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments"); + if (Out.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments"); + if (Out.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments"); + } + + bool IsVarArg = CLI.IsVarArg; + unsigned NumFixedArgs = CLI.NumFixedArgs; + auto PtrVT = getPointerTy(MF.getDataLayout()); + + // Analyze operands of the call, assigning locations to each operand. + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); + + if (IsVarArg) { + // Outgoing non-fixed arguments are placed at the top of the stack. First + // compute their offsets and the total amount of argument stack space + // needed. + for (SDValue Arg : + make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) { + EVT VT = Arg.getValueType(); + assert(VT != MVT::iPTR && "Legalized args should be concrete"); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + unsigned Offset = + CCInfo.AllocateStack(MF.getDataLayout().getTypeAllocSize(Ty), + MF.getDataLayout().getABITypeAlignment(Ty)); + CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(), + Offset, VT.getSimpleVT(), + CCValAssign::Full)); + } + } + + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); + + SDValue NB; + if (NumBytes) { + NB = DAG.getConstant(NumBytes, DL, PtrVT, true); + Chain = DAG.getCALLSEQ_START(Chain, NB, DL); + } + + if (IsVarArg) { + // For non-fixed arguments, next emit stores to store the argument values + // to the stack at the offsets computed above. + SDValue SP = DAG.getCopyFromReg( + Chain, DL, getStackPointerRegisterToSaveRestore(), PtrVT); + unsigned ValNo = 0; + SmallVector<SDValue, 8> Chains; + for (SDValue Arg : + make_range(OutVals.begin() + NumFixedArgs, OutVals.end())) { + assert(ArgLocs[ValNo].getValNo() == ValNo && + "ArgLocs should remain in order and only hold varargs args"); + unsigned Offset = ArgLocs[ValNo++].getLocMemOffset(); + SDValue Add = DAG.getNode(ISD::ADD, DL, PtrVT, SP, + DAG.getConstant(Offset, DL, PtrVT)); + Chains.push_back(DAG.getStore(Chain, DL, Arg, Add, + MachinePointerInfo::getStack(MF, Offset), + false, false, 0)); + } + if (!Chains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); + } + + // Compute the operands for the CALLn node. + SmallVector<SDValue, 16> Ops; + Ops.push_back(Chain); + Ops.push_back(Callee); + + // Add all fixed arguments. Note that for non-varargs calls, NumFixedArgs + // isn't reliable. + Ops.append(OutVals.begin(), + IsVarArg ? 
OutVals.begin() + NumFixedArgs : OutVals.end()); + + SmallVector<EVT, 8> Tys; + for (const auto &In : Ins) { + assert(!In.Flags.isByVal() && "byval is not valid for return values"); + assert(!In.Flags.isNest() && "nest is not valid for return values"); + if (In.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca return values"); + if (In.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs return values"); + if (In.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, + "WebAssembly hasn't implemented cons regs last return values"); + // Ignore In.getOrigAlign() because all our arguments are passed in + // registers. + Tys.push_back(In.VT); + } + Tys.push_back(MVT::Other); + SDVTList TyList = DAG.getVTList(Tys); + SDValue Res = + DAG.getNode(Ins.empty() ? WebAssemblyISD::CALL0 : WebAssemblyISD::CALL1, + DL, TyList, Ops); + if (Ins.empty()) { + Chain = Res; + } else { + InVals.push_back(Res); + Chain = Res.getValue(1); + } + + if (NumBytes) { + SDValue Unused = DAG.getTargetConstant(0, DL, PtrVT); + Chain = DAG.getCALLSEQ_END(Chain, NB, Unused, SDValue(), DL); + } + + return Chain; +} + +bool WebAssemblyTargetLowering::CanLowerReturn( + CallingConv::ID /*CallConv*/, MachineFunction & /*MF*/, bool /*IsVarArg*/, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext & /*Context*/) const { + // WebAssembly can't currently handle returning tuples. + return Outs.size() <= 1; +} + +SDValue WebAssemblyTargetLowering::LowerReturn( + SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, SDLoc DL, + SelectionDAG &DAG) const { + assert(Outs.size() <= 1 && "WebAssembly can only return up to one value"); + if (!CallingConvSupported(CallConv)) + fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); + + SmallVector<SDValue, 4> RetOps(1, Chain); + RetOps.append(OutVals.begin(), OutVals.end()); + Chain = DAG.getNode(WebAssemblyISD::RETURN, DL, MVT::Other, RetOps); + + // Record the number and types of the return values. + for (const ISD::OutputArg &Out : Outs) { + assert(!Out.Flags.isByVal() && "byval is not valid for return values"); + assert(!Out.Flags.isNest() && "nest is not valid for return values"); + assert(Out.IsFixed && "non-fixed return value is not valid"); + if (Out.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca results"); + if (Out.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs results"); + if (Out.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs last results"); + } + + return Chain; +} + +SDValue WebAssemblyTargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool /*IsVarArg*/, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + MachineFunction &MF = DAG.getMachineFunction(); + + if (!CallingConvSupported(CallConv)) + fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); + + // Set up the incoming ARGUMENTS value, which serves to represent the liveness + // of the incoming values before they're represented by virtual registers. 
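// The varargs handling in LowerCall above assigns each non-fixed argument a
// slot at an aligned offset in the outgoing-argument area. A standalone model
// of that bookkeeping; the argument sizes, alignments, and the final 8-byte
// frame alignment are assumptions for illustration only.
#include <cstdio>
#include <vector>

struct Slot { unsigned Size, Align; };

static unsigned alignTo(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // e.g. printf("%d %f", i, d): an i32 then an f64 passed through varargs.
  std::vector<Slot> Args = {{4, 4}, {8, 8}};
  unsigned Offset = 0;
  for (const Slot &S : Args) {
    Offset = alignTo(Offset, S.Align);
    std::printf("arg at offset %u\n", Offset); // prints 0, then 8
    Offset += S.Size;
  }
  std::printf("call frame size: %u\n", alignTo(Offset, 8)); // prints 16
}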
+ MF.getRegInfo().addLiveIn(WebAssembly::ARGUMENTS); + + for (const ISD::InputArg &In : Ins) { + if (In.Flags.isByVal()) + fail(DL, DAG, "WebAssembly hasn't implemented byval arguments"); + if (In.Flags.isInAlloca()) + fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments"); + if (In.Flags.isNest()) + fail(DL, DAG, "WebAssembly hasn't implemented nest arguments"); + if (In.Flags.isInConsecutiveRegs()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments"); + if (In.Flags.isInConsecutiveRegsLast()) + fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments"); + // Ignore In.getOrigAlign() because all our arguments are passed in + // registers. + InVals.push_back( + In.Used + ? DAG.getNode(WebAssemblyISD::ARGUMENT, DL, In.VT, + DAG.getTargetConstant(InVals.size(), DL, MVT::i32)) + : DAG.getUNDEF(In.VT)); + + // Record the number and types of arguments. + MF.getInfo<WebAssemblyFunctionInfo>()->addParam(In.VT); + } + + // Incoming varargs arguments are on the stack and will be accessed through + // va_arg, so we don't need to do anything for them here. + + return Chain; +} + //===----------------------------------------------------------------------===// -// Other Lowering Code +// Custom lowering hooks. //===----------------------------------------------------------------------===// +SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op, + SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: + llvm_unreachable("unimplemented operation lowering"); + return SDValue(); + case ISD::FrameIndex: + return LowerFrameIndex(Op, DAG); + case ISD::GlobalAddress: + return LowerGlobalAddress(Op, DAG); + case ISD::ExternalSymbol: + return LowerExternalSymbol(Op, DAG); + case ISD::JumpTable: + return LowerJumpTable(Op, DAG); + case ISD::BR_JT: + return LowerBR_JT(Op, DAG); + case ISD::VASTART: + return LowerVASTART(Op, DAG); + } +} + +SDValue WebAssemblyTargetLowering::LowerFrameIndex(SDValue Op, + SelectionDAG &DAG) const { + int FI = cast<FrameIndexSDNode>(Op)->getIndex(); + return DAG.getTargetFrameIndex(FI, Op.getValueType()); +} + +SDValue WebAssemblyTargetLowering::LowerGlobalAddress(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + const auto *GA = cast<GlobalAddressSDNode>(Op); + EVT VT = Op.getValueType(); + assert(GA->getTargetFlags() == 0 && "WebAssembly doesn't set target flags"); + if (GA->getAddressSpace() != 0) + fail(DL, DAG, "WebAssembly only expects the 0 address space"); + return DAG.getNode( + WebAssemblyISD::Wrapper, DL, VT, + DAG.getTargetGlobalAddress(GA->getGlobal(), DL, VT, GA->getOffset())); +} + +SDValue +WebAssemblyTargetLowering::LowerExternalSymbol(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + const auto *ES = cast<ExternalSymbolSDNode>(Op); + EVT VT = Op.getValueType(); + assert(ES->getTargetFlags() == 0 && "WebAssembly doesn't set target flags"); + return DAG.getNode(WebAssemblyISD::Wrapper, DL, VT, + DAG.getTargetExternalSymbol(ES->getSymbol(), VT)); +} + +SDValue WebAssemblyTargetLowering::LowerJumpTable(SDValue Op, + SelectionDAG &DAG) const { + // There's no need for a Wrapper node because we always incorporate a jump + // table operand into a TABLESWITCH instruction, rather than ever + // materializing it in a register. 
+ const JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + return DAG.getTargetJumpTable(JT->getIndex(), Op.getValueType(), + JT->getTargetFlags()); +} + +SDValue WebAssemblyTargetLowering::LowerBR_JT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + const auto *JT = cast<JumpTableSDNode>(Op.getOperand(1)); + SDValue Index = Op.getOperand(2); + assert(JT->getTargetFlags() == 0 && "WebAssembly doesn't set target flags"); + + SmallVector<SDValue, 8> Ops; + Ops.push_back(Chain); + Ops.push_back(Index); + + MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo(); + const auto &MBBs = MJTI->getJumpTables()[JT->getIndex()].MBBs; + + // TODO: For now, we just pick something arbitrary for a default case for now. + // We really want to sniff out the guard and put in the real default case (and + // delete the guard). + Ops.push_back(DAG.getBasicBlock(MBBs[0])); + + // Add an operand for each case. + for (auto MBB : MBBs) + Ops.push_back(DAG.getBasicBlock(MBB)); + + return DAG.getNode(WebAssemblyISD::TABLESWITCH, DL, MVT::Other, Ops); +} + +SDValue WebAssemblyTargetLowering::LowerVASTART(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT PtrVT = getPointerTy(DAG.getMachineFunction().getDataLayout()); + + // The incoming non-fixed arguments are placed on the top of the stack, with + // natural alignment, at the point of the call, so the base pointer is just + // the current frame pointer. + DAG.getMachineFunction().getFrameInfo()->setFrameAddressIsTaken(true); + unsigned FP = + Subtarget->getRegisterInfo()->getFrameRegister(DAG.getMachineFunction()); + SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, FP, PtrVT); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FrameAddr, Op.getOperand(1), + MachinePointerInfo(SV), false, false, 0); +} + //===----------------------------------------------------------------------===// // WebAssembly Optimization Hooks //===----------------------------------------------------------------------===// - -MCSection *WebAssemblyTargetObjectFile::SelectSectionForGlobal( - const GlobalValue *GV, SectionKind Kind, Mangler &Mang, - const TargetMachine &TM) const { - return getDataSection(); -} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index efd60a7..e7232a0 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -22,10 +22,11 @@ namespace llvm { namespace WebAssemblyISD { -enum { +enum NodeType : unsigned { FIRST_NUMBER = ISD::BUILTIN_OP_END, - - // add memory opcodes starting at ISD::FIRST_TARGET_MEMORY_OPCODE here... +#define HANDLE_NODETYPE(NODE) NODE, +#include "WebAssemblyISD.def" +#undef HANDLE_NODETYPE }; } // end namespace WebAssemblyISD @@ -42,8 +43,51 @@ private: /// Keep a pointer to the WebAssemblySubtarget around so that we can make the /// right decision when generating code for different targets. 
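// WebAssemblyISD.def together with the HANDLE_NODETYPE uses above and in
// getTargetNodeName is a classic X-macro: one list expanded under different
// macro definitions. A self-contained sketch of the pattern; the node list
// lives in a macro here only so the example compiles on its own.
#include <cstdio>

#define WASM_NODETYPES \
  HANDLE_NODETYPE(CALL1) \
  HANDLE_NODETYPE(CALL0) \
  HANDLE_NODETYPE(RETURN)

enum NodeType {
#define HANDLE_NODETYPE(NODE) NODE,
  WASM_NODETYPES
#undef HANDLE_NODETYPE
};

static const char *nodeName(NodeType N) {
  switch (N) {
#define HANDLE_NODETYPE(NODE) case NODE: return "WebAssemblyISD::" #NODE;
  WASM_NODETYPES
#undef HANDLE_NODETYPE
  }
  return nullptr;
}

int main() { std::printf("%s\n", nodeName(RETURN)); }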
const WebAssemblySubtarget *Subtarget; + + FastISel *createFastISel(FunctionLoweringInfo &FuncInfo, + const TargetLibraryInfo *LibInfo) const override; + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; + const char *getTargetNodeName(unsigned Opcode) const override; + std::pair<unsigned, const TargetRegisterClass *> + getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, + StringRef Constraint, MVT VT) const override; + bool isCheapToSpeculateCttz() const override; + bool isCheapToSpeculateCtlz() const override; + bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, + unsigned AS) const override; + + SDValue LowerCall(CallLoweringInfo &CLI, + SmallVectorImpl<SDValue> &InVals) const override; + bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + LLVMContext &Context) const override; + SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, SDLoc dl, + SelectionDAG &DAG) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool IsVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + SDLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; + + // Custom lowering hooks. + SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; }; +namespace WebAssembly { +FastISel *createFastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo); +} // end namespace WebAssembly + } // end namespace llvm #endif diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 6b5b6cd..cfa1519 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -12,10 +12,63 @@ /// //===----------------------------------------------------------------------===// -/* - * TODO(jfb): Add the following. - * - * call_direct: call function directly - * call_indirect: call function indirectly - * addressof: obtain a function pointer value for a given function - */ +// TODO: addr64: These currently assume the callee address is 32-bit. + +let Defs = [ARGUMENTS] in { + +// Call sequence markers. These have an immediate which represents the amount of +// stack space to allocate or free, which is used for varargs lowering. 
+let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in { +def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt), + [(WebAssemblycallseq_start timm:$amt)]>; +def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2), + [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>; +} // isCodeGenOnly = 1 + +multiclass CALL<WebAssemblyRegClass vt, string prefix> { + def CALL_#vt : I<(outs vt:$dst), (ins i32imm:$callee, variable_ops), + [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))], + !strconcat(prefix, "call\t$dst, $callee")>; + def CALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops), + [(set vt:$dst, (WebAssemblycall1 I32:$callee))], + !strconcat(prefix, "call_indirect\t$dst, $callee")>; +} +let Uses = [SP32, SP64], isCall = 1 in { + defm : CALL<I32, "i32.">; + defm : CALL<I64, "i64.">; + defm : CALL<F32, "f32.">; + defm : CALL<F64, "f64.">; + + def CALL_VOID : I<(outs), (ins i32imm:$callee, variable_ops), + [(WebAssemblycall0 (i32 imm:$callee))], + "call \t$callee">; + def CALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops), + [(WebAssemblycall0 I32:$callee)], + "call_indirect\t$callee">; +} // Uses = [SP32,SP64], isCall = 1 + +} // Defs = [ARGUMENTS] + +// Patterns for matching a direct call to a global address. +def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_I32 tglobaladdr:$callee)>; +def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_I64 tglobaladdr:$callee)>; +def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_F32 tglobaladdr:$callee)>; +def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))), + (CALL_F64 tglobaladdr:$callee)>; +def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)), + (CALL_VOID tglobaladdr:$callee)>; + +// Patterns for matching a direct call to an external symbol. +def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_I32 texternalsym:$callee)>; +def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_I64 texternalsym:$callee)>; +def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_F32 texternalsym:$callee)>; +def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))), + (CALL_F64 texternalsym:$callee)>; +def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)), + (CALL_VOID texternalsym:$callee)>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td new file mode 100644 index 0000000..05efe89 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td @@ -0,0 +1,82 @@ +//===- WebAssemblyInstrControl.td-WebAssembly control-flow ------*- tablegen -*- +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief WebAssembly control-flow code-gen constructs. +/// +//===----------------------------------------------------------------------===// + +let Defs = [ARGUMENTS] in { + +let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in { +// The condition operand is a boolean value which WebAssembly represents as i32. 
+def BR_IF : I<(outs), (ins I32:$cond, bb_op:$dst), + [(brcond I32:$cond, bb:$dst)], + "br_if \t$cond, $dst">; +let isCodeGenOnly = 1 in +def BR_UNLESS : I<(outs), (ins I32:$cond, bb_op:$dst), [], + "br_unless\t$cond, $dst">; +let isBarrier = 1 in { +def BR : I<(outs), (ins bb_op:$dst), + [(br bb:$dst)], + "br \t$dst">; +} // isBarrier = 1 +} // isBranch = 1, isTerminator = 1, hasCtrlDep = 1 + +} // Defs = [ARGUMENTS] + +def : Pat<(brcond (i32 (setne I32:$cond, 0)), bb:$dst), + (BR_IF I32:$cond, bb_op:$dst)>; +def : Pat<(brcond (i32 (seteq I32:$cond, 0)), bb:$dst), + (BR_UNLESS I32:$cond, bb_op:$dst)>; + +let Defs = [ARGUMENTS] in { + +// TODO: SelectionDAG's lowering insists on using a pointer as the index for +// jump tables, so in practice we don't ever use TABLESWITCH_I64 in wasm32 mode +// currently. +let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { +def TABLESWITCH_I32 : I<(outs), (ins I32:$index, bb_op:$default, variable_ops), + [(WebAssemblytableswitch I32:$index, bb:$default)], + "tableswitch\t$index, $default">; +def TABLESWITCH_I64 : I<(outs), (ins I64:$index, bb_op:$default, variable_ops), + [(WebAssemblytableswitch I64:$index, bb:$default)], + "tableswitch\t$index, $default">; +} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 + +// Placemarkers to indicate the start of a block or loop scope. These +// use/clobber EXPR_STACK to prevent them from being moved into the middle of +// an expression tree. +let Uses = [EXPR_STACK], Defs = [EXPR_STACK] in { +def BLOCK : I<(outs), (ins bb_op:$dst), [], "block \t$dst">; +def LOOP : I<(outs), (ins bb_op:$dst), [], "loop \t$dst">; +} // Uses = [EXPR_STACK], Defs = [EXPR_STACK] + +// No-op to indicate to the AsmPrinter that a loop ends here, so a +// basic block label is needed even if it wouldn't otherwise appear so. +let isTerminator = 1, hasCtrlDep = 1 in +def LOOP_END : I<(outs), (ins), []>; + +multiclass RETURN<WebAssemblyRegClass vt> { + def RETURN_#vt : I<(outs), (ins vt:$val), [(WebAssemblyreturn vt:$val)], + "return \t$val">; +} + +let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in { +let isReturn = 1 in { + defm : RETURN<I32>; + defm : RETURN<I64>; + defm : RETURN<F32>; + defm : RETURN<F64>; + def RETURN_VOID : I<(outs), (ins), [(WebAssemblyreturn)], "return">; +} // isReturn = 1 + def UNREACHABLE : I<(outs), (ins), [(trap)], "unreachable">; +} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 + +} // Defs = [ARGUMENTS] diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td index 3fa2906..931f4a9 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrConv.td @@ -13,32 +13,99 @@ /// //===----------------------------------------------------------------------===// -/* - * TODO(jfb): Add the following. 
- * - * int32.wrap[int64]: wrap a 64-bit integer to a 32-bit integer - * int32.trunc_signed[float32]: truncate a 32-bit float to a signed 32-bit integer - * int32.trunc_signed[float64]: truncate a 64-bit float to a signed 32-bit integer - * int32.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 32-bit integer - * int32.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 32-bit integer - * int32.reinterpret[float32]: reinterpret the bits of a 32-bit float as a 32-bit integer - * int64.extend_signed[int32]: extend a signed 32-bit integer to a 64-bit integer - * int64.extend_unsigned[int32]: extend an unsigned 32-bit integer to a 64-bit integer - * int64.trunc_signed[float32]: truncate a 32-bit float to a signed 64-bit integer - * int64.trunc_signed[float64]: truncate a 64-bit float to a signed 64-bit integer - * int64.trunc_unsigned[float32]: truncate a 32-bit float to an unsigned 64-bit integer - * int64.trunc_unsigned[float64]: truncate a 64-bit float to an unsigned 64-bit integer - * int64.reinterpret[float64]: reinterpret the bits of a 64-bit float as a 64-bit integer - * float32.demote[float64]: demote a 64-bit float to a 32-bit float - * float32.cvt_signed[int32]: convert a signed 32-bit integer to a 32-bit float - * float32.cvt_signed[int64]: convert a signed 64-bit integer to a 32-bit float - * float32.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 32-bit float - * float32.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 32-bit float - * float32.reinterpret[int32]: reinterpret the bits of a 32-bit integer as a 32-bit float - * float64.promote[float32]: promote a 32-bit float to a 64-bit float - * float64.cvt_signed[int32]: convert a signed 32-bit integer to a 64-bit float - * float64.cvt_signed[int64]: convert a signed 64-bit integer to a 64-bit float - * float64.cvt_unsigned[int32]: convert an unsigned 32-bit integer to a 64-bit float - * float64.cvt_unsigned[int64]: convert an unsigned 64-bit integer to a 64-bit float - * float64.reinterpret[int64]: reinterpret the bits of a 64-bit integer as a 64-bit float - */ +let Defs = [ARGUMENTS] in { + +def I32_WRAP_I64 : I<(outs I32:$dst), (ins I64:$src), + [(set I32:$dst, (trunc I64:$src))], + "i32.wrap/i64\t$dst, $src">; + +def I64_EXTEND_S_I32 : I<(outs I64:$dst), (ins I32:$src), + [(set I64:$dst, (sext I32:$src))], + "i64.extend_s/i32\t$dst, $src">; +def I64_EXTEND_U_I32 : I<(outs I64:$dst), (ins I32:$src), + [(set I64:$dst, (zext I32:$src))], + "i64.extend_u/i32\t$dst, $src">; + +} // defs = [ARGUMENTS] + +// Expand a "don't care" extend into zero-extend (chosen over sign-extend +// somewhat arbitrarily, although it favors popular hardware architectures +// and is conceptually a simpler operation). +def : Pat<(i64 (anyext I32:$src)), (I64_EXTEND_U_I32 I32:$src)>; + +let Defs = [ARGUMENTS] in { + +// Conversion from floating point to integer traps on overflow and invalid. 
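// A small standalone illustration of the integer conversion semantics defined
// above: i32.wrap/i64 truncates, i64.extend_s/_u sign- or zero-extend, and a
// "don't care" any-extend is mapped to the zero-extending form. Plain C++
// arithmetic, not TableGen.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Big = 0xFFFFFFFF80000000ULL;
  uint32_t Wrapped = static_cast<uint32_t>(Big);                      // i32.wrap/i64
  assert(Wrapped == 0x80000000u);

  int32_t Neg = -1;
  uint64_t SExt = static_cast<uint64_t>(static_cast<int64_t>(Neg));   // i64.extend_s/i32
  uint64_t ZExt = static_cast<uint64_t>(static_cast<uint32_t>(Neg));  // i64.extend_u/i32
  assert(SExt == 0xFFFFFFFFFFFFFFFFULL);
  assert(ZExt == 0x00000000FFFFFFFFULL);

  // anyext: the high bits don't matter, so either form works; the pattern
  // above picks the zero-extending one.
  assert((ZExt & 0xFFFFFFFFu) == 0xFFFFFFFFu); // low 32 bits preserved
}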
+let hasSideEffects = 1 in { +def I32_TRUNC_S_F32 : I<(outs I32:$dst), (ins F32:$src), + [(set I32:$dst, (fp_to_sint F32:$src))], + "i32.trunc_s/f32\t$dst, $src">; +def I32_TRUNC_U_F32 : I<(outs I32:$dst), (ins F32:$src), + [(set I32:$dst, (fp_to_uint F32:$src))], + "i32.trunc_u/f32\t$dst, $src">; +def I64_TRUNC_S_F32 : I<(outs I64:$dst), (ins F32:$src), + [(set I64:$dst, (fp_to_sint F32:$src))], + "i64.trunc_s/f32\t$dst, $src">; +def I64_TRUNC_U_F32 : I<(outs I64:$dst), (ins F32:$src), + [(set I64:$dst, (fp_to_uint F32:$src))], + "i64.trunc_u/f32\t$dst, $src">; +def I32_TRUNC_S_F64 : I<(outs I32:$dst), (ins F64:$src), + [(set I32:$dst, (fp_to_sint F64:$src))], + "i32.trunc_s/f64\t$dst, $src">; +def I32_TRUNC_U_F64 : I<(outs I32:$dst), (ins F64:$src), + [(set I32:$dst, (fp_to_uint F64:$src))], + "i32.trunc_u/f64\t$dst, $src">; +def I64_TRUNC_S_F64 : I<(outs I64:$dst), (ins F64:$src), + [(set I64:$dst, (fp_to_sint F64:$src))], + "i64.trunc_s/f64\t$dst, $src">; +def I64_TRUNC_U_F64 : I<(outs I64:$dst), (ins F64:$src), + [(set I64:$dst, (fp_to_uint F64:$src))], + "i64.trunc_u/f64\t$dst, $src">; +} // hasSideEffects = 1 + +def F32_CONVERT_S_I32 : I<(outs F32:$dst), (ins I32:$src), + [(set F32:$dst, (sint_to_fp I32:$src))], + "f32.convert_s/i32\t$dst, $src">; +def F32_CONVERT_U_I32 : I<(outs F32:$dst), (ins I32:$src), + [(set F32:$dst, (uint_to_fp I32:$src))], + "f32.convert_u/i32\t$dst, $src">; +def F64_CONVERT_S_I32 : I<(outs F64:$dst), (ins I32:$src), + [(set F64:$dst, (sint_to_fp I32:$src))], + "f64.convert_s/i32\t$dst, $src">; +def F64_CONVERT_U_I32 : I<(outs F64:$dst), (ins I32:$src), + [(set F64:$dst, (uint_to_fp I32:$src))], + "f64.convert_u/i32\t$dst, $src">; +def F32_CONVERT_S_I64 : I<(outs F32:$dst), (ins I64:$src), + [(set F32:$dst, (sint_to_fp I64:$src))], + "f32.convert_s/i64\t$dst, $src">; +def F32_CONVERT_U_I64 : I<(outs F32:$dst), (ins I64:$src), + [(set F32:$dst, (uint_to_fp I64:$src))], + "f32.convert_u/i64\t$dst, $src">; +def F64_CONVERT_S_I64 : I<(outs F64:$dst), (ins I64:$src), + [(set F64:$dst, (sint_to_fp I64:$src))], + "f64.convert_s/i64\t$dst, $src">; +def F64_CONVERT_U_I64 : I<(outs F64:$dst), (ins I64:$src), + [(set F64:$dst, (uint_to_fp I64:$src))], + "f64.convert_u/i64\t$dst, $src">; + +def F64_PROMOTE_F32 : I<(outs F64:$dst), (ins F32:$src), + [(set F64:$dst, (fextend F32:$src))], + "f64.promote/f32\t$dst, $src">; +def F32_DEMOTE_F64 : I<(outs F32:$dst), (ins F64:$src), + [(set F32:$dst, (fround F64:$src))], + "f32.demote/f64\t$dst, $src">; + +def I32_REINTERPRET_F32 : I<(outs I32:$dst), (ins F32:$src), + [(set I32:$dst, (bitconvert F32:$src))], + "i32.reinterpret/f32\t$dst, $src">; +def F32_REINTERPRET_I32 : I<(outs F32:$dst), (ins I32:$src), + [(set F32:$dst, (bitconvert I32:$src))], + "f32.reinterpret/i32\t$dst, $src">; +def I64_REINTERPRET_F64 : I<(outs I64:$dst), (ins F64:$src), + [(set I64:$dst, (bitconvert F64:$src))], + "i64.reinterpret/f64\t$dst, $src">; +def F64_REINTERPRET_I64 : I<(outs F64:$dst), (ins I64:$src), + [(set F64:$dst, (bitconvert I64:$src))], + "f64.reinterpret/i64\t$dst, $src">; + +} // Defs = [ARGUMENTS] diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td index 30ef633..5520c6d 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td @@ -12,33 +12,90 @@ /// //===----------------------------------------------------------------------===// -defm FADD : BinaryFP<fadd>; -defm 
FSUB : BinaryFP<fsub>; -defm FMUL : BinaryFP<fmul>; -defm FDIV : BinaryFP<fdiv>; -defm FABS : UnaryFP<fabs>; -defm FNEG : UnaryFP<fneg>; -defm COPYSIGN : BinaryFP<fcopysign>; -defm CEIL : UnaryFP<fceil>; -defm FLOOR : UnaryFP<ffloor>; -defm TRUNC : UnaryFP<ftrunc>; -defm NEARESTINT : UnaryFP<fnearbyint>; - -/* - * TODO(jfb): Add the following for 32-bit and 64-bit. - * - * float32.eq: compare equal - * float32.lt: less than - * float32.le: less than or equal - * float32.gt: greater than - * float32.ge: greater than or equal - */ - -defm SQRT : UnaryFP<fsqrt>; - -/* - * TODO(jfb): Add the following for 32-bit and 64-bit. - * - * float32.min: minimum (binary operator); if either operand is NaN, returns NaN - * float32.max: maximum (binary operator); if either operand is NaN, returns NaN - */ +let Defs = [ARGUMENTS] in { + +let isCommutable = 1 in +defm ADD : BinaryFP<fadd, "add ">; +defm SUB : BinaryFP<fsub, "sub ">; +let isCommutable = 1 in +defm MUL : BinaryFP<fmul, "mul ">; +defm DIV : BinaryFP<fdiv, "div ">; +defm SQRT : UnaryFP<fsqrt, "sqrt">; + +defm ABS : UnaryFP<fabs, "abs ">; +defm NEG : UnaryFP<fneg, "neg ">; +defm COPYSIGN : BinaryFP<fcopysign, "copysign">; + +let isCommutable = 1 in { +defm MIN : BinaryFP<fminnan, "min ">; +defm MAX : BinaryFP<fmaxnan, "max ">; +} // isCommutable = 1 + +defm CEIL : UnaryFP<fceil, "ceil">; +defm FLOOR : UnaryFP<ffloor, "floor">; +defm TRUNC : UnaryFP<ftrunc, "trunc">; +defm NEAREST : UnaryFP<fnearbyint, "nearest">; + +} // Defs = [ARGUMENTS] + +// DAGCombine oddly folds casts into the rhs of copysign. Unfold them. +def : Pat<(fcopysign F64:$lhs, F32:$rhs), + (COPYSIGN_F64 F64:$lhs, (F64_PROMOTE_F32 F32:$rhs))>; +def : Pat<(fcopysign F32:$lhs, F64:$rhs), + (COPYSIGN_F32 F32:$lhs, (F32_DEMOTE_F64 F64:$rhs))>; + +// WebAssembly doesn't expose inexact exceptions, so map frint to fnearbyint. +def : Pat<(frint f32:$src), (NEAREST_F32 f32:$src)>; +def : Pat<(frint f64:$src), (NEAREST_F64 f64:$src)>; + +let Defs = [ARGUMENTS] in { + +let isCommutable = 1 in { +defm EQ : ComparisonFP<SETOEQ, "eq ">; +defm NE : ComparisonFP<SETUNE, "ne ">; +} // isCommutable = 1 +defm LT : ComparisonFP<SETOLT, "lt ">; +defm LE : ComparisonFP<SETOLE, "le ">; +defm GT : ComparisonFP<SETOGT, "gt ">; +defm GE : ComparisonFP<SETOGE, "ge ">; + +} // Defs = [ARGUMENTS] + +// Don't care floating-point comparisons, supported via other comparisons. 
+def : Pat<(seteq f32:$lhs, f32:$rhs), (EQ_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setne f32:$lhs, f32:$rhs), (NE_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setlt f32:$lhs, f32:$rhs), (LT_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setle f32:$lhs, f32:$rhs), (LE_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setgt f32:$lhs, f32:$rhs), (GT_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(setge f32:$lhs, f32:$rhs), (GE_F32 f32:$lhs, f32:$rhs)>; +def : Pat<(seteq f64:$lhs, f64:$rhs), (EQ_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setne f64:$lhs, f64:$rhs), (NE_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setlt f64:$lhs, f64:$rhs), (LT_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setle f64:$lhs, f64:$rhs), (LE_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setgt f64:$lhs, f64:$rhs), (GT_F64 f64:$lhs, f64:$rhs)>; +def : Pat<(setge f64:$lhs, f64:$rhs), (GE_F64 f64:$lhs, f64:$rhs)>; + +let Defs = [ARGUMENTS] in { + +def SELECT_F32 : I<(outs F32:$dst), (ins I32:$cond, F32:$lhs, F32:$rhs), + [(set F32:$dst, (select I32:$cond, F32:$lhs, F32:$rhs))], + "f32.select\t$dst, $cond, $lhs, $rhs">; +def SELECT_F64 : I<(outs F64:$dst), (ins I32:$cond, F64:$lhs, F64:$rhs), + [(set F64:$dst, (select I32:$cond, F64:$lhs, F64:$rhs))], + "f64.select\t$dst, $cond, $lhs, $rhs">; + +} // Defs = [ARGUMENTS] + +// ISD::SELECT requires its operand to conform to getBooleanContents, but +// WebAssembly's select interprets any non-zero value as true, so we can fold +// a setne with 0 into a select. +def : Pat<(select (i32 (setne I32:$cond, 0)), F32:$lhs, F32:$rhs), + (SELECT_F32 I32:$cond, F32:$lhs, F32:$rhs)>; +def : Pat<(select (i32 (setne I32:$cond, 0)), F64:$lhs, F64:$rhs), + (SELECT_F64 I32:$cond, F64:$lhs, F64:$rhs)>; + +// And again, this time with seteq instead of setne and the arms reversed. +def : Pat<(select (i32 (seteq I32:$cond, 0)), F32:$lhs, F32:$rhs), + (SELECT_F32 I32:$cond, F32:$rhs, F32:$lhs)>; +def : Pat<(select (i32 (seteq I32:$cond, 0)), F64:$lhs, F64:$rhs), + (SELECT_F64 I32:$cond, F64:$rhs, F64:$lhs)>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td index 513c36f..8008dd3 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td @@ -1,4 +1,4 @@ -// WebAssemblyInstrFormats.td - WebAssembly Instruction Formats -*- tblgen -*-// +//=- WebAssemblyInstrFormats.td - WebAssembly Instr. Formats -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -12,44 +12,68 @@ /// //===----------------------------------------------------------------------===// -// WebAssembly Instruction Format -class WebAssemblyInst<string cstr> : Instruction { +// WebAssembly Instruction Format. +class WebAssemblyInst<string asmstr> : Instruction { field bits<0> Inst; // Instruction encoding. let Namespace = "WebAssembly"; let Pattern = []; - let Constraints = cstr; + let AsmString = asmstr; } -// Normal instructions -class I<dag oops, dag iops, list<dag> pattern, string cstr = ""> - : WebAssemblyInst<cstr> { +// Normal instructions. +class I<dag oops, dag iops, list<dag> pattern, string asmstr = ""> + : WebAssemblyInst<asmstr> { dag OutOperandList = oops; dag InOperandList = iops; let Pattern = pattern; } // Unary and binary instructions, for the local types that WebAssembly supports. 
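// The select-folding Pat<> rules above rely on wasm's select treating any
// non-zero condition as true, so an explicit "!= 0" or "== 0" test can be
// folded away, swapping the arms in the "== 0" case. A standalone restatement
// of that equivalence; wasmSelect is an illustrative stand-in, not an API.
#include <cassert>

static double wasmSelect(int Cond, double A, double B) {
  return Cond != 0 ? A : B; // wasm select: non-zero condition picks the first operand
}

int main() {
  int X = 42;
  double A = 1.0, B = 2.0;
  // select (x != 0), a, b  ==>  select x, a, b
  assert(((X != 0) ? A : B) == wasmSelect(X, A, B));
  // select (x == 0), a, b  ==>  select x, b, a   (arms reversed)
  assert(((X == 0) ? A : B) == wasmSelect(X, B, A));
}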
-multiclass UnaryInt<SDNode node> { - def _I32 : I<(outs Int32:$dst), (ins Int32:$src), - [(set Int32:$dst, (node Int32:$src))]>; - def _I64 : I<(outs Int64:$dst), (ins Int64:$src), - [(set Int64:$dst, (node Int64:$src))]>; -} -multiclass BinaryInt<SDNode node> { - def _I32 : I<(outs Int32:$dst), (ins Int32:$lhs, Int32:$rhs), - [(set Int32:$dst, (node Int32:$lhs, Int32:$rhs))]>; - def _I64 : I<(outs Int64:$dst), (ins Int64:$lhs, Int64:$rhs), - [(set Int64:$dst, (node Int64:$lhs, Int64:$rhs))]>; -} -multiclass UnaryFP<SDNode node> { - def _F32 : I<(outs Float32:$dst), (ins Float32:$src), - [(set Float32:$dst, (node Float32:$src))]>; - def _F64 : I<(outs Float64:$dst), (ins Float64:$src), - [(set Float64:$dst, (node Float64:$src))]>; -} -multiclass BinaryFP<SDNode node> { - def _F32 : I<(outs Float32:$dst), (ins Float32:$lhs, Float32:$rhs), - [(set Float32:$dst, (node Float32:$lhs, Float32:$rhs))]>; - def _F64 : I<(outs Float64:$dst), (ins Float64:$lhs, Float64:$rhs), - [(set Float64:$dst, (node Float64:$lhs, Float64:$rhs))]>; +multiclass UnaryInt<SDNode node, string name> { + def _I32 : I<(outs I32:$dst), (ins I32:$src), + [(set I32:$dst, (node I32:$src))], + !strconcat("i32.", !strconcat(name, "\t$dst, $src"))>; + def _I64 : I<(outs I64:$dst), (ins I64:$src), + [(set I64:$dst, (node I64:$src))], + !strconcat("i64.", !strconcat(name, "\t$dst, $src"))>; +} +multiclass BinaryInt<SDNode node, string name> { + def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs), + [(set I32:$dst, (node I32:$lhs, I32:$rhs))], + !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; + def _I64 : I<(outs I64:$dst), (ins I64:$lhs, I64:$rhs), + [(set I64:$dst, (node I64:$lhs, I64:$rhs))], + !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; +} +multiclass UnaryFP<SDNode node, string name> { + def _F32 : I<(outs F32:$dst), (ins F32:$src), + [(set F32:$dst, (node F32:$src))], + !strconcat("f32.", !strconcat(name, "\t$dst, $src"))>; + def _F64 : I<(outs F64:$dst), (ins F64:$src), + [(set F64:$dst, (node F64:$src))], + !strconcat("f64.", !strconcat(name, "\t$dst, $src"))>; +} +multiclass BinaryFP<SDNode node, string name> { + def _F32 : I<(outs F32:$dst), (ins F32:$lhs, F32:$rhs), + [(set F32:$dst, (node F32:$lhs, F32:$rhs))], + !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; + def _F64 : I<(outs F64:$dst), (ins F64:$lhs, F64:$rhs), + [(set F64:$dst, (node F64:$lhs, F64:$rhs))], + !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; +} +multiclass ComparisonInt<CondCode cond, string name> { + def _I32 : I<(outs I32:$dst), (ins I32:$lhs, I32:$rhs), + [(set I32:$dst, (setcc I32:$lhs, I32:$rhs, cond))], + !strconcat("i32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; + def _I64 : I<(outs I32:$dst), (ins I64:$lhs, I64:$rhs), + [(set I32:$dst, (setcc I64:$lhs, I64:$rhs, cond))], + !strconcat("i64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; +} +multiclass ComparisonFP<CondCode cond, string name> { + def _F32 : I<(outs I32:$dst), (ins F32:$lhs, F32:$rhs), + [(set I32:$dst, (setcc F32:$lhs, F32:$rhs, cond))], + !strconcat("f32.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; + def _F64 : I<(outs I32:$dst), (ins F64:$lhs, F64:$rhs), + [(set I32:$dst, (setcc F64:$lhs, F64:$rhs, cond))], + !strconcat("f64.", !strconcat(name, "\t$dst, $lhs, $rhs"))>; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index ea8937c..5e7663c 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ 
b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -24,5 +24,136 @@ using namespace llvm; #define DEBUG_TYPE "wasm-instr-info" +#define GET_INSTRINFO_CTOR_DTOR +#include "WebAssemblyGenInstrInfo.inc" + WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) - : RI(STI.getTargetTriple()) {} + : WebAssemblyGenInstrInfo(WebAssembly::ADJCALLSTACKDOWN, + WebAssembly::ADJCALLSTACKUP), + RI(STI.getTargetTriple()) {} + +void WebAssemblyInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + DebugLoc DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { + // This method is called by post-RA expansion, which expects only pregs to + // exist. However we need to handle both here. + auto &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RC = TargetRegisterInfo::isVirtualRegister(DestReg) ? + MRI.getRegClass(DestReg) : + MRI.getTargetRegisterInfo()->getMinimalPhysRegClass(SrcReg); + + unsigned CopyLocalOpcode; + if (RC == &WebAssembly::I32RegClass) + CopyLocalOpcode = WebAssembly::COPY_LOCAL_I32; + else if (RC == &WebAssembly::I64RegClass) + CopyLocalOpcode = WebAssembly::COPY_LOCAL_I64; + else if (RC == &WebAssembly::F32RegClass) + CopyLocalOpcode = WebAssembly::COPY_LOCAL_F32; + else if (RC == &WebAssembly::F64RegClass) + CopyLocalOpcode = WebAssembly::COPY_LOCAL_F64; + else + llvm_unreachable("Unexpected register class"); + + BuildMI(MBB, I, DL, get(CopyLocalOpcode), DestReg) + .addReg(SrcReg, KillSrc ? RegState::Kill : 0); +} + +// Branch analysis. +bool WebAssemblyInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool /*AllowModify*/) const { + bool HaveCond = false; + for (MachineInstr &MI : MBB.terminators()) { + switch (MI.getOpcode()) { + default: + // Unhandled instruction; bail out. + return true; + case WebAssembly::BR_IF: + if (HaveCond) + return true; + Cond.push_back(MachineOperand::CreateImm(true)); + Cond.push_back(MI.getOperand(0)); + TBB = MI.getOperand(1).getMBB(); + HaveCond = true; + break; + case WebAssembly::BR_UNLESS: + if (HaveCond) + return true; + Cond.push_back(MachineOperand::CreateImm(false)); + Cond.push_back(MI.getOperand(0)); + TBB = MI.getOperand(1).getMBB(); + HaveCond = true; + break; + case WebAssembly::BR: + if (!HaveCond) + TBB = MI.getOperand(0).getMBB(); + else + FBB = MI.getOperand(0).getMBB(); + break; + } + if (MI.isBarrier()) + break; + } + + return false; +} + +unsigned WebAssemblyInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::instr_iterator I = MBB.instr_end(); + unsigned Count = 0; + + while (I != MBB.instr_begin()) { + --I; + if (I->isDebugValue()) + continue; + if (!I->isTerminator()) + break; + // Remove the branch. 
+ I->eraseFromParent(); + I = MBB.instr_end(); + ++Count; + } + + return Count; +} + +unsigned WebAssemblyInstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + DebugLoc DL) const { + if (Cond.empty()) { + if (!TBB) + return 0; + + BuildMI(&MBB, DL, get(WebAssembly::BR)).addMBB(TBB); + return 1; + } + + assert(Cond.size() == 2 && "Expected a flag and a successor block"); + + if (Cond[0].getImm()) { + BuildMI(&MBB, DL, get(WebAssembly::BR_IF)) + .addOperand(Cond[1]) + .addMBB(TBB); + } else { + BuildMI(&MBB, DL, get(WebAssembly::BR_UNLESS)) + .addOperand(Cond[1]) + .addMBB(TBB); + } + if (!FBB) + return 1; + + BuildMI(&MBB, DL, get(WebAssembly::BR)).addMBB(FBB); + return 2; +} + +bool WebAssemblyInstrInfo::ReverseBranchCondition( + SmallVectorImpl<MachineOperand> &Cond) const { + assert(Cond.size() == 2 && "Expected a flag and a successor block"); + Cond.front() = MachineOperand::CreateImm(!Cond.front().getImm()); + return false; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h index 1c4ae22..5ddd9b3 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -19,17 +19,35 @@ #include "WebAssemblyRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" +#define GET_INSTRINFO_HEADER +#include "WebAssemblyGenInstrInfo.inc" + namespace llvm { class WebAssemblySubtarget; -class WebAssemblyInstrInfo final { +class WebAssemblyInstrInfo final : public WebAssemblyGenInstrInfo { const WebAssemblyRegisterInfo RI; public: explicit WebAssemblyInstrInfo(const WebAssemblySubtarget &STI); const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; } + + void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + DebugLoc DL, unsigned DestReg, unsigned SrcReg, + bool KillSrc) const override; + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify = false) const override; + unsigned RemoveBranch(MachineBasicBlock &MBB) const override; + unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, + DebugLoc DL) const override; + bool + ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index fe3ca76..f0b4ce7 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -25,20 +25,48 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">, // WebAssembly-specific DAG Node Types. 
//===----------------------------------------------------------------------===// +def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>; +def SDT_WebAssemblyCallSeqEnd : + SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; +def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; +def SDT_WebAssemblyCall1 : SDTypeProfile<1, -1, [SDTCisPtrTy<1>]>; +def SDT_WebAssemblyTableswitch : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; +def SDT_WebAssemblyArgument : SDTypeProfile<1, 1, [SDTCisVT<1, i32>]>; +def SDT_WebAssemblyReturn : SDTypeProfile<0, -1, []>; +def SDT_WebAssemblyWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, + SDTCisPtrTy<0>]>; + //===----------------------------------------------------------------------===// // WebAssembly-specific DAG Nodes. //===----------------------------------------------------------------------===// +def WebAssemblycallseq_start : + SDNode<"ISD::CALLSEQ_START", SDT_WebAssemblyCallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def WebAssemblycallseq_end : + SDNode<"ISD::CALLSEQ_END", SDT_WebAssemblyCallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def WebAssemblycall0 : SDNode<"WebAssemblyISD::CALL0", + SDT_WebAssemblyCall0, + [SDNPHasChain, SDNPVariadic]>; +def WebAssemblycall1 : SDNode<"WebAssemblyISD::CALL1", + SDT_WebAssemblyCall1, + [SDNPHasChain, SDNPVariadic]>; +def WebAssemblytableswitch : SDNode<"WebAssemblyISD::TABLESWITCH", + SDT_WebAssemblyTableswitch, + [SDNPHasChain, SDNPVariadic]>; +def WebAssemblyargument : SDNode<"WebAssemblyISD::ARGUMENT", + SDT_WebAssemblyArgument>; +def WebAssemblyreturn : SDNode<"WebAssemblyISD::RETURN", + SDT_WebAssemblyReturn, [SDNPHasChain]>; +def WebAssemblywrapper : SDNode<"WebAssemblyISD::Wrapper", + SDT_WebAssemblyWrapper>; + //===----------------------------------------------------------------------===// // WebAssembly-specific Operands. //===----------------------------------------------------------------------===// -/* - * TODO(jfb): Add the following. - * - * get_local: read the current value of a local variable - * set_local: set the current value of a local variable -*/ +def bb_op : Operand<OtherVT>; //===----------------------------------------------------------------------===// // WebAssembly Instruction Format Definitions. @@ -47,13 +75,86 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">, include "WebAssemblyInstrFormats.td" //===----------------------------------------------------------------------===// +// Additional instructions. +//===----------------------------------------------------------------------===// + +multiclass ARGUMENT<WebAssemblyRegClass vt> { + let hasSideEffects = 1, Uses = [ARGUMENTS], isCodeGenOnly = 1 in + def ARGUMENT_#vt : I<(outs vt:$res), (ins i32imm:$argno), + [(set vt:$res, (WebAssemblyargument timm:$argno))]>; +} +defm : ARGUMENT<I32>; +defm : ARGUMENT<I64>; +defm : ARGUMENT<F32>; +defm : ARGUMENT<F64>; + +let Defs = [ARGUMENTS] in { + +// get_local and set_local are not generated by instruction selection; they +// are implied by virtual register uses and defs in most contexts. However, +// they are explicitly emitted for special purposes. 
+multiclass LOCAL<WebAssemblyRegClass vt> { + def GET_LOCAL_#vt : I<(outs vt:$res), (ins i32imm:$regno), [], + "get_local\t$res, $regno">; + // TODO: set_local returns its operand value + def SET_LOCAL_#vt : I<(outs), (ins i32imm:$regno, vt:$src), [], + "set_local\t$regno, $src">; + + // COPY_LOCAL is not an actual instruction in wasm, but since we allow + // get_local and set_local to be implicit, we can have a COPY_LOCAL which + // is actually a no-op because all the work is done in the implied + // get_local and set_local. + let isAsCheapAsAMove = 1 in + def COPY_LOCAL_#vt : I<(outs vt:$res), (ins vt:$src), [], + "copy_local\t$res, $src">; +} +defm : LOCAL<I32>; +defm : LOCAL<I64>; +defm : LOCAL<F32>; +defm : LOCAL<F64>; + +let isMoveImm = 1 in { +def CONST_I32 : I<(outs I32:$res), (ins i32imm:$imm), + [(set I32:$res, imm:$imm)], + "i32.const\t$res, $imm">; +def CONST_I64 : I<(outs I64:$res), (ins i64imm:$imm), + [(set I64:$res, imm:$imm)], + "i64.const\t$res, $imm">; +def CONST_F32 : I<(outs F32:$res), (ins f32imm:$imm), + [(set F32:$res, fpimm:$imm)], + "f32.const\t$res, $imm">; +def CONST_F64 : I<(outs F64:$res), (ins f64imm:$imm), + [(set F64:$res, fpimm:$imm)], + "f64.const\t$res, $imm">; +} // isMoveImm = 1 + +} // Defs = [ARGUMENTS] + +def : Pat<(i32 (WebAssemblywrapper tglobaladdr:$dst)), + (CONST_I32 tglobaladdr:$dst)>; +def : Pat<(i32 (WebAssemblywrapper texternalsym:$dst)), + (CONST_I32 texternalsym:$dst)>; +def : Pat<(i32 (WebAssemblywrapper tjumptable:$dst)), + (CONST_I32 tjumptable:$dst)>; + +let Defs = [ARGUMENTS] in { + +// Function signature and local variable declaration "instructions". +def PARAM : I<(outs), (ins variable_ops), [], ".param \t">; +def RESULT : I<(outs), (ins variable_ops), [], ".result \t">; +def LOCAL : I<(outs), (ins variable_ops), [], ".local \t">; + +} // Defs = [ARGUMENTS] + +//===----------------------------------------------------------------------===// // Additional sets of instructions. //===----------------------------------------------------------------------===// include "WebAssemblyInstrMemory.td" include "WebAssemblyInstrCall.td" +include "WebAssemblyInstrControl.td" include "WebAssemblyInstrInteger.td" -include "WebAssemblyInstrFloat.td" include "WebAssemblyInstrConv.td" +include "WebAssemblyInstrFloat.td" include "WebAssemblyInstrAtomics.td" include "WebAssemblyInstrSIMD.td" diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td index 5f60fe8..09e5eaf 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrInteger.td @@ -12,34 +12,77 @@ /// //===----------------------------------------------------------------------===// -defm ADD : BinaryInt<add>; -defm SUB : BinaryInt<sub>; -defm MUL : BinaryInt<mul>; -defm SDIV : BinaryInt<sdiv>; -defm UDIV : BinaryInt<udiv>; -defm SREM : BinaryInt<srem>; -defm UREM : BinaryInt<urem>; -defm AND : BinaryInt<and>; -defm IOR : BinaryInt<or>; -defm XOR : BinaryInt<xor>; -defm SHL : BinaryInt<shl>; -defm SHR : BinaryInt<srl>; -defm SAR : BinaryInt<sra>; - -/* - * TODO(jfb): Add the following for 32-bit and 64-bit. 
- * - * int32.eq: signed-less compare equal - * int32.slt: signed less than - * int32.sle: signed less than or equal - * int32.ult: unsigned less than - * int32.ule: unsigned less than or equal - * int32.sgt: signed greater than - * int32.sge: signed greater than or equal - * int32.ugt: unsigned greater than - * int32.uge: unsigned greater than or equal - */ - -defm CLZ : UnaryInt<ctlz>; -defm CTZ : UnaryInt<cttz>; -defm POPCNT : UnaryInt<ctpop>; +let Defs = [ARGUMENTS] in { + +// The spaces after the names are for aesthetic purposes only, to make +// operands line up vertically after tab expansion. +let isCommutable = 1 in +defm ADD : BinaryInt<add, "add ">; +defm SUB : BinaryInt<sub, "sub ">; +let isCommutable = 1 in +defm MUL : BinaryInt<mul, "mul ">; +// Divide and remainder trap on a zero denominator. +let hasSideEffects = 1 in { +defm DIV_S : BinaryInt<sdiv, "div_s">; +defm DIV_U : BinaryInt<udiv, "div_u">; +defm REM_S : BinaryInt<srem, "rem_s">; +defm REM_U : BinaryInt<urem, "rem_u">; +} // hasSideEffects = 1 +let isCommutable = 1 in { +defm AND : BinaryInt<and, "and ">; +defm OR : BinaryInt<or, "or ">; +defm XOR : BinaryInt<xor, "xor ">; +} // isCommutable = 1 +defm SHL : BinaryInt<shl, "shl ">; +defm SHR_U : BinaryInt<srl, "shr_u">; +defm SHR_S : BinaryInt<sra, "shr_s">; + +let isCommutable = 1 in { +defm EQ : ComparisonInt<SETEQ, "eq ">; +defm NE : ComparisonInt<SETNE, "ne ">; +} // isCommutable = 1 +defm LT_S : ComparisonInt<SETLT, "lt_s">; +defm LE_S : ComparisonInt<SETLE, "le_s">; +defm LT_U : ComparisonInt<SETULT, "lt_u">; +defm LE_U : ComparisonInt<SETULE, "le_u">; +defm GT_S : ComparisonInt<SETGT, "gt_s">; +defm GE_S : ComparisonInt<SETGE, "ge_s">; +defm GT_U : ComparisonInt<SETUGT, "gt_u">; +defm GE_U : ComparisonInt<SETUGE, "ge_u">; + +defm CLZ : UnaryInt<ctlz, "clz ">; +defm CTZ : UnaryInt<cttz, "ctz ">; +defm POPCNT : UnaryInt<ctpop, "popcnt">; + +} // Defs = [ARGUMENTS] + +// Expand the "don't care" operations to supported operations. +def : Pat<(ctlz_zero_undef I32:$src), (CLZ_I32 I32:$src)>; +def : Pat<(ctlz_zero_undef I64:$src), (CLZ_I64 I64:$src)>; +def : Pat<(cttz_zero_undef I32:$src), (CTZ_I32 I32:$src)>; +def : Pat<(cttz_zero_undef I64:$src), (CTZ_I64 I64:$src)>; + +let Defs = [ARGUMENTS] in { + +def SELECT_I32 : I<(outs I32:$dst), (ins I32:$cond, I32:$lhs, I32:$rhs), + [(set I32:$dst, (select I32:$cond, I32:$lhs, I32:$rhs))], + "i32.select\t$dst, $cond, $lhs, $rhs">; +def SELECT_I64 : I<(outs I64:$dst), (ins I32:$cond, I64:$lhs, I64:$rhs), + [(set I64:$dst, (select I32:$cond, I64:$lhs, I64:$rhs))], + "i64.select\t$dst, $cond, $lhs, $rhs">; + +} // Defs = [ARGUMENTS] + +// ISD::SELECT requires its operand to conform to getBooleanContents, but +// WebAssembly's select interprets any non-zero value as true, so we can fold +// a setne with 0 into a select. +def : Pat<(select (i32 (setne I32:$cond, 0)), I32:$lhs, I32:$rhs), + (SELECT_I32 I32:$cond, I32:$lhs, I32:$rhs)>; +def : Pat<(select (i32 (setne I32:$cond, 0)), I64:$lhs, I64:$rhs), + (SELECT_I64 I32:$cond, I64:$lhs, I64:$rhs)>; + +// And again, this time with seteq instead of setne and the arms reversed. 
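The reversed-arm seteq variants follow immediately below. As a quick sanity check on both folds, here is a minimal C++ sketch; wasm_select is a hypothetical stand-in for WebAssembly's select, which treats any non-zero condition as true:

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-in for WebAssembly's select: any non-zero condition
    // picks the first operand, zero picks the second.
    static int64_t wasm_select(int32_t cond, int64_t lhs, int64_t rhs) {
      return cond != 0 ? lhs : rhs;
    }

    int main() {
      for (int32_t cond : {-7, 0, 1, 42}) {
        int64_t lhs = 10, rhs = 20;
        // (select (setne cond, 0), lhs, rhs) == (select cond, lhs, rhs)
        assert(wasm_select(cond != 0, lhs, rhs) == wasm_select(cond, lhs, rhs));
        // (select (seteq cond, 0), lhs, rhs) == (select cond, rhs, lhs)
        assert(wasm_select(cond == 0, lhs, rhs) == wasm_select(cond, rhs, lhs));
      }
    }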
+def : Pat<(select (i32 (seteq I32:$cond, 0)), I32:$lhs, I32:$rhs), + (SELECT_I32 I32:$cond, I32:$rhs, I32:$lhs)>; +def : Pat<(select (i32 (seteq I32:$cond, 0)), I64:$lhs, I64:$rhs), + (SELECT_I64 I32:$cond, I64:$rhs, I64:$lhs)>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td index 5ab40e8..74ec45d 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td @@ -12,35 +12,500 @@ /// //===----------------------------------------------------------------------===// -/* - * TODO(jfb): Add the following. - * Each has optional alignment and immediate byte offset. - * - * int32.load_sx[int8]: sign-extend to int32 - * int32.load_sx[int16]: sign-extend to int32 - * int32.load_zx[int8]: zero-extend to int32 - * int32.load_zx[int16]: zero-extend to int32 - * int32.load[int32]: (no conversion) - * int64.load_sx[int8]: sign-extend to int64 - * int64.load_sx[int16]: sign-extend to int64 - * int64.load_sx[int32]: sign-extend to int64 - * int64.load_zx[int8]: zero-extend to int64 - * int64.load_zx[int16]: zero-extend to int64 - * int64.load_zx[int32]: zero-extend to int64 - * int64.load[int64]: (no conversion) - * float32.load[float32]: (no conversion) - * float64.load[float64]: (no conversion) - * - * int32.store[int8]: wrap int32 to int8 - * int32.store[int16]: wrap int32 to int16 - * int32.store[int32]: (no conversion) - * int64.store[int8]: wrap int64 to int8 - * int64.store[int16]: wrap int64 to int16 - * int64.store[int32]: wrap int64 to int32 - * int64.store[int64]: (no conversion) - * float32.store[float32]: (no conversion) - * float64.store[float64]: (no conversion) - * - * load_global: load the value of a given global variable - * store_global: store a given value to a given global variable - */ +// TODO: +// - HasAddr64 +// - WebAssemblyTargetLowering having to do with atomics +// - Each has optional alignment. + +// WebAssembly has i8/i16/i32/i64/f32/f64 memory types, but doesn't have i8/i16 +// local types. These memory-only types instead zero- or sign-extend into local +// types when loading, and truncate when storing. + +// WebAssembly constant offsets are performed as unsigned with infinite +// precision, so we need to check for NoUnsignedWrap so that we don't fold an +// offset for an add that needs wrapping. +def regPlusImm : PatFrag<(ops node:$off, node:$addr), + (add node:$addr, node:$off), + [{ return N->getFlags()->hasNoUnsignedWrap(); }]>; + +let Defs = [ARGUMENTS] in { + +// Basic load. +def LOAD_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load\t$dst, ${off}(${addr})">; +def LOAD_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load\t$dst, ${off}(${addr})">; +def LOAD_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr), [], + "f32.load\t$dst, ${off}(${addr})">; +def LOAD_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr), [], + "f64.load\t$dst, ${off}(${addr})">; + +} // Defs = [ARGUMENTS] + +// Select loads with no constant offset. +def : Pat<(i32 (load I32:$addr)), (LOAD_I32 0, $addr)>; +def : Pat<(i64 (load I32:$addr)), (LOAD_I64 0, $addr)>; +def : Pat<(f32 (load I32:$addr)), (LOAD_F32 0, $addr)>; +def : Pat<(f64 (load I32:$addr)), (LOAD_F64 0, $addr)>; + +// Select loads with a constant offset. 
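The constant-offset patterns below all go through the regPlusImm fragment defined at the top of this file, which only matches an add that is known not to wrap. A small numeric sketch of why that check matters, assuming a 32-bit address space:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Suppose the IR computed (add addr, off) in 32 bits and the add wrapped.
      uint32_t addr = 0xFFFFFFF0u;
      uint32_t off = 0x20u;
      uint32_t wrapped = addr + off; // 0x00000010 after 32-bit wraparound

      // WebAssembly's "off(addr)" addressing computes off + addr as an unsigned
      // sum with effectively unbounded precision, so folding the offset would
      // name a different address than the wrapped add did.
      uint64_t folded = (uint64_t)off + (uint64_t)addr; // 0x100000010

      std::printf("wrapped add: %#x, folded form: %#llx\n", wrapped,
                  (unsigned long long)folded);
      // The two agree only when the add cannot wrap, which is exactly the
      // NoUnsignedWrap condition that regPlusImm checks before folding.
    }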
+def : Pat<(i32 (load (regPlusImm imm:$off, I32:$addr))), + (LOAD_I32 imm:$off, $addr)>; +def : Pat<(i64 (load (regPlusImm imm:$off, I32:$addr))), + (LOAD_I64 imm:$off, $addr)>; +def : Pat<(f32 (load (regPlusImm imm:$off, I32:$addr))), + (LOAD_F32 imm:$off, $addr)>; +def : Pat<(f64 (load (regPlusImm imm:$off, I32:$addr))), + (LOAD_F64 imm:$off, $addr)>; +def : Pat<(i32 (load (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (load (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD_I64 tglobaladdr:$off, $addr)>; +def : Pat<(f32 (load (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD_F32 tglobaladdr:$off, $addr)>; +def : Pat<(f64 (load (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD_F64 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (load (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD_I32 texternalsym:$off, $addr)>; +def : Pat<(i64 (load (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD_I64 texternalsym:$off, $addr)>; +def : Pat<(f32 (load (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD_F32 texternalsym:$off, $addr)>; +def : Pat<(f64 (load (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD_F64 texternalsym:$off, $addr)>; + +// Select loads with just a constant offset. +def : Pat<(i32 (load imm:$off)), (LOAD_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (load imm:$off)), (LOAD_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(f32 (load imm:$off)), (LOAD_F32 imm:$off, (CONST_I32 0))>; +def : Pat<(f64 (load imm:$off)), (LOAD_F64 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (load (WebAssemblywrapper tglobaladdr:$off))), + (LOAD_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (load (WebAssemblywrapper tglobaladdr:$off))), + (LOAD_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(f32 (load (WebAssemblywrapper tglobaladdr:$off))), + (LOAD_F32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(f64 (load (WebAssemblywrapper tglobaladdr:$off))), + (LOAD_F64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (load (WebAssemblywrapper texternalsym:$off))), + (LOAD_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (load (WebAssemblywrapper texternalsym:$off))), + (LOAD_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(f32 (load (WebAssemblywrapper texternalsym:$off))), + (LOAD_F32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(f64 (load (WebAssemblywrapper texternalsym:$off))), + (LOAD_F64 texternalsym:$off, (CONST_I32 0))>; + +let Defs = [ARGUMENTS] in { + +// Extending load. 
+def LOAD8_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load8_s\t$dst, ${off}(${addr})">; +def LOAD8_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load8_u\t$dst, ${off}(${addr})">; +def LOAD16_S_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load16_s\t$dst, ${off}(${addr})">; +def LOAD16_U_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr), [], + "i32.load16_u\t$dst, ${off}(${addr})">; +def LOAD8_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load8_s\t$dst, ${off}(${addr})">; +def LOAD8_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load8_u\t$dst, ${off}(${addr})">; +def LOAD16_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load16_s\t$dst, ${off}(${addr})">; +def LOAD16_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load16_u\t$dst, ${off}(${addr})">; +def LOAD32_S_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load32_s\t$dst, ${off}(${addr})">; +def LOAD32_U_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr), [], + "i64.load32_u\t$dst, ${off}(${addr})">; + +} // Defs = [ARGUMENTS] + +// Select extending loads with no constant offset. +def : Pat<(i32 (sextloadi8 I32:$addr)), (LOAD8_S_I32 0, $addr)>; +def : Pat<(i32 (zextloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>; +def : Pat<(i32 (sextloadi16 I32:$addr)), (LOAD16_S_I32 0, $addr)>; +def : Pat<(i32 (zextloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>; +def : Pat<(i64 (sextloadi8 I32:$addr)), (LOAD8_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>; +def : Pat<(i64 (sextloadi16 I32:$addr)), (LOAD16_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>; +def : Pat<(i64 (sextloadi32 I32:$addr)), (LOAD32_S_I64 0, $addr)>; +def : Pat<(i64 (zextloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; + +// Select extending loads with a constant offset. 
+def : Pat<(i32 (sextloadi8 (regPlusImm imm:$off, I32:$addr))), + (LOAD8_S_I32 imm:$off, $addr)>; +def : Pat<(i32 (zextloadi8 (regPlusImm imm:$off, I32:$addr))), + (LOAD8_U_I32 imm:$off, $addr)>; +def : Pat<(i32 (sextloadi16 (regPlusImm imm:$off, I32:$addr))), + (LOAD16_S_I32 imm:$off, $addr)>; +def : Pat<(i32 (zextloadi16 (regPlusImm imm:$off, I32:$addr))), + (LOAD16_U_I32 imm:$off, $addr)>; +def : Pat<(i64 (sextloadi8 (regPlusImm imm:$off, I32:$addr))), + (LOAD8_S_I64 imm:$off, $addr)>; +def : Pat<(i64 (zextloadi8 (regPlusImm imm:$off, I32:$addr))), + (LOAD8_U_I64 imm:$off, $addr)>; +def : Pat<(i64 (sextloadi16 (regPlusImm imm:$off, I32:$addr))), + (LOAD16_S_I64 imm:$off, $addr)>; +def : Pat<(i64 (zextloadi16 (regPlusImm imm:$off, I32:$addr))), + (LOAD16_U_I64 imm:$off, $addr)>; +def : Pat<(i64 (sextloadi32 (regPlusImm imm:$off, I32:$addr))), + (LOAD32_S_I64 imm:$off, $addr)>; +def : Pat<(i64 (zextloadi32 (regPlusImm imm:$off, I32:$addr))), + (LOAD32_U_I64 imm:$off, $addr)>; +def : Pat<(i32 (sextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD8_S_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (zextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD8_U_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (sextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD16_S_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (zextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD16_U_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (sextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD8_S_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (zextloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD8_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (sextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD16_S_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (zextloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD16_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (sextloadi32 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD32_S_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (zextloadi32 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD32_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (sextloadi8 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD8_S_I32 texternalsym:$off, $addr)>; +def : Pat<(i32 (zextloadi8 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD8_U_I32 texternalsym:$off, $addr)>; +def : Pat<(i32 (sextloadi16 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD16_S_I32 texternalsym:$off, $addr)>; +def : Pat<(i32 (zextloadi16 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD16_U_I32 texternalsym:$off, $addr)>; +def : Pat<(i64 (sextloadi8 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD8_S_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (zextloadi8 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD8_U_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (sextloadi16 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD16_S_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (zextloadi16 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD16_U_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (sextloadi32 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD32_S_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (zextloadi32 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD32_U_I64 texternalsym:$off, $addr)>; + +// Select extending loads with just a constant offset. 
+def : Pat<(i32 (sextloadi8 imm:$off)), (LOAD8_S_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi8 imm:$off)), (LOAD8_U_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi16 imm:$off)), (LOAD16_S_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi16 imm:$off)), (LOAD16_U_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi8 imm:$off)), (LOAD8_S_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi8 imm:$off)), (LOAD8_U_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi16 imm:$off)), (LOAD16_S_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi16 imm:$off)), (LOAD16_U_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi32 imm:$off)), (LOAD32_S_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi32 imm:$off)), (LOAD32_U_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_S_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_S_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_S_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_S_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi32 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD32_S_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi32 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_S_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i32 (sextloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_S_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i32 (zextloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_S_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_S_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (sextloadi32 (WebAssemblywrapper texternalsym:$off))), + (LOAD32_S_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (zextloadi32 (WebAssemblywrapper texternalsym:$off))), + (LOAD32_U_I64 texternalsym:$off, (CONST_I32 0))>; + +// Resolve "don't care" extending loads to zero-extending loads. This is +// somewhat arbitrary, but zero-extending is conceptually simpler. + +// Select "don't care" extending loads with no constant offset. 
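The patterns that follow resolve the "don't care" (anyext) loads to the zero-extending forms. A brief C++ illustration of the _s/_u distinction these loads encode, and of why either choice is legal for an anyext load (the byte value here is arbitrary):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint8_t byte_in_memory = 0x80; // 128 unsigned, -128 signed

      // i32.load8_s sign-extends the memory byte into a 32-bit local.
      int32_t s = (int8_t)byte_in_memory;  // 0xFFFFFF80
      // i32.load8_u zero-extends it instead.
      uint32_t u = byte_in_memory;         // 0x00000080

      std::printf("load8_s -> %#x, load8_u -> %#x\n", (uint32_t)s, u);
      // An anyext load leaves the upper bits unspecified, so lowering it to
      // either form is correct; the patterns below pick the _u form as the
      // conceptually simpler choice.
    }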
+def : Pat<(i32 (extloadi8 I32:$addr)), (LOAD8_U_I32 0, $addr)>; +def : Pat<(i32 (extloadi16 I32:$addr)), (LOAD16_U_I32 0, $addr)>; +def : Pat<(i64 (extloadi8 I32:$addr)), (LOAD8_U_I64 0, $addr)>; +def : Pat<(i64 (extloadi16 I32:$addr)), (LOAD16_U_I64 0, $addr)>; +def : Pat<(i64 (extloadi32 I32:$addr)), (LOAD32_U_I64 0, $addr)>; + +// Select "don't care" extending loads with a constant offset. +def : Pat<(i32 (extloadi8 (regPlusImm imm:$off, I32:$addr))), + (LOAD8_U_I32 imm:$off, $addr)>; +def : Pat<(i32 (extloadi16 (regPlusImm imm:$off, I32:$addr))), + (LOAD16_U_I32 imm:$off, $addr)>; +def : Pat<(i64 (extloadi8 (regPlusImm imm:$off, I32:$addr))), + (LOAD8_U_I64 imm:$off, $addr)>; +def : Pat<(i64 (extloadi16 (regPlusImm imm:$off, I32:$addr))), + (LOAD16_U_I64 imm:$off, $addr)>; +def : Pat<(i64 (extloadi32 (regPlusImm imm:$off, I32:$addr))), + (LOAD32_U_I64 imm:$off, $addr)>; +def : Pat<(i32 (extloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD8_U_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (extloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD16_U_I32 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (extloadi8 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD8_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (extloadi16 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD16_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i64 (extloadi32 (regPlusImm tglobaladdr:$off, I32:$addr))), + (LOAD32_U_I64 tglobaladdr:$off, $addr)>; +def : Pat<(i32 (extloadi8 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD8_U_I32 texternalsym:$off, $addr)>; +def : Pat<(i32 (extloadi16 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD16_U_I32 texternalsym:$off, $addr)>; +def : Pat<(i64 (extloadi8 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD8_U_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (extloadi16 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD16_U_I64 texternalsym:$off, $addr)>; +def : Pat<(i64 (extloadi32 (regPlusImm texternalsym:$off, I32:$addr))), + (LOAD32_U_I64 texternalsym:$off, $addr)>; + +// Select "don't care" extending loads with just a constant offset. 
+def : Pat<(i32 (extloadi8 imm:$off)), (LOAD8_U_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (extloadi16 imm:$off)), (LOAD16_U_I32 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (extloadi8 imm:$off)), (LOAD8_U_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (extloadi16 imm:$off)), (LOAD16_U_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i64 (extloadi32 imm:$off)), (LOAD32_U_I64 imm:$off, (CONST_I32 0))>; +def : Pat<(i32 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_U_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_U_I32 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (extloadi8 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD8_U_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (extloadi16 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD16_U_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i64 (extloadi32 (WebAssemblywrapper tglobaladdr:$off))), + (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>; +def : Pat<(i32 (extloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_U_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i32 (extloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_U_I32 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (extloadi8 (WebAssemblywrapper texternalsym:$off))), + (LOAD8_U_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (extloadi16 (WebAssemblywrapper texternalsym:$off))), + (LOAD16_U_I64 texternalsym:$off, (CONST_I32 0))>; +def : Pat<(i64 (extloadi32 (WebAssemblywrapper texternalsym:$off))), + (LOAD32_U_I64 tglobaladdr:$off, (CONST_I32 0))>; + +let Defs = [ARGUMENTS] in { + +// Basic store. +// Note that we split the patterns out of the instruction definitions because +// WebAssembly's stores return their operand value, and tablegen doesn't like +// instruction definition patterns that don't reference all of the output +// operands. +// Note: WebAssembly inverts SelectionDAG's usual operand order. +def STORE_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [], + "i32.store\t$dst, ${off}(${addr}), $val">; +def STORE_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [], + "i64.store\t$dst, ${off}(${addr}), $val">; +def STORE_F32 : I<(outs F32:$dst), (ins i32imm:$off, I32:$addr, F32:$val), [], + "f32.store\t$dst, ${off}(${addr}), $val">; +def STORE_F64 : I<(outs F64:$dst), (ins i32imm:$off, I32:$addr, F64:$val), [], + "f64.store\t$dst, ${off}(${addr}), $val">; + +} // Defs = [ARGUMENTS] + +// Select stores with no constant offset. +def : Pat<(store I32:$val, I32:$addr), (STORE_I32 0, I32:$addr, I32:$val)>; +def : Pat<(store I64:$val, I32:$addr), (STORE_I64 0, I32:$addr, I64:$val)>; +def : Pat<(store F32:$val, I32:$addr), (STORE_F32 0, I32:$addr, F32:$val)>; +def : Pat<(store F64:$val, I32:$addr), (STORE_F64 0, I32:$addr, F64:$val)>; + +// Select stores with a constant offset. 
+def : Pat<(store I32:$val, (regPlusImm imm:$off, I32:$addr)), + (STORE_I32 imm:$off, I32:$addr, I32:$val)>; +def : Pat<(store I64:$val, (regPlusImm imm:$off, I32:$addr)), + (STORE_I64 imm:$off, I32:$addr, I64:$val)>; +def : Pat<(store F32:$val, (regPlusImm imm:$off, I32:$addr)), + (STORE_F32 imm:$off, I32:$addr, F32:$val)>; +def : Pat<(store F64:$val, (regPlusImm imm:$off, I32:$addr)), + (STORE_F64 imm:$off, I32:$addr, F64:$val)>; +def : Pat<(store I32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), + (STORE_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; +def : Pat<(store I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), + (STORE_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; +def : Pat<(store F32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), + (STORE_F32 tglobaladdr:$off, I32:$addr, F32:$val)>; +def : Pat<(store F64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), + (STORE_F64 tglobaladdr:$off, I32:$addr, F64:$val)>; +def : Pat<(store I32:$val, (regPlusImm texternalsym:$off, I32:$addr)), + (STORE_I32 texternalsym:$off, I32:$addr, I32:$val)>; +def : Pat<(store I64:$val, (regPlusImm texternalsym:$off, I32:$addr)), + (STORE_I64 texternalsym:$off, I32:$addr, I64:$val)>; +def : Pat<(store F32:$val, (regPlusImm texternalsym:$off, I32:$addr)), + (STORE_F32 texternalsym:$off, I32:$addr, F32:$val)>; +def : Pat<(store F64:$val, (regPlusImm texternalsym:$off, I32:$addr)), + (STORE_F64 texternalsym:$off, I32:$addr, F64:$val)>; + +// Select stores with just a constant offset. +def : Pat<(store I32:$val, imm:$off), + (STORE_I32 imm:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(store I64:$val, imm:$off), + (STORE_I64 imm:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(store F32:$val, imm:$off), + (STORE_F32 imm:$off, (CONST_I32 0), F32:$val)>; +def : Pat<(store F64:$val, imm:$off), + (STORE_F64 imm:$off, (CONST_I32 0), F64:$val)>; +def : Pat<(store I32:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(store I64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(store F32:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_F32 tglobaladdr:$off, (CONST_I32 0), F32:$val)>; +def : Pat<(store F64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE_F64 tglobaladdr:$off, (CONST_I32 0), F64:$val)>; +def : Pat<(store I32:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(store I64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(store F32:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE_F32 texternalsym:$off, (CONST_I32 0), F32:$val)>; +def : Pat<(store F64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE_F64 texternalsym:$off, (CONST_I32 0), F64:$val)>; + +let Defs = [ARGUMENTS] in { + +// Truncating store. 
+def STORE8_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [], + "i32.store8\t$dst, ${off}(${addr}), $val">; +def STORE16_I32 : I<(outs I32:$dst), (ins i32imm:$off, I32:$addr, I32:$val), [], + "i32.store16\t$dst, ${off}(${addr}), $val">; +def STORE8_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [], + "i64.store8\t$dst, ${off}(${addr}), $val">; +def STORE16_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [], + "i64.store16\t$dst, ${off}(${addr}), $val">; +def STORE32_I64 : I<(outs I64:$dst), (ins i32imm:$off, I32:$addr, I64:$val), [], + "i64.store32\t$dst, ${off}(${addr}), $val">; + +} // Defs = [ARGUMENTS] + +// Select truncating stores with no constant offset. +def : Pat<(truncstorei8 I32:$val, I32:$addr), + (STORE8_I32 0, I32:$addr, I32:$val)>; +def : Pat<(truncstorei16 I32:$val, I32:$addr), + (STORE16_I32 0, I32:$addr, I32:$val)>; +def : Pat<(truncstorei8 I64:$val, I32:$addr), + (STORE8_I64 0, I32:$addr, I64:$val)>; +def : Pat<(truncstorei16 I64:$val, I32:$addr), + (STORE16_I64 0, I32:$addr, I64:$val)>; +def : Pat<(truncstorei32 I64:$val, I32:$addr), + (STORE32_I64 0, I32:$addr, I64:$val)>; + +// Select truncating stores with a constant offset. +def : Pat<(truncstorei8 I32:$val, (regPlusImm imm:$off, I32:$addr)), + (STORE8_I32 imm:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei16 I32:$val, (regPlusImm imm:$off, I32:$addr)), + (STORE16_I32 imm:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei8 I64:$val, (regPlusImm imm:$off, I32:$addr)), + (STORE8_I64 imm:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei16 I64:$val, (regPlusImm imm:$off, I32:$addr)), + (STORE16_I64 imm:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei32 I64:$val, (regPlusImm imm:$off, I32:$addr)), + (STORE32_I64 imm:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei8 I32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), + (STORE8_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei16 I32:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), + (STORE16_I32 tglobaladdr:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei8 I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), + (STORE8_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei16 I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), + (STORE16_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei32 I64:$val, (regPlusImm tglobaladdr:$off, I32:$addr)), + (STORE32_I64 tglobaladdr:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei8 I32:$val, (regPlusImm texternalsym:$off, I32:$addr)), + (STORE8_I32 texternalsym:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei16 I32:$val, (regPlusImm texternalsym:$off, I32:$addr)), + (STORE16_I32 texternalsym:$off, I32:$addr, I32:$val)>; +def : Pat<(truncstorei8 I64:$val, (regPlusImm texternalsym:$off, I32:$addr)), + (STORE8_I64 texternalsym:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei16 I64:$val, (regPlusImm texternalsym:$off, I32:$addr)), + (STORE16_I64 texternalsym:$off, I32:$addr, I64:$val)>; +def : Pat<(truncstorei32 I64:$val, (regPlusImm texternalsym:$off, I32:$addr)), + (STORE32_I64 texternalsym:$off, I32:$addr, I64:$val)>; + +// Select truncating stores with just a constant offset. 
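Before the final group of constant-offset patterns, a small sketch of what the truncating stores above do: they keep only the low-order bits of the local value, mirroring the widening loads earlier in this file (the values here are arbitrary):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t local32 = 0x12345678u;
      uint8_t stored8 = (uint8_t)local32;    // i32.store8 keeps 0x78

      uint64_t local64 = 0x0123456789ABCDEFull;
      uint32_t stored32 = (uint32_t)local64; // i64.store32 keeps 0x89ABCDEF

      std::printf("store8 -> %#x, store32 -> %#x\n", stored8, stored32);
    }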
+def : Pat<(truncstorei8 I32:$val, imm:$off), + (STORE8_I32 imm:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei16 I32:$val, imm:$off), + (STORE16_I32 imm:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei8 I64:$val, imm:$off), + (STORE8_I64 imm:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei16 I64:$val, imm:$off), + (STORE16_I64 imm:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei32 I64:$val, imm:$off), + (STORE32_I64 imm:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE8_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE16_I32 tglobaladdr:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE8_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE16_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper tglobaladdr:$off)), + (STORE32_I64 tglobaladdr:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei8 I32:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE8_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei16 I32:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE16_I32 texternalsym:$off, (CONST_I32 0), I32:$val)>; +def : Pat<(truncstorei8 I64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE8_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei16 I64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE16_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>; +def : Pat<(truncstorei32 I64:$val, (WebAssemblywrapper texternalsym:$off)), + (STORE32_I64 texternalsym:$off, (CONST_I32 0), I64:$val)>; + +let Defs = [ARGUMENTS] in { + +// Memory size. +def MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins), + [(set I32:$dst, (int_wasm_memory_size))], + "memory_size\t$dst">, + Requires<[HasAddr32]>; +def MEMORY_SIZE_I64 : I<(outs I64:$dst), (ins), + [(set I64:$dst, (int_wasm_memory_size))], + "memory_size\t$dst">, + Requires<[HasAddr64]>; + +// Grow memory. +def GROW_MEMORY_I32 : I<(outs), (ins I32:$delta), + [(int_wasm_grow_memory I32:$delta)], + "grow_memory\t$delta">, + Requires<[HasAddr32]>; +def GROW_MEMORY_I64 : I<(outs), (ins I64:$delta), + [(int_wasm_grow_memory I64:$delta)], + "grow_memory\t$delta">, + Requires<[HasAddr64]>; + +} // Defs = [ARGUMENTS] diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp new file mode 100644 index 0000000..b009a4e --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp @@ -0,0 +1,133 @@ +//===-- WebAssemblyLowerBrUnless.cpp - Lower br_unless --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file lowers br_unless into br_if with an inverted condition. +/// +/// br_unless is not currently in the spec, but it's very convenient for LLVM +/// to use. This pass allows LLVM to use it, for now. 
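Concretely, the pass below turns br_unless into br_if by inverting the comparison that defines the condition where it can: eq and ne swap, lt_s and ge_s swap, and so on. For floating point only eq and ne are swapped (with a NaN operand the ordered comparisons are not each other's complements, so they cannot simply be exchanged), and in every other case the pass falls back to an explicit compare-with-zero. A rough C++ sketch of that decision; the Cond enum is hypothetical:

    #include <cassert>
    #include <limits>

    // Hypothetical condition codes standing in for the wasm comparison opcodes.
    enum Cond { EQ, NE, LT_S, LE_S, GT_S, GE_S, LT_F, UNINVERTIBLE };

    // Integer comparisons invert cleanly; ordered FP comparisons other than
    // eq/ne do not, because !(a < b) is not (a >= b) when an operand is NaN.
    static Cond invert(Cond c) {
      switch (c) {
      case EQ:   return NE;
      case NE:   return EQ;
      case LT_S: return GE_S;
      case LE_S: return GT_S;
      case GT_S: return LE_S;
      case GE_S: return LT_S;
      default:   return UNINVERTIBLE; // fall back to "eq cond, 0"
      }
    }

    int main() {
      assert(invert(LT_S) == GE_S);
      assert(invert(LT_F) == UNINVERTIBLE);
      double nan = std::numeric_limits<double>::quiet_NaN();
      // With NaN, (a < b) and (a >= b) are both false, so they are not
      // complements; only FP eq/ne appear in the inversion table below.
      assert(!(nan < 1.0) && !(nan >= 1.0));
    }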
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblySubtarget.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-lower-br_unless" + +namespace { +class WebAssemblyLowerBrUnless final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly Lower br_unless"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyLowerBrUnless() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyLowerBrUnless::ID = 0; +FunctionPass *llvm::createWebAssemblyLowerBrUnless() { + return new WebAssemblyLowerBrUnless(); +} + +bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** Lowering br_unless **********\n" + "********** Function: " + << MF.getName() << '\n'); + + auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); + const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo(); + auto &MRI = MF.getRegInfo(); + + for (auto &MBB : MF) { + for (auto MII = MBB.begin(); MII != MBB.end(); ) { + MachineInstr *MI = &*MII++; + if (MI->getOpcode() != WebAssembly::BR_UNLESS) + continue; + + unsigned Cond = MI->getOperand(0).getReg(); + bool Inverted = false; + + // Attempt to invert the condition in place. 
+ if (MFI.isVRegStackified(Cond)) { + assert(MRI.hasOneDef(Cond)); + MachineInstr *Def = MRI.getVRegDef(Cond); + switch (Def->getOpcode()) { + using namespace WebAssembly; + case EQ_I32: Def->setDesc(TII.get(NE_I32)); Inverted = true; break; + case NE_I32: Def->setDesc(TII.get(EQ_I32)); Inverted = true; break; + case GT_S_I32: Def->setDesc(TII.get(LE_S_I32)); Inverted = true; break; + case GE_S_I32: Def->setDesc(TII.get(LT_S_I32)); Inverted = true; break; + case LT_S_I32: Def->setDesc(TII.get(GE_S_I32)); Inverted = true; break; + case LE_S_I32: Def->setDesc(TII.get(GT_S_I32)); Inverted = true; break; + case GT_U_I32: Def->setDesc(TII.get(LE_U_I32)); Inverted = true; break; + case GE_U_I32: Def->setDesc(TII.get(LT_U_I32)); Inverted = true; break; + case LT_U_I32: Def->setDesc(TII.get(GE_U_I32)); Inverted = true; break; + case LE_U_I32: Def->setDesc(TII.get(GT_U_I32)); Inverted = true; break; + case EQ_I64: Def->setDesc(TII.get(NE_I64)); Inverted = true; break; + case NE_I64: Def->setDesc(TII.get(EQ_I64)); Inverted = true; break; + case GT_S_I64: Def->setDesc(TII.get(LE_S_I64)); Inverted = true; break; + case GE_S_I64: Def->setDesc(TII.get(LT_S_I64)); Inverted = true; break; + case LT_S_I64: Def->setDesc(TII.get(GE_S_I64)); Inverted = true; break; + case LE_S_I64: Def->setDesc(TII.get(GT_S_I64)); Inverted = true; break; + case GT_U_I64: Def->setDesc(TII.get(LE_U_I64)); Inverted = true; break; + case GE_U_I64: Def->setDesc(TII.get(LT_U_I64)); Inverted = true; break; + case LT_U_I64: Def->setDesc(TII.get(GE_U_I64)); Inverted = true; break; + case LE_U_I64: Def->setDesc(TII.get(GT_U_I64)); Inverted = true; break; + case EQ_F32: Def->setDesc(TII.get(NE_F32)); Inverted = true; break; + case NE_F32: Def->setDesc(TII.get(EQ_F32)); Inverted = true; break; + case EQ_F64: Def->setDesc(TII.get(NE_F64)); Inverted = true; break; + case NE_F64: Def->setDesc(TII.get(EQ_F64)); Inverted = true; break; + default: break; + } + } + + // If we weren't able to invert the condition in place. Insert an + // expression to invert it. + if (!Inverted) { + unsigned ZeroReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + MFI.stackifyVReg(ZeroReg); + BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::CONST_I32), ZeroReg) + .addImm(0); + unsigned Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + MFI.stackifyVReg(Tmp); + BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQ_I32), Tmp) + .addReg(Cond) + .addReg(ZeroReg); + Cond = Tmp; + Inverted = true; + } + + // The br_unless condition has now been inverted. Insert a br_if and + // delete the br_unless. + assert(Inverted); + BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::BR_IF)) + .addReg(Cond) + .addOperand(MI->getOperand(1)); + MBB.erase(MI); + } + } + + return true; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp new file mode 100644 index 0000000..a953f82 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -0,0 +1,106 @@ +// WebAssemblyMCInstLower.cpp - Convert WebAssembly MachineInstr to an MCInst // +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file contains code to lower WebAssembly MachineInstrs to their +/// corresponding MCInst records. 
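One subtlety worth flagging before the implementation: as the TODO in the floating-point immediate case below notes, MC ends up representing FP immediates as double, which is harmless for ordinary values but can change the bit pattern of a NaN. A small sketch of that effect; the exact behaviour depends on the host FP hardware, and on common targets the signaling NaN is quietened:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      // A signaling-NaN bit pattern for float: quiet bit clear, payload set.
      uint32_t before = 0x7FA00001u;
      float f;
      std::memcpy(&f, &before, sizeof f);

      // Widening to double and narrowing back, as happens when every FP
      // immediate is funnelled through a double, typically sets the quiet bit,
      // so the original bits are not preserved.
      volatile double d = f;
      float g = (float)d;

      uint32_t after;
      std::memcpy(&after, &g, sizeof g);
      std::printf("before: %#010x after: %#010x\n", before, after);
    }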
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssemblyMCInstLower.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/Constants.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +MCSymbol * +WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { + return Printer.getSymbol(MO.getGlobal()); +} + +MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol( + const MachineOperand &MO) const { + return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); +} + +MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(const MachineOperand &MO, + MCSymbol *Sym) const { + assert(MO.getTargetFlags() == 0 && "WebAssembly does not use target flags"); + + const MCExpr *Expr = MCSymbolRefExpr::create(Sym, Ctx); + + int64_t Offset = MO.getOffset(); + if (Offset != 0) { + assert(!MO.isJTI() && "Unexpected offset with jump table index"); + Expr = + MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, Ctx), Ctx); + } + + return MCOperand::createExpr(Expr); +} + +void WebAssemblyMCInstLower::Lower(const MachineInstr *MI, + MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + MCOperand MCOp; + switch (MO.getType()) { + default: + MI->dump(); + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Register: { + // Ignore all implicit register operands. + if (MO.isImplicit()) + continue; + const WebAssemblyFunctionInfo &MFI = + *MI->getParent()->getParent()->getInfo<WebAssemblyFunctionInfo>(); + unsigned WAReg = MFI.getWAReg(MO.getReg()); + MCOp = MCOperand::createReg(WAReg); + break; + } + case MachineOperand::MO_Immediate: + MCOp = MCOperand::createImm(MO.getImm()); + break; + case MachineOperand::MO_FPImmediate: { + // TODO: MC converts all floating point immediate operands to double. + // This is fine for numeric values, but may cause NaNs to change bits. + const ConstantFP *Imm = MO.getFPImm(); + if (Imm->getType()->isFloatTy()) + MCOp = MCOperand::createFPImm(Imm->getValueAPF().convertToFloat()); + else if (Imm->getType()->isDoubleTy()) + MCOp = MCOperand::createFPImm(Imm->getValueAPF().convertToDouble()); + else + llvm_unreachable("unknown floating point immediate type"); + break; + } + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::createExpr( + MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx)); + break; + case MachineOperand::MO_GlobalAddress: + MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); + break; + case MachineOperand::MO_ExternalSymbol: + MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); + break; + } + + OutMI.addOperand(MCOp); + } +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h new file mode 100644 index 0000000..6d70470 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.h @@ -0,0 +1,45 @@ +//===-- WebAssemblyMCInstLower.h - Lower MachineInstr to MCInst -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file declares the class to lower WebAssembly MachineInstrs to +/// their corresponding MCInst records. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H +#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMCINSTLOWER_H + +#include "llvm/MC/MCInst.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { +class AsmPrinter; +class MCContext; +class MCSymbol; +class MachineInstr; +class MachineOperand; + +/// This class is used to lower an MachineInstr into an MCInst. +class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower { + MCContext &Ctx; + AsmPrinter &Printer; + + MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; + MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; + +public: + WebAssemblyMCInstLower(MCContext &ctx, AsmPrinter &printer) + : Ctx(ctx), Printer(printer) {} + void Lower(const MachineInstr *MI, MCInst &OutMI) const; +}; +} // end namespace llvm + +#endif diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index 542d984..225c5d3 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -17,3 +17,9 @@ using namespace llvm; WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() {} + +void WebAssemblyFunctionInfo::initWARegs() { + assert(WARegs.empty()); + unsigned Reg = UnusedReg; + WARegs.resize(MF.getRegInfo().getNumVirtRegs(), Reg); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index fc5e910..6a60280 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -1,4 +1,4 @@ -// WebAssemblyMachineFuctionInfo.h-WebAssembly machine function info -*- C++ -*- +// WebAssemblyMachineFunctionInfo.h-WebAssembly machine function info-*- C++ -*- // // The LLVM Compiler Infrastructure // @@ -16,8 +16,7 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYMACHINEFUNCTIONINFO_H -#include "WebAssemblyRegisterInfo.h" -#include "llvm/CodeGen/MachineFunction.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { @@ -27,9 +26,70 @@ namespace llvm { class WebAssemblyFunctionInfo final : public MachineFunctionInfo { MachineFunction &MF; + std::vector<MVT> Params; + + /// A mapping from CodeGen vreg index to WebAssembly register number. + std::vector<unsigned> WARegs; + + /// A mapping from CodeGen vreg index to a boolean value indicating whether + /// the given register is considered to be "stackified", meaning it has been + /// determined or made to meet the stack requirements: + /// - single use (per path) + /// - single def (per path) + /// - defined and used in LIFO order with other stack registers + BitVector VRegStackified; + + // One entry for each possible target reg. we expect it to be small. 
+ std::vector<unsigned> PhysRegs; + public: - explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {} + explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) { + PhysRegs.resize(WebAssembly::NUM_TARGET_REGS, -1U); + } ~WebAssemblyFunctionInfo() override; + + void addParam(MVT VT) { Params.push_back(VT); } + const std::vector<MVT> &getParams() const { return Params; } + + static const unsigned UnusedReg = -1u; + + void stackifyVReg(unsigned VReg) { + if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size()) + VRegStackified.resize(TargetRegisterInfo::virtReg2Index(VReg) + 1); + VRegStackified.set(TargetRegisterInfo::virtReg2Index(VReg)); + } + bool isVRegStackified(unsigned VReg) const { + if (TargetRegisterInfo::virtReg2Index(VReg) >= VRegStackified.size()) + return false; + return VRegStackified.test(TargetRegisterInfo::virtReg2Index(VReg)); + } + + void initWARegs(); + void setWAReg(unsigned VReg, unsigned WAReg) { + assert(WAReg != UnusedReg); + assert(TargetRegisterInfo::virtReg2Index(VReg) < WARegs.size()); + WARegs[TargetRegisterInfo::virtReg2Index(VReg)] = WAReg; + } + unsigned getWAReg(unsigned Reg) const { + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + assert(TargetRegisterInfo::virtReg2Index(Reg) < WARegs.size()); + return WARegs[TargetRegisterInfo::virtReg2Index(Reg)]; + } + return PhysRegs[Reg]; + } + // If new virtual registers are created after initWARegs has been called, + // this function can be used to add WebAssembly register mappings for them. + void addWAReg(unsigned VReg, unsigned WAReg) { + assert(VReg = WARegs.size()); + WARegs.push_back(WAReg); + } + + void addPReg(unsigned PReg, unsigned WAReg) { + assert(PReg < WebAssembly::NUM_TARGET_REGS); + assert(WAReg < -1U); + PhysRegs[PReg] = WAReg; + } + const std::vector<unsigned> &getPhysRegs() const { return PhysRegs; } }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp new file mode 100644 index 0000000..4dc401a --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp @@ -0,0 +1,76 @@ +//===-- WebAssemblyOptimizeReturned.cpp - Optimize "returned" attributes --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief Optimize calls with "returned" attributes for WebAssembly. 
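For context: a parameter carrying the "returned" attribute is passed through and returned unchanged (memcpy-style functions that return their destination argument are the canonical example), so any use of that argument which the call dominates can read the call's result instead. Rewriting uses that way can let later WebAssembly passes keep the value on the value stack rather than in a local. A source-level sketch of the shape of code this targets; copy_then_use and use are hypothetical:

    #include <cstring>

    static void use(char *p) { (void)p; }

    // The later use reads the original argument...
    static void copy_then_use(char *dst, const char *src, std::size_t n) {
      std::memcpy(dst, src, n); // memcpy returns its first argument
      use(dst);
    }

    // ...whereas the form this pass rewrites toward reads the call's result.
    static void copy_then_use_returned(char *dst, const char *src,
                                       std::size_t n) {
      char *copied = static_cast<char *>(std::memcpy(dst, src, n));
      use(copied);
    }

    int main() {
      char a[4] = "abc", b[4];
      copy_then_use(b, a, sizeof a);
      copy_then_use_returned(b, a, sizeof a);
    }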
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-optimize-returned" + +namespace { +class OptimizeReturned final : public FunctionPass, + public InstVisitor<OptimizeReturned> { + const char *getPassName() const override { + return "WebAssembly Optimize Returned"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + FunctionPass::getAnalysisUsage(AU); + } + + bool runOnFunction(Function &F) override; + + DominatorTree *DT; + +public: + static char ID; + OptimizeReturned() : FunctionPass(ID), DT(nullptr) {} + + void visitCallSite(CallSite CS); +}; +} // End anonymous namespace + +char OptimizeReturned::ID = 0; +FunctionPass *llvm::createWebAssemblyOptimizeReturned() { + return new OptimizeReturned(); +} + +void OptimizeReturned::visitCallSite(CallSite CS) { + for (unsigned i = 0, e = CS.getNumArgOperands(); i < e; ++i) + if (CS.paramHasAttr(1 + i, Attribute::Returned)) { + Instruction *Inst = CS.getInstruction(); + Value *Arg = CS.getArgOperand(i); + // Ignore constants, globals, undef, etc. + if (isa<Constant>(Arg)) + continue; + // Like replaceDominatedUsesWith but using Instruction/Use dominance. + for (auto UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE;) { + Use &U = *UI++; + if (DT->dominates(Inst, U)) + U.set(Inst); + } + } +} + +bool OptimizeReturned::runOnFunction(Function &F) { + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + visit(F); + return true; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPEI.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPEI.cpp new file mode 100644 index 0000000..d570d42 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPEI.cpp @@ -0,0 +1,1066 @@ +//===-- WebAssemblyPEI.cpp - Insert Prolog/Epilog code in function --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass is responsible for finalizing the functions frame layout, saving +// callee saved registers, and for emitting prolog & epilog code for the +// function. +// +// This pass must be run after register allocation. After this pass is +// executed, it is illegal to construct MO_FrameIndex operands. +// +// This is a copy of lib/CodeGen/PrologEpilogInserter.cpp except that it does +// not assert that all virtual registers are gone (because WebAssembly currently +// uses virtual rather than physical registers), and only runs +// MRI.clearVirtRegs() if scavenging happened (which it never does). It also +// uses a different class name so it can be registered via INITIALIZE_PASS. +// It is otherwise unmodified, so any changes to the target-independent PEI +// can be easily applied. 
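visitCallSite above rewrites every use of a "returned" argument that the call dominates, so those uses read the call's result instead of the original argument. Over a single straight-line block, where dominance reduces to program order, the transformation looks like the following toy sketch (the Value/Use modelling is a stand-in, not LLVM's):

// Toy model of the "returned"-attribute rewrite on one straight-line block:
// after a call whose argument is marked "returned", later uses of the argument
// may use the call's result instead. Instruction position stands in for
// dominance.
#include <cassert>
#include <vector>

struct Use { int ValueId; };                 // a single operand slot
struct Inst { std::vector<Use> Uses; };      // toy instruction: just its operands

// Replace uses of ArgId with CallResultId in every instruction after CallPos.
static void rewriteReturnedUses(std::vector<Inst> &Block, size_t CallPos,
                                int ArgId, int CallResultId) {
  for (size_t i = CallPos + 1; i < Block.size(); ++i)
    for (Use &U : Block[i].Uses)
      if (U.ValueId == ArgId)
        U.ValueId = CallResultId;            // the call "dominates" this use
}

int main() {
  // %0 = arg, %1 = call foo(%0) with foo's parameter marked "returned",
  // %2 = add %0, %0  -- both operands of the add can be rewritten to %1.
  std::vector<Inst> Block(3);
  Block[1].Uses = {Use{0}};                  // the call uses %0
  Block[2].Uses = {Use{0}, Use{0}};          // the add uses %0 twice
  rewriteReturnedUses(Block, /*CallPos=*/1, /*ArgId=*/0, /*CallResultId=*/1);
  assert(Block[2].Uses[0].ValueId == 1 && Block[2].Uses[1].ValueId == 1);
  return 0;
}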
+//===----------------------------------------------------------------------===// + +#include "llvm/ADT/IndexedMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/RegisterScavenging.h" +#include "llvm/CodeGen/StackProtector.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <climits> + +using namespace llvm; + +#define DEBUG_TYPE "pei" +namespace llvm { +void initializeWasmPEIPass(PassRegistry&); +} +namespace { +class WasmPEI : public MachineFunctionPass { +public: + static char ID; + WasmPEI() : MachineFunctionPass(ID) { + initializeWasmPEIPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// runOnMachineFunction - Insert prolog/epilog code and replace abstract + /// frame indexes with appropriate references. + /// + bool runOnMachineFunction(MachineFunction &Fn) override; + +private: + RegScavenger *RS; + + // MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved + // stack frame indexes. + unsigned MinCSFrameIndex, MaxCSFrameIndex; + + // Save and Restore blocks of the current function. Typically there is a + // single save block, unless Windows EH funclets are involved. + SmallVector<MachineBasicBlock *, 1> SaveBlocks; + SmallVector<MachineBasicBlock *, 4> RestoreBlocks; + + // Flag to control whether to use the register scavenger to resolve + // frame index materialization registers. Set according to + // TRI->requiresFrameIndexScavenging() for the current function. 
+ bool FrameIndexVirtualScavenging; + + void calculateSets(MachineFunction &Fn); + void calculateCallsInformation(MachineFunction &Fn); + void assignCalleeSavedSpillSlots(MachineFunction &Fn, + const BitVector &SavedRegs); + void insertCSRSpillsAndRestores(MachineFunction &Fn); + void calculateFrameObjectOffsets(MachineFunction &Fn); + void replaceFrameIndices(MachineFunction &Fn); + void replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, + int &SPAdj); + void scavengeFrameVirtualRegs(MachineFunction &Fn); + void insertPrologEpilogCode(MachineFunction &Fn); +}; +} // namespace + +char WasmPEI::ID = 0; + +namespace llvm { +FunctionPass *createWebAssemblyPEI() { + return new WasmPEI(); +} +} + +static cl::opt<unsigned> +WarnStackSize("wasm-warn-stack-size", cl::Hidden, cl::init((unsigned)-1), + cl::desc("Warn for stack size bigger than the given" + " number")); + +INITIALIZE_PASS_BEGIN(WasmPEI, "wasmprologepilog", + "Wasm Prologue/Epilogue Insertion", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(StackProtector) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_END(WasmPEI, "wasmprologepilog", + "Wasm Prologue/Epilogue Insertion & Frame Finalization", + false, false) + +STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged"); +STATISTIC(NumBytesStackSpace, + "Number of bytes used for stack in all functions"); + +void WasmPEI::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addPreserved<MachineLoopInfo>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<StackProtector>(); + AU.addRequired<TargetPassConfig>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +/// Compute the set of return blocks +void WasmPEI::calculateSets(MachineFunction &Fn) { + const MachineFrameInfo *MFI = Fn.getFrameInfo(); + + // Even when we do not change any CSR, we still want to insert the + // prologue and epilogue of the function. + // So set the save points for those. + + // Use the points found by shrink-wrapping, if any. + if (MFI->getSavePoint()) { + SaveBlocks.push_back(MFI->getSavePoint()); + assert(MFI->getRestorePoint() && "Both restore and save must be set"); + MachineBasicBlock *RestoreBlock = MFI->getRestorePoint(); + // If RestoreBlock does not have any successor and is not a return block + // then the end point is unreachable and we do not need to insert any + // epilogue. + if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock()) + RestoreBlocks.push_back(RestoreBlock); + return; + } + + // Save refs to entry and return blocks. + SaveBlocks.push_back(&Fn.front()); + for (MachineBasicBlock &MBB : Fn) { + if (MBB.isEHFuncletEntry()) + SaveBlocks.push_back(&MBB); + if (MBB.isReturnBlock()) + RestoreBlocks.push_back(&MBB); + } +} + +/// StackObjSet - A set of stack object indexes +typedef SmallSetVector<int, 8> StackObjSet; + +/// runOnMachineFunction - Insert prolog/epilog code and replace abstract +/// frame indexes with appropriate references. +/// +bool WasmPEI::runOnMachineFunction(MachineFunction &Fn) { + const Function* F = Fn.getFunction(); + const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); + + // LOCALMOD: assert removed from target-independent PEI + //assert(!Fn.getRegInfo().getNumVirtRegs() && "Regalloc must assign all vregs"); + + RS = TRI->requiresRegisterScavenging(Fn) ? 
new RegScavenger() : nullptr; + FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn); + + // Calculate the MaxCallFrameSize and AdjustsStack variables for the + // function's frame information. Also eliminates call frame pseudo + // instructions. + calculateCallsInformation(Fn); + + // Determine which of the registers in the callee save list should be saved. + BitVector SavedRegs; + TFI->determineCalleeSaves(Fn, SavedRegs, RS); + + // Insert spill code for any callee saved registers that are modified. + assignCalleeSavedSpillSlots(Fn, SavedRegs); + + // Determine placement of CSR spill/restore code: + // place all spills in the entry block, all restores in return blocks. + calculateSets(Fn); + + // Add the code to save and restore the callee saved registers. + if (!F->hasFnAttribute(Attribute::Naked)) + insertCSRSpillsAndRestores(Fn); + + // Allow the target machine to make final modifications to the function + // before the frame layout is finalized. + TFI->processFunctionBeforeFrameFinalized(Fn, RS); + + // Calculate actual frame offsets for all abstract stack objects... + calculateFrameObjectOffsets(Fn); + + // Add prolog and epilog code to the function. This function is required + // to align the stack frame as necessary for any stack variables or + // called functions. Because of this, calculateCalleeSavedRegisters() + // must be called before this function in order to set the AdjustsStack + // and MaxCallFrameSize variables. + if (!F->hasFnAttribute(Attribute::Naked)) + insertPrologEpilogCode(Fn); + + // Replace all MO_FrameIndex operands with physical register references + // and actual offsets. + // + replaceFrameIndices(Fn); + + // If register scavenging is needed, as we've enabled doing it as a + // post-pass, scavenge the virtual registers that frame index elimination + // inserted. + if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) { + scavengeFrameVirtualRegs(Fn); + // Clear any vregs created by virtual scavenging. + // LOCALMOD: made this call conditional with scavengeFrameVirtualregs() + Fn.getRegInfo().clearVirtRegs(); + } + + // Warn on stack size when we exceeds the given limit. + MachineFrameInfo *MFI = Fn.getFrameInfo(); + uint64_t StackSize = MFI->getStackSize(); + if (WarnStackSize.getNumOccurrences() > 0 && WarnStackSize < StackSize) { + DiagnosticInfoStackSize DiagStackSize(*F, StackSize); + F->getContext().diagnose(DiagStackSize); + } + + delete RS; + SaveBlocks.clear(); + RestoreBlocks.clear(); + return true; +} + +/// calculateCallsInformation - Calculate the MaxCallFrameSize and AdjustsStack +/// variables for the function's frame information and eliminate call frame +/// pseudo instructions. +void WasmPEI::calculateCallsInformation(MachineFunction &Fn) { + const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); + MachineFrameInfo *MFI = Fn.getFrameInfo(); + + unsigned MaxCallFrameSize = 0; + bool AdjustsStack = MFI->adjustsStack(); + + // Get the function call frame set-up and tear-down instruction opcode + unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode(); + unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); + + // Early exit for targets which have no call frame setup/destroy pseudo + // instructions. 
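The scan that continues just below records, for every call-frame setup/destroy pseudo, the size immediate it carries, keeping the maximum as MaxCallFrameSize, and flags AdjustsStack (also for inline asm that requests stack alignment). A standalone sketch of the same bookkeeping over a flat list of toy instructions, with made-up opcode values:

// Sketch of the MaxCallFrameSize/AdjustsStack computation. Opcode numbers are
// arbitrary placeholders, not real target opcodes.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

enum : unsigned { Other = 0, FrameSetup = 1, FrameDestroy = 2, AlignStackAsm = 3 };

struct ToyInst { unsigned Opcode; uint64_t Imm; };

struct CallInfo { uint64_t MaxCallFrameSize = 0; bool AdjustsStack = false; };

static CallInfo scanCalls(const std::vector<ToyInst> &Insts) {
  CallInfo CI;
  for (const ToyInst &I : Insts) {
    if (I.Opcode == FrameSetup || I.Opcode == FrameDestroy) {
      // The pseudo's immediate is the size of the call frame it sets up.
      CI.MaxCallFrameSize = std::max(CI.MaxCallFrameSize, I.Imm);
      CI.AdjustsStack = true;
    } else if (I.Opcode == AlignStackAsm) {
      // Inline asm that realigns the stack also forces an adjustment.
      CI.AdjustsStack = true;
    }
  }
  return CI;
}

int main() {
  CallInfo CI = scanCalls({{FrameSetup, 16}, {Other, 0}, {FrameDestroy, 16},
                           {FrameSetup, 32}, {FrameDestroy, 32}});
  assert(CI.MaxCallFrameSize == 32 && CI.AdjustsStack);
  return 0;
}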
+ if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u) + return; + + std::vector<MachineBasicBlock::iterator> FrameSDOps; + for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) + for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) + if (I->getOpcode() == FrameSetupOpcode || + I->getOpcode() == FrameDestroyOpcode) { + assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo" + " instructions should have a single immediate argument!"); + unsigned Size = I->getOperand(0).getImm(); + if (Size > MaxCallFrameSize) MaxCallFrameSize = Size; + AdjustsStack = true; + FrameSDOps.push_back(I); + } else if (I->isInlineAsm()) { + // Some inline asm's need a stack frame, as indicated by operand 1. + unsigned ExtraInfo = I->getOperand(InlineAsm::MIOp_ExtraInfo).getImm(); + if (ExtraInfo & InlineAsm::Extra_IsAlignStack) + AdjustsStack = true; + } + + MFI->setAdjustsStack(AdjustsStack); + MFI->setMaxCallFrameSize(MaxCallFrameSize); + + for (std::vector<MachineBasicBlock::iterator>::iterator + i = FrameSDOps.begin(), e = FrameSDOps.end(); i != e; ++i) { + MachineBasicBlock::iterator I = *i; + + // If call frames are not being included as part of the stack frame, and + // the target doesn't indicate otherwise, remove the call frame pseudos + // here. The sub/add sp instruction pairs are still inserted, but we don't + // need to track the SP adjustment for frame index elimination. + if (TFI->canSimplifyCallFramePseudos(Fn)) + TFI->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I); + } +} + +void WasmPEI::assignCalleeSavedSpillSlots(MachineFunction &F, + const BitVector &SavedRegs) { + // These are used to keep track the callee-save area. Initialize them. + MinCSFrameIndex = INT_MAX; + MaxCSFrameIndex = 0; + + if (SavedRegs.empty()) + return; + + const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo(); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F); + + std::vector<CalleeSavedInfo> CSI; + for (unsigned i = 0; CSRegs[i]; ++i) { + unsigned Reg = CSRegs[i]; + if (SavedRegs.test(Reg)) + CSI.push_back(CalleeSavedInfo(Reg)); + } + + const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering(); + MachineFrameInfo *MFI = F.getFrameInfo(); + if (!TFI->assignCalleeSavedSpillSlots(F, RegInfo, CSI)) { + // If target doesn't implement this, use generic code. + + if (CSI.empty()) + return; // Early exit if no callee saved registers are modified! + + unsigned NumFixedSpillSlots; + const TargetFrameLowering::SpillSlot *FixedSpillSlots = + TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots); + + // Now that we know which registers need to be saved and restored, allocate + // stack slots for them. + for (std::vector<CalleeSavedInfo>::iterator I = CSI.begin(), E = CSI.end(); + I != E; ++I) { + unsigned Reg = I->getReg(); + const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); + + int FrameIdx; + if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) { + I->setFrameIdx(FrameIdx); + continue; + } + + // Check to see if this physreg must be spilled to a particular stack slot + // on this target. + const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots; + while (FixedSlot != FixedSpillSlots + NumFixedSpillSlots && + FixedSlot->Reg != Reg) + ++FixedSlot; + + if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) { + // Nope, just spill it anywhere convenient. 
+ unsigned Align = RC->getAlignment(); + unsigned StackAlign = TFI->getStackAlignment(); + + // We may not be able to satisfy the desired alignment specification of + // the TargetRegisterClass if the stack alignment is smaller. Use the + // min. + Align = std::min(Align, StackAlign); + FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true); + if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; + if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; + } else { + // Spill it to the stack where we must. + FrameIdx = + MFI->CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset); + } + + I->setFrameIdx(FrameIdx); + } + } + + MFI->setCalleeSavedInfo(CSI); +} + +/// Helper function to update the liveness information for the callee-saved +/// registers. +static void updateLiveness(MachineFunction &MF) { + MachineFrameInfo *MFI = MF.getFrameInfo(); + // Visited will contain all the basic blocks that are in the region + // where the callee saved registers are alive: + // - Anything that is not Save or Restore -> LiveThrough. + // - Save -> LiveIn. + // - Restore -> LiveOut. + // The live-out is not attached to the block, so no need to keep + // Restore in this set. + SmallPtrSet<MachineBasicBlock *, 8> Visited; + SmallVector<MachineBasicBlock *, 8> WorkList; + MachineBasicBlock *Entry = &MF.front(); + MachineBasicBlock *Save = MFI->getSavePoint(); + + if (!Save) + Save = Entry; + + if (Entry != Save) { + WorkList.push_back(Entry); + Visited.insert(Entry); + } + Visited.insert(Save); + + MachineBasicBlock *Restore = MFI->getRestorePoint(); + if (Restore) + // By construction Restore cannot be visited, otherwise it + // means there exists a path to Restore that does not go + // through Save. + WorkList.push_back(Restore); + + while (!WorkList.empty()) { + const MachineBasicBlock *CurBB = WorkList.pop_back_val(); + // By construction, the region that is after the save point is + // dominated by the Save and post-dominated by the Restore. + if (CurBB == Save && Save != Restore) + continue; + // Enqueue all the successors not already visited. + // Those are by construction either before Save or after Restore. + for (MachineBasicBlock *SuccBB : CurBB->successors()) + if (Visited.insert(SuccBB).second) + WorkList.push_back(SuccBB); + } + + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + for (MachineBasicBlock *MBB : Visited) { + MCPhysReg Reg = CSI[i].getReg(); + // Add the callee-saved register as live-in. + // It's killed at the spill. + if (!MBB->isLiveIn(Reg)) + MBB->addLiveIn(Reg); + } + } +} + +/// insertCSRSpillsAndRestores - Insert spill and restore code for +/// callee saved registers used in the function. +/// +void WasmPEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { + // Get callee saved register information. + MachineFrameInfo *MFI = Fn.getFrameInfo(); + const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo(); + + MFI->setCalleeSavedInfoValid(true); + + // Early exit if no callee saved registers are modified! + if (CSI.empty()) + return; + + const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); + const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); + MachineBasicBlock::iterator I; + + // Spill using target interface. 
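updateLiveness above walks the CFG from the entry, stopping at the save point, to collect every block in which the callee-saved registers must be marked live-in. A simplified standalone version over a toy CFG (the restore-point handling is omitted here):

// Sketch of the liveness-update walk: gather the blocks reachable from the
// entry without walking past the save point; callee-saved registers are then
// added as live-in to exactly those blocks.
#include <cassert>
#include <set>
#include <vector>

using CFG = std::vector<std::vector<int>>;  // block id -> successor ids

static std::set<int> liveCSRBlocks(const CFG &Succs, int Entry, int Save) {
  std::set<int> Visited;
  std::vector<int> WorkList;
  Visited.insert(Save);
  if (Entry != Save) {
    Visited.insert(Entry);
    WorkList.push_back(Entry);
  }
  while (!WorkList.empty()) {
    int BB = WorkList.back();
    WorkList.pop_back();
    if (BB == Save)
      continue;                             // don't walk past the save point
    for (int Succ : Succs[BB])
      if (Visited.insert(Succ).second)
        WorkList.push_back(Succ);
  }
  return Visited;
}

int main() {
  // 0 -> 1 -> 2, save point in block 1: only blocks 0 and 1 are collected.
  CFG Succs = {{1}, {2}, {}};
  std::set<int> Live = liveCSRBlocks(Succs, /*Entry=*/0, /*Save=*/1);
  assert(Live.count(0) && Live.count(1) && !Live.count(2));
  return 0;
}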
+ for (MachineBasicBlock *SaveBlock : SaveBlocks) { + I = SaveBlock->begin(); + if (!TFI->spillCalleeSavedRegisters(*SaveBlock, I, CSI, TRI)) { + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + // Insert the spill to the stack frame. + unsigned Reg = CSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.storeRegToStackSlot(*SaveBlock, I, Reg, true, CSI[i].getFrameIdx(), + RC, TRI); + } + } + // Update the live-in information of all the blocks up to the save point. + updateLiveness(Fn); + } + + // Restore using target interface. + for (MachineBasicBlock *MBB : RestoreBlocks) { + I = MBB->end(); + + // Skip over all terminator instructions, which are part of the return + // sequence. + MachineBasicBlock::iterator I2 = I; + while (I2 != MBB->begin() && (--I2)->isTerminator()) + I = I2; + + bool AtStart = I == MBB->begin(); + MachineBasicBlock::iterator BeforeI = I; + if (!AtStart) + --BeforeI; + + // Restore all registers immediately before the return and any + // terminators that precede it. + if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) { + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI); + assert(I != MBB->begin() && + "loadRegFromStackSlot didn't insert any code!"); + // Insert in reverse order. loadRegFromStackSlot can insert + // multiple instructions. + if (AtStart) + I = MBB->begin(); + else { + I = BeforeI; + ++I; + } + } + } + } +} + +/// AdjustStackOffset - Helper function used to adjust the stack frame offset. +static inline void +AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx, + bool StackGrowsDown, int64_t &Offset, + unsigned &MaxAlign, unsigned Skew) { + // If the stack grows down, add the object size to find the lowest address. + if (StackGrowsDown) + Offset += MFI->getObjectSize(FrameIdx); + + unsigned Align = MFI->getObjectAlignment(FrameIdx); + + // If the alignment of this object is greater than that of the stack, then + // increase the stack alignment to match. + MaxAlign = std::max(MaxAlign, Align); + + // Adjust to alignment boundary. + Offset = RoundUpToAlignment(Offset, Align, Skew); + + if (StackGrowsDown) { + DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n"); + MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset + } else { + DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n"); + MFI->setObjectOffset(FrameIdx, Offset); + Offset += MFI->getObjectSize(FrameIdx); + } +} + +/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e., +/// those required to be close to the Stack Protector) to stack offsets. +static void +AssignProtectedObjSet(const StackObjSet &UnassignedObjs, + SmallSet<int, 16> &ProtectedObjs, + MachineFrameInfo *MFI, bool StackGrowsDown, + int64_t &Offset, unsigned &MaxAlign, unsigned Skew) { + + for (StackObjSet::const_iterator I = UnassignedObjs.begin(), + E = UnassignedObjs.end(); I != E; ++I) { + int i = *I; + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew); + ProtectedObjs.insert(i); + } +} + +/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the +/// abstract stack objects. 
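AdjustStackOffset above is the basic building block of the layout that calculateFrameObjectOffsets (below) applies repeatedly: for a downward-growing stack it advances the running offset by the object size, rounds up to the object's alignment (with an optional skew), and records the negated offset. The same arithmetic in isolation, with a local helper standing in for RoundUpToAlignment:

// Standalone version of the StackGrowsDown branch of AdjustStackOffset.
#include <cassert>
#include <cstdint>

// Round Value up so that (Value - Skew) is a multiple of Align.
static int64_t alignTo(int64_t Value, unsigned Align, unsigned Skew) {
  return ((Value + Align - 1 - Skew) / Align) * Align + Skew;
}

// Returns the object's offset relative to SP and advances Offset/MaxAlign.
static int64_t allocObjectDown(int64_t &Offset, unsigned &MaxAlign,
                               uint64_t Size, unsigned Align, unsigned Skew) {
  Offset += Size;                       // lowest address of the object
  if (Align > MaxAlign)
    MaxAlign = Align;                   // the frame must be at least this aligned
  Offset = alignTo(Offset, Align, Skew);
  return -Offset;                       // objects live below the incoming SP
}

int main() {
  int64_t Offset = 0;
  unsigned MaxAlign = 1;
  int64_t A = allocObjectDown(Offset, MaxAlign, /*Size=*/4, /*Align=*/4,  /*Skew=*/0);
  int64_t B = allocObjectDown(Offset, MaxAlign, /*Size=*/8, /*Align=*/16, /*Skew=*/0);
  assert(A == -4 && B == -16 && MaxAlign == 16);
  return 0;
}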
+/// +void WasmPEI::calculateFrameObjectOffsets(MachineFunction &Fn) { + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); + StackProtector *SP = &getAnalysis<StackProtector>(); + + bool StackGrowsDown = + TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown; + + // Loop over all of the stack objects, assigning sequential addresses... + MachineFrameInfo *MFI = Fn.getFrameInfo(); + + // Start at the beginning of the local area. + // The Offset is the distance from the stack top in the direction + // of stack growth -- so it's always nonnegative. + int LocalAreaOffset = TFI.getOffsetOfLocalArea(); + if (StackGrowsDown) + LocalAreaOffset = -LocalAreaOffset; + assert(LocalAreaOffset >= 0 + && "Local area offset should be in direction of stack growth"); + int64_t Offset = LocalAreaOffset; + + // Skew to be applied to alignment. + unsigned Skew = TFI.getStackAlignmentSkew(Fn); + + // If there are fixed sized objects that are preallocated in the local area, + // non-fixed objects can't be allocated right at the start of local area. + // We currently don't support filling in holes in between fixed sized + // objects, so we adjust 'Offset' to point to the end of last fixed sized + // preallocated object. + for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) { + int64_t FixedOff; + if (StackGrowsDown) { + // The maximum distance from the stack pointer is at lower address of + // the object -- which is given by offset. For down growing stack + // the offset is negative, so we negate the offset to get the distance. + FixedOff = -MFI->getObjectOffset(i); + } else { + // The maximum distance from the start pointer is at the upper + // address of the object. + FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i); + } + if (FixedOff > Offset) Offset = FixedOff; + } + + // First assign frame offsets to stack objects that are used to spill + // callee saved registers. + if (StackGrowsDown) { + for (unsigned i = MinCSFrameIndex; i <= MaxCSFrameIndex; ++i) { + // If the stack grows down, we need to add the size to find the lowest + // address of the object. + Offset += MFI->getObjectSize(i); + + unsigned Align = MFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = RoundUpToAlignment(Offset, Align, Skew); + + MFI->setObjectOffset(i, -Offset); // Set the computed offset + } + } else { + int MaxCSFI = MaxCSFrameIndex, MinCSFI = MinCSFrameIndex; + for (int i = MaxCSFI; i >= MinCSFI ; --i) { + unsigned Align = MFI->getObjectAlignment(i); + // Adjust to alignment boundary + Offset = RoundUpToAlignment(Offset, Align, Skew); + + MFI->setObjectOffset(i, Offset); + Offset += MFI->getObjectSize(i); + } + } + + unsigned MaxAlign = MFI->getMaxAlignment(); + + // Make sure the special register scavenging spill slot is closest to the + // incoming stack pointer if a frame pointer is required and is closer + // to the incoming rather than the final stack pointer. 
+ const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo(); + bool EarlyScavengingSlots = (TFI.hasFP(Fn) && + TFI.isFPCloseToIncomingSP() && + RegInfo->useFPForScavengingIndex(Fn) && + !RegInfo->needsStackRealignment(Fn)); + if (RS && EarlyScavengingSlots) { + SmallVector<int, 2> SFIs; + RS->getScavengingFrameIndices(SFIs); + for (SmallVectorImpl<int>::iterator I = SFIs.begin(), + IE = SFIs.end(); I != IE; ++I) + AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew); + } + + // FIXME: Once this is working, then enable flag will change to a target + // check for whether the frame is large enough to want to use virtual + // frame index registers. Functions which don't want/need this optimization + // will continue to use the existing code path. + if (MFI->getUseLocalStackAllocationBlock()) { + unsigned Align = MFI->getLocalFrameMaxAlign(); + + // Adjust to alignment boundary. + Offset = RoundUpToAlignment(Offset, Align, Skew); + + DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n"); + + // Resolve offsets for objects in the local block. + for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) { + std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i); + int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second; + DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" << + FIOffset << "]\n"); + MFI->setObjectOffset(Entry.first, FIOffset); + } + // Allocate the local block + Offset += MFI->getLocalFrameSize(); + + MaxAlign = std::max(Align, MaxAlign); + } + + // Make sure that the stack protector comes before the local variables on the + // stack. + SmallSet<int, 16> ProtectedObjs; + if (MFI->getStackProtectorIndex() >= 0) { + StackObjSet LargeArrayObjs; + StackObjSet SmallArrayObjs; + StackObjSet AddrOfObjs; + + AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown, + Offset, MaxAlign, Skew); + + // Assign large stack objects first. + for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { + if (MFI->isObjectPreAllocated(i) && + MFI->getUseLocalStackAllocationBlock()) + continue; + if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex) + continue; + if (RS && RS->isScavengingFrameIndex((int)i)) + continue; + if (MFI->isDeadObjectIndex(i)) + continue; + if (MFI->getStackProtectorIndex() == (int)i) + continue; + + switch (SP->getSSPLayout(MFI->getObjectAllocation(i))) { + case StackProtector::SSPLK_None: + continue; + case StackProtector::SSPLK_SmallArray: + SmallArrayObjs.insert(i); + continue; + case StackProtector::SSPLK_AddrOf: + AddrOfObjs.insert(i); + continue; + case StackProtector::SSPLK_LargeArray: + LargeArrayObjs.insert(i); + continue; + } + llvm_unreachable("Unexpected SSPLayoutKind."); + } + + AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown, + Offset, MaxAlign, Skew); + AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown, + Offset, MaxAlign, Skew); + AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown, + Offset, MaxAlign, Skew); + } + + // Then assign frame offsets to stack objects that are not used to spill + // callee saved registers. 
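The stack-protector handling above lays out the guard slot first and then assigns the protected objects bucket by bucket: large arrays, then small arrays, then address-taken locals, before everything else gets an offset. A toy sketch of just that ordering decision (real offsets and the dead/pre-allocated/scavenging filters are left out):

// Toy version of the protector-driven layout order: guard first, then the
// SSP buckets in the order AssignProtectedObjSet is called above.
#include <cassert>
#include <vector>

enum class SSPKind { None, LargeArray, SmallArray, AddrOf };

struct Obj { int Index; SSPKind Kind; };

static std::vector<int> layoutOrder(int GuardIndex, const std::vector<Obj> &Objs) {
  std::vector<int> Order = {GuardIndex};
  for (SSPKind K : {SSPKind::LargeArray, SSPKind::SmallArray, SSPKind::AddrOf})
    for (const Obj &O : Objs)
      if (O.Kind == K)
        Order.push_back(O.Index);
  for (const Obj &O : Objs)             // unprotected objects come last
    if (O.Kind == SSPKind::None)
      Order.push_back(O.Index);
  return Order;
}

int main() {
  std::vector<Obj> Objs = {{1, SSPKind::None},
                           {2, SSPKind::SmallArray},
                           {3, SSPKind::LargeArray}};
  std::vector<int> Order = layoutOrder(/*GuardIndex=*/0, Objs);
  assert((Order == std::vector<int>{0, 3, 2, 1}));
  return 0;
}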
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { + if (MFI->isObjectPreAllocated(i) && + MFI->getUseLocalStackAllocationBlock()) + continue; + if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex) + continue; + if (RS && RS->isScavengingFrameIndex((int)i)) + continue; + if (MFI->isDeadObjectIndex(i)) + continue; + if (MFI->getStackProtectorIndex() == (int)i) + continue; + if (ProtectedObjs.count(i)) + continue; + + AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew); + } + + // Make sure the special register scavenging spill slot is closest to the + // stack pointer. + if (RS && !EarlyScavengingSlots) { + SmallVector<int, 2> SFIs; + RS->getScavengingFrameIndices(SFIs); + for (SmallVectorImpl<int>::iterator I = SFIs.begin(), + IE = SFIs.end(); I != IE; ++I) + AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew); + } + + if (!TFI.targetHandlesStackFrameRounding()) { + // If we have reserved argument space for call sites in the function + // immediately on entry to the current function, count it as part of the + // overall stack size. + if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn)) + Offset += MFI->getMaxCallFrameSize(); + + // Round up the size to a multiple of the alignment. If the function has + // any calls or alloca's, align to the target's StackAlignment value to + // ensure that the callee's frame or the alloca data is suitably aligned; + // otherwise, for leaf functions, align to the TransientStackAlignment + // value. + unsigned StackAlign; + if (MFI->adjustsStack() || MFI->hasVarSizedObjects() || + (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0)) + StackAlign = TFI.getStackAlignment(); + else + StackAlign = TFI.getTransientStackAlignment(); + + // If the frame pointer is eliminated, all frame offsets will be relative to + // SP not FP. Align to MaxAlign so this works. + StackAlign = std::max(StackAlign, MaxAlign); + Offset = RoundUpToAlignment(Offset, StackAlign, Skew); + } + + // Update frame info to pretend that this is part of the stack... + int64_t StackSize = Offset - LocalAreaOffset; + MFI->setStackSize(StackSize); + NumBytesStackSpace += StackSize; +} + +/// insertPrologEpilogCode - Scan the function for modified callee saved +/// registers, insert spill code for these callee saved registers, then add +/// prolog and epilog code to the function. +/// +void WasmPEI::insertPrologEpilogCode(MachineFunction &Fn) { + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); + + // Add prologue to the function... + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.emitPrologue(Fn, *SaveBlock); + + // Add epilogue to restore the callee-save registers in each exiting block. + for (MachineBasicBlock *RestoreBlock : RestoreBlocks) + TFI.emitEpilogue(Fn, *RestoreBlock); + + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.inlineStackProbe(Fn, *SaveBlock); + + // Emit additional code that is required to support segmented stacks, if + // we've been asked for it. This, when linked with a runtime with support + // for segmented stacks (libgcc is one), will result in allocating stack + // space in small chunks instead of one large contiguous block. + if (Fn.shouldSplitStack()) { + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.adjustForSegmentedStacks(Fn, *SaveBlock); + } + + // Emit additional code that is required to explicitly handle the stack in + // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. 
The + // approach is rather similar to that of Segmented Stacks, but it uses a + // different conditional check and another BIF for allocating more stack + // space. + if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE) + for (MachineBasicBlock *SaveBlock : SaveBlocks) + TFI.adjustForHiPEPrologue(Fn, *SaveBlock); +} + +/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical +/// register references and actual offsets. +/// +void WasmPEI::replaceFrameIndices(MachineFunction &Fn) { + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); + if (!TFI.needsFrameIndexResolution(Fn)) return; + + // Store SPAdj at exit of a basic block. + SmallVector<int, 8> SPState; + SPState.resize(Fn.getNumBlockIDs()); + SmallPtrSet<MachineBasicBlock*, 8> Reachable; + + // Iterate over the reachable blocks in DFS order. + for (auto DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable); + DFI != DFE; ++DFI) { + int SPAdj = 0; + // Check the exit state of the DFS stack predecessor. + if (DFI.getPathLength() >= 2) { + MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2); + assert(Reachable.count(StackPred) && + "DFS stack predecessor is already visited.\n"); + SPAdj = SPState[StackPred->getNumber()]; + } + MachineBasicBlock *BB = *DFI; + replaceFrameIndices(BB, Fn, SPAdj); + SPState[BB->getNumber()] = SPAdj; + } + + // Handle the unreachable blocks. + for (auto &BB : Fn) { + if (Reachable.count(&BB)) + // Already handled in DFS traversal. + continue; + int SPAdj = 0; + replaceFrameIndices(&BB, Fn, SPAdj); + } +} + +void WasmPEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, + int &SPAdj) { + assert(Fn.getSubtarget().getRegisterInfo() && + "getRegisterInfo() must be implemented!"); + const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); + const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo(); + const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); + unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode(); + unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); + + if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB); + + bool InsideCallSequence = false; + + for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) { + + if (I->getOpcode() == FrameSetupOpcode || + I->getOpcode() == FrameDestroyOpcode) { + InsideCallSequence = (I->getOpcode() == FrameSetupOpcode); + SPAdj += TII.getSPAdjust(I); + + MachineBasicBlock::iterator PrevI = BB->end(); + if (I != BB->begin()) PrevI = std::prev(I); + TFI->eliminateCallFramePseudoInstr(Fn, *BB, I); + + // Visit the instructions created by eliminateCallFramePseudoInstr(). + if (PrevI == BB->end()) + I = BB->begin(); // The replaced instr was the first in the block. + else + I = std::next(PrevI); + continue; + } + + MachineInstr *MI = I; + bool DoIncr = true; + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (!MI->getOperand(i).isFI()) + continue; + + // Frame indices in debug values are encoded in a target independent + // way with simply the frame index and offset rather than any + // target-specific addressing mode. 
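replaceFrameIndices above visits blocks in DFS order so that each block inherits the SP adjustment its DFS-stack predecessor ended with, and any unreachable block is processed afterwards starting from zero. The propagation by itself, over a toy CFG in which each block contributes a net SP delta:

// Toy model of the SPAdj propagation: a DFS carries the running SP adjustment
// into each successor; unreachable blocks default to 0.
#include <cassert>
#include <utility>
#include <vector>

struct ToyBlock {
  std::vector<int> Succs; // successor block ids
  int LocalDelta;         // net SP adjustment made inside this block
};

static std::vector<int> computeEntrySPAdj(const std::vector<ToyBlock> &Blocks) {
  std::vector<int> EntryAdj(Blocks.size(), 0);
  std::vector<bool> Visited(Blocks.size(), false);
  // Iterative DFS from block 0; each edge carries the adjustment that is live
  // when control reaches the successor.
  std::vector<std::pair<int, int>> Stack(1, std::make_pair(0, 0));
  while (!Stack.empty()) {
    int BB = Stack.back().first;
    int Adj = Stack.back().second;
    Stack.pop_back();
    if (Visited[BB])
      continue;
    Visited[BB] = true;
    EntryAdj[BB] = Adj;
    int ExitAdj = Adj + Blocks[BB].LocalDelta; // state handed to successors
    for (int Succ : Blocks[BB].Succs)
      Stack.push_back(std::make_pair(Succ, ExitAdj));
  }
  return EntryAdj; // unreachable blocks keep the default of 0
}

int main() {
  // Block 0 sets up a 16-byte call frame that block 1 tears down; block 2 is
  // unreachable and therefore starts with no adjustment.
  std::vector<ToyBlock> Blocks = {{{1}, 16}, {{}, -16}, {{}, 0}};
  std::vector<int> Adj = computeEntrySPAdj(Blocks);
  assert(Adj[0] == 0 && Adj[1] == 16 && Adj[2] == 0);
  return 0;
}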
+ if (MI->isDebugValue()) { + assert(i == 0 && "Frame indices can only appear as the first " + "operand of a DBG_VALUE machine instruction"); + unsigned Reg; + MachineOperand &Offset = MI->getOperand(1); + Offset.setImm(Offset.getImm() + + TFI->getFrameIndexReference( + Fn, MI->getOperand(0).getIndex(), Reg)); + MI->getOperand(0).ChangeToRegister(Reg, false /*isDef*/); + continue; + } + + // TODO: This code should be commoned with the code for + // PATCHPOINT. There's no good reason for the difference in + // implementation other than historical accident. The only + // remaining difference is the unconditional use of the stack + // pointer as the base register. + if (MI->getOpcode() == TargetOpcode::STATEPOINT) { + assert((!MI->isDebugValue() || i == 0) && + "Frame indicies can only appear as the first operand of a " + "DBG_VALUE machine instruction"); + unsigned Reg; + MachineOperand &Offset = MI->getOperand(i + 1); + const unsigned refOffset = + TFI->getFrameIndexReferenceFromSP(Fn, MI->getOperand(i).getIndex(), + Reg); + + Offset.setImm(Offset.getImm() + refOffset); + MI->getOperand(i).ChangeToRegister(Reg, false /*isDef*/); + continue; + } + + // Some instructions (e.g. inline asm instructions) can have + // multiple frame indices and/or cause eliminateFrameIndex + // to insert more than one instruction. We need the register + // scavenger to go through all of these instructions so that + // it can update its register information. We keep the + // iterator at the point before insertion so that we can + // revisit them in full. + bool AtBeginning = (I == BB->begin()); + if (!AtBeginning) --I; + + // If this instruction has a FrameIndex operand, we need to + // use that target machine register info object to eliminate + // it. + TRI.eliminateFrameIndex(MI, SPAdj, i, + FrameIndexVirtualScavenging ? nullptr : RS); + + // Reset the iterator if we were at the beginning of the BB. + if (AtBeginning) { + I = BB->begin(); + DoIncr = false; + } + + MI = nullptr; + break; + } + + // If we are looking at a call sequence, we need to keep track of + // the SP adjustment made by each instruction in the sequence. + // This includes both the frame setup/destroy pseudos (handled above), + // as well as other instructions that have side effects w.r.t the SP. + // Note that this must come after eliminateFrameIndex, because + // if I itself referred to a frame index, we shouldn't count its own + // adjustment. + if (MI && InsideCallSequence) + SPAdj += TII.getSPAdjust(MI); + + if (DoIncr && I != BB->end()) ++I; + + // Update register states. + if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI); + } +} + +/// scavengeFrameVirtualRegs - Replace all frame index virtual registers +/// with physical registers. Use the register scavenger to find an +/// appropriate register to use. +/// +/// FIXME: Iterating over the instruction stream is unnecessary. We can simply +/// iterate over the vreg use list, which at this point only contains machine +/// operands for which eliminateFrameIndex need a new scratch reg. +void +WasmPEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { + // Run through the instructions and find any virtual registers. + for (MachineFunction::iterator BB = Fn.begin(), + E = Fn.end(); BB != E; ++BB) { + RS->enterBasicBlock(&*BB); + + int SPAdj = 0; + + // The instruction stream may change in the loop, so check BB->end() + // directly. 
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) { + // We might end up here again with a NULL iterator if we scavenged a + // register for which we inserted spill code for definition by what was + // originally the first instruction in BB. + if (I == MachineBasicBlock::iterator(nullptr)) + I = BB->begin(); + + MachineInstr *MI = I; + MachineBasicBlock::iterator J = std::next(I); + MachineBasicBlock::iterator P = + I == BB->begin() ? MachineBasicBlock::iterator(nullptr) + : std::prev(I); + + // RS should process this instruction before we might scavenge at this + // location. This is because we might be replacing a virtual register + // defined by this instruction, and if so, registers killed by this + // instruction are available, and defined registers are not. + RS->forward(I); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (MI->getOperand(i).isReg()) { + MachineOperand &MO = MI->getOperand(i); + unsigned Reg = MO.getReg(); + if (Reg == 0) + continue; + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + + // When we first encounter a new virtual register, it + // must be a definition. + assert(MI->getOperand(i).isDef() && + "frame index virtual missing def!"); + // Scavenge a new scratch register + const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg); + unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj); + + ++NumScavengedRegs; + + // Replace this reference to the virtual register with the + // scratch register. + assert (ScratchReg && "Missing scratch register!"); + Fn.getRegInfo().replaceRegWith(Reg, ScratchReg); + + // Because this instruction was processed by the RS before this + // register was allocated, make sure that the RS now records the + // register as being used. + RS->setRegUsed(ScratchReg); + } + } + + // If the scavenger needed to use one of its spill slots, the + // spill code will have been inserted in between I and J. This is a + // problem because we need the spill code before I: Move I to just + // prior to J. + if (I != std::prev(J)) { + BB->splice(J, &*BB, I); + + // Before we move I, we need to prepare the RS to visit I again. + // Specifically, RS will assert if it sees uses of registers that + // it believes are undefined. Because we have already processed + // register kills in I, when it visits I again, it will believe that + // those registers are undefined. To avoid this situation, unprocess + // the instruction I. + assert(RS->getCurrentPosition() == I && + "The register scavenger has an unexpected position"); + I = P; + RS->unprocess(P); + } else + ++I; + } + } +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp new file mode 100644 index 0000000..4ad6eed --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp @@ -0,0 +1,86 @@ +//===-- WebAssemblyPeephole.cpp - WebAssembly Peephole Optimiztions -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief Late peephole optimizations for WebAssembly. 
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-peephole" + +namespace { +class WebAssemblyPeephole final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly late peephole optimizer"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; + WebAssemblyPeephole() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyPeephole::ID = 0; +FunctionPass *llvm::createWebAssemblyPeephole() { + return new WebAssemblyPeephole(); +} + +bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + + MachineRegisterInfo &MRI = MF.getRegInfo(); + WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); + + for (auto &MBB : MF) + for (auto &MI : MBB) + switch (MI.getOpcode()) { + default: + break; + case WebAssembly::STORE8_I32: + case WebAssembly::STORE16_I32: + case WebAssembly::STORE8_I64: + case WebAssembly::STORE16_I64: + case WebAssembly::STORE32_I64: + case WebAssembly::STORE_F32: + case WebAssembly::STORE_F64: + case WebAssembly::STORE_I32: + case WebAssembly::STORE_I64: { + // Store instructions return their value operand. If we ended up using + // the same register for both, replace it with a dead def so that it + // can use $discard instead. + MachineOperand &MO = MI.getOperand(0); + unsigned OldReg = MO.getReg(); + // TODO: Handle SP/physregs + if (OldReg == MI.getOperand(3).getReg() + && TargetRegisterInfo::isVirtualRegister(MI.getOperand(3).getReg())) { + Changed = true; + unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg)); + MO.setReg(NewReg); + MO.setIsDead(); + MFI.stackifyVReg(NewReg); + MFI.addWAReg(NewReg, WebAssemblyFunctionInfo::UnusedReg); + } + } + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp new file mode 100644 index 0000000..9ec6659 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp @@ -0,0 +1,175 @@ +//===-- WebAssemblyRegColoring.cpp - Register coloring --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a virtual register coloring pass. +/// +/// WebAssembly doesn't have a fixed number of registers, but it is still +/// desirable to minimize the total number of registers used in each function. +/// +/// This code is modeled after lib/CodeGen/StackSlotColoring.cpp. 
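The store peephole above gives a store's result operand a fresh, dead register whenever it aliases the value operand, so the write-back can later be emitted as $discard. Reduced to its essentials, with a counter standing in for virtual-register creation:

// Toy version of the store-result rewrite in WebAssemblyPeephole.
#include <cassert>

struct ToyStore {
  unsigned DefReg;      // register written with the store's result
  unsigned ValueReg;    // register holding the stored value
  bool DefIsDead;
};

struct VRegAllocator {
  unsigned Next;
  explicit VRegAllocator(unsigned First) : Next(First) {}
  unsigned create() { return Next++; }
};

// Returns true if the store was rewritten.
static bool discardStoreResult(ToyStore &S, VRegAllocator &VRA) {
  if (S.DefReg != S.ValueReg)
    return false;                  // result already has its own register
  S.DefReg = VRA.create();         // fresh register, used nowhere else
  S.DefIsDead = true;              // so the result can simply be discarded
  return true;
}

int main() {
  VRegAllocator VRA(/*First=*/100);
  ToyStore S = {/*DefReg=*/5, /*ValueReg=*/5, /*DefIsDead=*/false};
  assert(discardStoreResult(S, VRA) && S.DefReg == 100 && S.DefIsDead);
  return 0;
}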
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-reg-coloring" + +namespace { +class WebAssemblyRegColoring final : public MachineFunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyRegColoring() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { + return "WebAssembly Register Coloring"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<LiveIntervals>(); + AU.addRequired<MachineBlockFrequencyInfo>(); + AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addPreservedID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: +}; +} // end anonymous namespace + +char WebAssemblyRegColoring::ID = 0; +FunctionPass *llvm::createWebAssemblyRegColoring() { + return new WebAssemblyRegColoring(); +} + +// Compute the total spill weight for VReg. +static float computeWeight(const MachineRegisterInfo *MRI, + const MachineBlockFrequencyInfo *MBFI, + unsigned VReg) { + float weight = 0.0f; + for (MachineOperand &MO : MRI->reg_nodbg_operands(VReg)) + weight += LiveIntervals::getSpillWeight(MO.isDef(), MO.isUse(), MBFI, + MO.getParent()); + return weight; +} + +bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) { + DEBUG({ + dbgs() << "********** Register Coloring **********\n" + << "********** Function: " << MF.getName() << '\n'; + }); + + // If there are calls to setjmp or sigsetjmp, don't perform coloring. Virtual + // registers could be modified before the longjmp is executed, resulting in + // the wrong value being used afterwards. (See <rdar://problem/8007500>.) + // TODO: Does WebAssembly need to care about setjmp for register coloring? + if (MF.exposesReturnsTwice()) + return false; + + MachineRegisterInfo *MRI = &MF.getRegInfo(); + LiveIntervals *Liveness = &getAnalysis<LiveIntervals>(); + const MachineBlockFrequencyInfo *MBFI = + &getAnalysis<MachineBlockFrequencyInfo>(); + WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); + + // Gather all register intervals into a list and sort them. + unsigned NumVRegs = MRI->getNumVirtRegs(); + SmallVector<LiveInterval *, 0> SortedIntervals; + SortedIntervals.reserve(NumVRegs); + + DEBUG(dbgs() << "Interesting register intervals:\n"); + for (unsigned i = 0; i < NumVRegs; ++i) { + unsigned VReg = TargetRegisterInfo::index2VirtReg(i); + if (MFI.isVRegStackified(VReg)) + continue; + // Skip unused registers, which can use $discard. + if (MRI->use_empty(VReg)) + continue; + + LiveInterval *LI = &Liveness->getInterval(VReg); + assert(LI->weight == 0.0f); + LI->weight = computeWeight(MRI, MBFI, VReg); + DEBUG(LI->dump()); + SortedIntervals.push_back(LI); + } + DEBUG(dbgs() << '\n'); + + // Sort them to put arguments first (since we don't want to rename live-in + // registers), by weight next, and then by position. + // TODO: Investigate more intelligent sorting heuristics. For starters, we + // should try to coalesce adjacent live intervals before non-adjacent ones. 
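Once sorted, the assignment loop that follows reuses an existing "color" (representative register) whenever the register classes match and none of the intervals already assigned to that color overlap the new one. Stripped of LiveIntervals, weights, and register classes, the same greedy scheme over plain half-open ranges looks like this:

// Greedy interval coloring over toy [Start, End) live ranges: reuse the first
// color whose intervals don't overlap the new one, otherwise open a new color.
#include <cassert>
#include <vector>

struct Interval { int Start, End; };           // half-open [Start, End)

static bool overlaps(const Interval &A, const Interval &B) {
  return A.Start < B.End && B.Start < A.End;
}

// Returns, for each interval, the color (register) it was assigned.
static std::vector<int> colorIntervals(const std::vector<Interval> &Ivs) {
  std::vector<int> Color(Ivs.size());
  std::vector<std::vector<int>> Assigned;      // color -> interval indices
  for (size_t i = 0; i < Ivs.size(); ++i) {
    int Chosen = -1;
    for (size_t c = 0; c < Assigned.size() && Chosen < 0; ++c) {
      bool Clash = false;
      for (int Other : Assigned[c])
        Clash |= overlaps(Ivs[i], Ivs[Other]);
      if (!Clash)
        Chosen = static_cast<int>(c);          // reuse this color
    }
    if (Chosen < 0) {
      Chosen = static_cast<int>(Assigned.size());
      Assigned.emplace_back();                 // open a fresh color
    }
    Assigned[Chosen].push_back(static_cast<int>(i));
    Color[i] = Chosen;
  }
  return Color;
}

int main() {
  // [0,4) and [4,8) can share a register; [2,6) overlaps both and cannot.
  std::vector<int> C = colorIntervals({{0, 4}, {2, 6}, {4, 8}});
  assert(C[0] == 0 && C[1] == 1 && C[2] == 0);
  return 0;
}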
+ std::sort(SortedIntervals.begin(), SortedIntervals.end(), + [MRI](LiveInterval *LHS, LiveInterval *RHS) { + if (MRI->isLiveIn(LHS->reg) != MRI->isLiveIn(RHS->reg)) + return MRI->isLiveIn(LHS->reg); + if (LHS->weight != RHS->weight) + return LHS->weight > RHS->weight; + if (LHS->empty() || RHS->empty()) + return !LHS->empty() && RHS->empty(); + return *LHS < *RHS; + }); + + DEBUG(dbgs() << "Coloring register intervals:\n"); + SmallVector<unsigned, 16> SlotMapping(SortedIntervals.size(), -1u); + SmallVector<SmallVector<LiveInterval *, 4>, 16> Assignments( + SortedIntervals.size()); + BitVector UsedColors(SortedIntervals.size()); + bool Changed = false; + for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) { + LiveInterval *LI = SortedIntervals[i]; + unsigned Old = LI->reg; + size_t Color = i; + const TargetRegisterClass *RC = MRI->getRegClass(Old); + + // Check if it's possible to reuse any of the used colors. + if (!MRI->isLiveIn(Old)) + for (int C(UsedColors.find_first()); C != -1; + C = UsedColors.find_next(C)) { + if (MRI->getRegClass(SortedIntervals[C]->reg) != RC) + continue; + for (LiveInterval *OtherLI : Assignments[C]) + if (!OtherLI->empty() && OtherLI->overlaps(*LI)) + goto continue_outer; + Color = C; + break; + continue_outer:; + } + + unsigned New = SortedIntervals[Color]->reg; + SlotMapping[i] = New; + Changed |= Old != New; + UsedColors.set(Color); + Assignments[Color].push_back(LI); + DEBUG(dbgs() << "Assigning vreg" + << TargetRegisterInfo::virtReg2Index(LI->reg) << " to vreg" + << TargetRegisterInfo::virtReg2Index(New) << "\n"); + } + if (!Changed) + return false; + + // Rewrite register operands. + for (size_t i = 0, e = SortedIntervals.size(); i < e; ++i) { + unsigned Old = SortedIntervals[i]->reg; + unsigned New = SlotMapping[i]; + if (Old != New) + MRI->replaceRegWith(Old, New); + } + return true; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp new file mode 100644 index 0000000..f621db0 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp @@ -0,0 +1,109 @@ +//===-- WebAssemblyRegNumbering.cpp - Register Numbering ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a pass which assigns WebAssembly register +/// numbers for CodeGen virtual registers. 
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblySubtarget.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-reg-numbering" + +namespace { +class WebAssemblyRegNumbering final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly Register Numbering"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyRegNumbering() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyRegNumbering::ID = 0; +FunctionPass *llvm::createWebAssemblyRegNumbering() { + return new WebAssemblyRegNumbering(); +} + +bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** Register Numbering **********\n" + "********** Function: " + << MF.getName() << '\n'); + + WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const MachineFrameInfo &FrameInfo = *MF.getFrameInfo(); + + MFI.initWARegs(); + + // WebAssembly argument registers are in the same index space as local + // variables. Assign the numbers for them first. + MachineBasicBlock &EntryMBB = MF.front(); + for (MachineInstr &MI : EntryMBB) { + switch (MI.getOpcode()) { + case WebAssembly::ARGUMENT_I32: + case WebAssembly::ARGUMENT_I64: + case WebAssembly::ARGUMENT_F32: + case WebAssembly::ARGUMENT_F64: + MFI.setWAReg(MI.getOperand(0).getReg(), MI.getOperand(1).getImm()); + break; + default: + break; + } + } + + // Then assign regular WebAssembly registers for all remaining used + // virtual registers. TODO: Consider sorting the registers by frequency of + // use, to maximize usage of small immediate fields. + unsigned NumArgRegs = MFI.getParams().size(); + unsigned NumVRegs = MF.getRegInfo().getNumVirtRegs(); + unsigned NumStackRegs = 0; + unsigned CurReg = 0; + for (unsigned VRegIdx = 0; VRegIdx < NumVRegs; ++VRegIdx) { + unsigned VReg = TargetRegisterInfo::index2VirtReg(VRegIdx); + // Handle stackified registers. + if (MFI.isVRegStackified(VReg)) { + MFI.setWAReg(VReg, INT32_MIN | NumStackRegs++); + continue; + } + // Skip unused registers. 
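The numbering loop above (continuing just below) gives stackified values a tagged number of their own, skips registers that are never used, and hands every remaining register the next local index after the arguments. The scheme in isolation, with a plain flag record per virtual register:

// Sketch of the register-numbering scheme: argument registers keep their
// argument index, stackified values get a separately tagged numbering, and
// all other used registers become locals after the arguments.
#include <cassert>
#include <vector>

static const unsigned UnusedReg = -1u;
static const unsigned StackTag = 0x80000000u;   // stand-in for the INT32_MIN tag

struct VRegInfo {
  bool IsStackified = false;
  bool IsUsed = true;
  int ArgIndex = -1;                            // >= 0 if defined by an ARGUMENT_*
};

static std::vector<unsigned> numberRegs(const std::vector<VRegInfo> &VRegs,
                                        unsigned NumArgRegs) {
  std::vector<unsigned> WAReg(VRegs.size(), UnusedReg);
  unsigned NumStackRegs = 0, CurReg = 0;
  for (size_t i = 0; i < VRegs.size(); ++i) {
    if (VRegs[i].ArgIndex >= 0)
      WAReg[i] = static_cast<unsigned>(VRegs[i].ArgIndex); // same space as locals
    else if (VRegs[i].IsStackified)
      WAReg[i] = StackTag | NumStackRegs++;     // never becomes a real local
    else if (VRegs[i].IsUsed)
      WAReg[i] = NumArgRegs + CurReg++;         // locals start after the arguments
  }
  return WAReg;
}

int main() {
  std::vector<VRegInfo> VRegs(4);
  VRegs[0].ArgIndex = 0;                        // the function's only argument
  VRegs[1].IsStackified = true;                 // lives on the expression stack
  VRegs[3].IsUsed = false;                      // never referenced: stays unused
  std::vector<unsigned> WA = numberRegs(VRegs, /*NumArgRegs=*/1);
  assert(WA[0] == 0 && WA[1] == (StackTag | 0u) && WA[2] == 1 && WA[3] == UnusedReg);
  return 0;
}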
+ if (MRI.use_empty(VReg)) + continue; + if (MFI.getWAReg(VReg) == WebAssemblyFunctionInfo::UnusedReg) + MFI.setWAReg(VReg, NumArgRegs + CurReg++); + } + // Allocate locals for used physical registers + if (FrameInfo.getStackSize() > 0) + MFI.addPReg(WebAssembly::SP32, CurReg++); + + return true; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp new file mode 100644 index 0000000..89ef5cd --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -0,0 +1,265 @@ +//===-- WebAssemblyRegStackify.cpp - Register Stackification --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a register stacking pass. +/// +/// This pass reorders instructions to put register uses and defs in an order +/// such that they form single-use expression trees. Registers fitting this form +/// are then marked as "stackified", meaning references to them are replaced by +/// "push" and "pop" from the stack. +/// +/// This is primarily a code size optimization, since temporary values on the +/// expression don't need to be named. +/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_* +#include "WebAssemblyMachineFunctionInfo.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-reg-stackify" + +namespace { +class WebAssemblyRegStackify final : public MachineFunctionPass { + const char *getPassName() const override { + return "WebAssembly Register Stackify"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<LiveIntervals>(); + AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addPreserved<SlotIndexes>(); + AU.addPreserved<LiveIntervals>(); + AU.addPreservedID(MachineDominatorsID); + AU.addPreservedID(LiveVariablesID); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyRegStackify() : MachineFunctionPass(ID) {} +}; +} // end anonymous namespace + +char WebAssemblyRegStackify::ID = 0; +FunctionPass *llvm::createWebAssemblyRegStackify() { + return new WebAssemblyRegStackify(); +} + +// Decorate the given instruction with implicit operands that enforce the +// expression stack ordering constraints for an instruction which is on +// the expression stack. +static void ImposeStackOrdering(MachineInstr *MI) { + // Write the opaque EXPR_STACK register. + if (!MI->definesRegister(WebAssembly::EXPR_STACK)) + MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK, + /*isDef=*/true, + /*isImp=*/true)); + + // Also read the opaque EXPR_STACK register. 
+ if (!MI->readsRegister(WebAssembly::EXPR_STACK)) + MI->addOperand(MachineOperand::CreateReg(WebAssembly::EXPR_STACK, + /*isDef=*/false, + /*isImp=*/true)); +} + +// Test whether it's safe to move Def to just before Insert. +// TODO: Compute memory dependencies in a way that doesn't require always +// walking the block. +// TODO: Compute memory dependencies in a way that uses AliasAnalysis to be +// more precise. +static bool IsSafeToMove(const MachineInstr *Def, const MachineInstr *Insert, + AliasAnalysis &AA, LiveIntervals &LIS, + MachineRegisterInfo &MRI) { + assert(Def->getParent() == Insert->getParent()); + bool SawStore = false, SawSideEffects = false; + MachineBasicBlock::const_iterator D(Def), I(Insert); + + // Check for register dependencies. + for (const MachineOperand &MO : Def->operands()) { + if (!MO.isReg() || MO.isUndef()) + continue; + unsigned Reg = MO.getReg(); + + // If the register is dead here and at Insert, ignore it. + if (MO.isDead() && Insert->definesRegister(Reg) && + !Insert->readsRegister(Reg)) + continue; + + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + // If the physical register is never modified, ignore it. + if (!MRI.isPhysRegModified(Reg)) + continue; + // Otherwise, it's a physical register with unknown liveness. + return false; + } + + // Ask LiveIntervals whether moving this virtual register use or def to + // Insert will change value numbers are seen. + const LiveInterval &LI = LIS.getInterval(Reg); + VNInfo *DefVNI = MO.isDef() ? + LI.getVNInfoAt(LIS.getInstructionIndex(Def).getRegSlot()) : + LI.getVNInfoBefore(LIS.getInstructionIndex(Def)); + assert(DefVNI && "Instruction input missing value number"); + VNInfo *InsVNI = LI.getVNInfoBefore(LIS.getInstructionIndex(Insert)); + if (InsVNI && DefVNI != InsVNI) + return false; + } + + // Check for memory dependencies and side effects. + for (--I; I != D; --I) + SawSideEffects |= I->isSafeToMove(&AA, SawStore); + return !(SawStore && Def->mayLoad() && !Def->isInvariantLoad(&AA)) && + !(SawSideEffects && !Def->isSafeToMove(&AA, SawStore)); +} + +bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** Register Stackifying **********\n" + "********** Function: " + << MF.getName() << '\n'); + + bool Changed = false; + MachineRegisterInfo &MRI = MF.getRegInfo(); + WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>(); + AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults(); + LiveIntervals &LIS = getAnalysis<LiveIntervals>(); + + // Walk the instructions from the bottom up. Currently we don't look past + // block boundaries, and the blocks aren't ordered so the block visitation + // order isn't significant, but we may want to change this in the future. + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : reverse(MBB)) { + MachineInstr *Insert = &MI; + // Don't nest anything inside a phi. + if (Insert->getOpcode() == TargetOpcode::PHI) + break; + + // Don't nest anything inside an inline asm, because we don't have + // constraints for $push inputs. + if (Insert->getOpcode() == TargetOpcode::INLINEASM) + break; + + // Iterate through the inputs in reverse order, since we'll be pulling + // operands off the stack in LIFO order. + bool AnyStackified = false; + for (MachineOperand &Op : reverse(Insert->uses())) { + // We're only interested in explicit virtual register operands. 
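The memory half of IsSafeToMove above boils down to a backwards walk between the def and the insertion point. A simplified sketch of that final check, with toy types rather than MachineInstr, tracking only "may load", "may store" and "has side effects" bits and ignoring the invariant-load and LiveIntervals parts:

#include <iostream>
#include <vector>

struct Inst {
  bool MayLoad = false;
  bool MayStore = false;
  bool SideEffects = false;  // anything not freely reorderable
};

// Decide whether Block[Def] can be sunk to just before Block[Insert]
// (Def precedes Insert in the same block), mirroring the store/side-effect
// half of IsSafeToMove.
bool safeToSink(const std::vector<Inst> &Block, size_t Def, size_t Insert) {
  bool SawStore = false, SawSideEffects = false;
  for (size_t I = Def + 1; I < Insert; ++I) {
    SawStore |= Block[I].MayStore;
    SawSideEffects |= Block[I].SideEffects;
  }
  // A load must not be moved across a store; a side-effecting instruction
  // must not be moved across other side-effecting instructions.
  if (SawStore && Block[Def].MayLoad)
    return false;
  if (SawSideEffects && Block[Def].SideEffects)
    return false;
  return true;
}

int main() {
  std::vector<Inst> B(4);
  B[0].MayLoad = true;   // Def: a load
  B[2].MayStore = true;  // an intervening store
  std::cout << std::boolalpha << safeToSink(B, 0, 3) << '\n';  // false
}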
+ if (!Op.isReg() || Op.isImplicit() || !Op.isUse()) + continue; + + unsigned Reg = Op.getReg(); + + // Only consider registers with a single definition. + // TODO: Eventually we may relax this, to stackify phi transfers. + MachineInstr *Def = MRI.getUniqueVRegDef(Reg); + if (!Def) + continue; + + // There's no use in nesting implicit defs inside anything. + if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF) + continue; + + // Don't nest an INLINE_ASM def into anything, because we don't have + // constraints for $pop outputs. + if (Def->getOpcode() == TargetOpcode::INLINEASM) + continue; + + // Don't nest PHIs inside of anything. + if (Def->getOpcode() == TargetOpcode::PHI) + continue; + + // Argument instructions represent live-in registers and not real + // instructions. + if (Def->getOpcode() == WebAssembly::ARGUMENT_I32 || + Def->getOpcode() == WebAssembly::ARGUMENT_I64 || + Def->getOpcode() == WebAssembly::ARGUMENT_F32 || + Def->getOpcode() == WebAssembly::ARGUMENT_F64) + continue; + + // Single-use expression trees require defs that have one use. + // TODO: Eventually we'll relax this, to take advantage of set_local + // returning its result. + if (!MRI.hasOneUse(Reg)) + continue; + + // For now, be conservative and don't look across block boundaries. + // TODO: Be more aggressive? + if (Def->getParent() != &MBB) + continue; + + // Don't move instructions that have side effects or memory dependencies + // or other complications. + if (!IsSafeToMove(Def, Insert, AA, LIS, MRI)) + continue; + + Changed = true; + AnyStackified = true; + // Move the def down and nest it in the current instruction. + MBB.splice(Insert, &MBB, Def); + LIS.handleMove(Def); + MFI.stackifyVReg(Reg); + ImposeStackOrdering(Def); + Insert = Def; + } + if (AnyStackified) + ImposeStackOrdering(&MI); + } + } + + // If we used EXPR_STACK anywhere, add it to the live-in sets everywhere + // so that it never looks like a use-before-def. + if (Changed) { + MF.getRegInfo().addLiveIn(WebAssembly::EXPR_STACK); + for (MachineBasicBlock &MBB : MF) + MBB.addLiveIn(WebAssembly::EXPR_STACK); + } + +#ifndef NDEBUG + // Verify that pushes and pops are performed in FIFO order. + SmallVector<unsigned, 0> Stack; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + for (MachineOperand &MO : reverse(MI.explicit_operands())) { + if (!MO.isReg()) + continue; + unsigned VReg = MO.getReg(); + + // Don't stackify physregs like SP or FP. + if (!TargetRegisterInfo::isVirtualRegister(VReg)) + continue; + + if (MFI.isVRegStackified(VReg)) { + if (MO.isDef()) + Stack.push_back(VReg); + else + assert(Stack.pop_back_val() == VReg); + } + } + } + // TODO: Generalize this code to support keeping values on the stack across + // basic block boundaries. 
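The NDEBUG verification above is essentially a bracket-matching check: each stackified def is a push, its single use is a pop, and the two must nest within the block. A self-contained sketch of the same invariant over a toy stream of def/use events, independent of MachineOperand:

#include <cassert>
#include <vector>

struct Event {
  unsigned Reg;
  bool IsDef;  // a def pushes the value, its use pops it
};

// Returns true if every stackified value is popped in stack order, i.e. the
// def/use events nest like matched brackets and nothing is left live.
bool stackWellFormed(const std::vector<Event> &Events) {
  std::vector<unsigned> Stack;
  for (const Event &E : Events) {
    if (E.IsDef) {
      Stack.push_back(E.Reg);
    } else {
      if (Stack.empty() || Stack.back() != E.Reg)
        return false;  // popped out of order
      Stack.pop_back();
    }
  }
  return Stack.empty();
}

int main() {
  // (def %1)(def %2)(use %2)(use %1) nests correctly;
  // swapping the two uses would not.
  assert(stackWellFormed({{1, true}, {2, true}, {2, false}, {1, false}}));
  assert(!stackWellFormed({{1, true}, {2, true}, {1, false}, {2, false}}));
}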
+ assert(Stack.empty()); + } +#endif + + return Changed; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp index 385c40b..dcada45 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp @@ -43,7 +43,7 @@ WebAssemblyRegisterInfo::getCalleeSavedRegs(const MachineFunction *) const { } BitVector -WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction &MF) const { +WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction & /*MF*/) const { BitVector Reserved(getNumRegs()); for (auto Reg : {WebAssembly::SP32, WebAssembly::SP64, WebAssembly::FP32, WebAssembly::FP64}) @@ -52,9 +52,37 @@ WebAssemblyRegisterInfo::getReservedRegs(const MachineFunction &MF) const { } void WebAssemblyRegisterInfo::eliminateFrameIndex( - MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, - RegScavenger *RS) const { - llvm_unreachable("WebAssemblyRegisterInfo::eliminateFrameIndex"); // FIXME + MachineBasicBlock::iterator II, int SPAdj, + unsigned FIOperandNum, RegScavenger * /*RS*/) const { + assert(SPAdj == 0); + MachineInstr &MI = *II; + + MachineBasicBlock &MBB = *MI.getParent(); + MachineFunction &MF = *MBB.getParent(); + int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); + const MachineFrameInfo& MFI = *MF.getFrameInfo(); + int FrameOffset = MFI.getStackSize() + MFI.getObjectOffset(FrameIndex); + + if (MI.mayLoadOrStore()) { + // If this is a load or store, make it relative to SP and fold the frame + // offset directly in + assert(MI.getOperand(1).getImm() == 0 && + "Can't eliminate FI yet if offset is already set"); + MI.getOperand(1).setImm(FrameOffset); + MI.getOperand(2).ChangeToRegister(WebAssembly::SP32, /*IsDef=*/false); + } else { + // Otherwise create an i32.add SP, offset and make it the operand + auto &MRI = MF.getRegInfo(); + const auto *TII = MF.getSubtarget().getInstrInfo(); + + unsigned OffsetReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(WebAssembly::CONST_I32), OffsetReg) + .addImm(FrameOffset); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(WebAssembly::ADD_I32), OffsetReg) + .addReg(WebAssembly::SP32) + .addReg(OffsetReg); + MI.getOperand(FIOperandNum).ChangeToRegister(OffsetReg, /*IsDef=*/false); + } } unsigned @@ -67,21 +95,11 @@ WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const { return Regs[TFI->hasFP(MF)][TT.isArch64Bit()]; } -bool WebAssemblyRegisterInfo::canRealignStack(const MachineFunction &MF) const { - return !MF.getFunction()->hasFnAttribute("no-realign-stack"); -} - -// FIXME: share this with other backends with identical implementation? 
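A small worked model of the address arithmetic in eliminateFrameIndex above: a frame object is addressed at SP plus (stack size + object offset), and that constant is either folded into a load/store offset field or materialized with an explicit CONST_I32/ADD_I32 pair. The object offsets below are made up, purely to show the arithmetic:

#include <cstdint>
#include <iostream>

// Toy frame layout: objects sit at (typically negative) offsets from the
// incoming stack pointer, and the whole frame is StackSize bytes.
struct Frame {
  int64_t StackSize;
  int64_t ObjectOffset[2];  // one entry per frame index
};

// The effective immediate used for "frame index FI relative to SP32".
int64_t frameOffset(const Frame &F, unsigned FI) {
  return F.StackSize + F.ObjectOffset[FI];
}

int main() {
  // e.g. a 16-byte frame with an i32 at offset -4 and an i64 at offset -16.
  Frame F{16, {-4, -16}};
  for (unsigned FI = 0; FI < 2; ++FI)
    std::cout << "FI#" << FI << " -> SP32 + " << frameOffset(F, FI) << '\n';
  // A load or store carries the printed value in its offset operand; any
  // other user instead consumes the result of
  //   %off = CONST_I32 <value> ; %off = ADD_I32 %SP32, %off
}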
-bool WebAssemblyRegisterInfo::needsStackRealignment( - const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const WebAssemblyFrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = - ((MFI->getMaxAlignment() > StackAlign) || - F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, - Attribute::StackAlignment)); - - return requiresRealignment && canRealignStack(MF); +const TargetRegisterClass * +WebAssemblyRegisterInfo::getPointerRegClass(const MachineFunction &MF, + unsigned Kind) const { + assert(Kind == 0 && "Only one kind of pointer on WebAssembly"); + if (MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()) + return &WebAssembly::I64RegClass; + return &WebAssembly::I32RegClass; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h index dbdb9d0..ad1d71e 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.h @@ -42,9 +42,9 @@ public: // Debug information queries. unsigned getFrameRegister(const MachineFunction &MF) const override; - // Base pointer (stack realignment) support. - bool canRealignStack(const MachineFunction &MF) const; - bool needsStackRealignment(const MachineFunction &MF) const override; + const TargetRegisterClass * + getPointerRegClass(const MachineFunction &MF, + unsigned Kind = 0) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td index 2ba42eb..80a83fa 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.td @@ -33,22 +33,26 @@ def FP64 : WebAssemblyReg<"%FP64">; def SP32 : WebAssemblyReg<"%SP32">; def SP64 : WebAssemblyReg<"%SP64">; -// TODO(jfb) The following comes from NVPTX. Is it really needed, or can we do -// away with it? Try deleting once the backend works. -// WebAssembly uses virtual registers, but the backend defines a few physical -// registers here to keep SDAG and the MachineInstr layers happy. -foreach i = 0-4 in { - def I#i : WebAssemblyReg<"%i."#i>; // i32 - def L#i : WebAssemblyReg<"%l."#i>; // i64 - def F#i : WebAssemblyReg<"%f."#i>; // f32 - def D#i : WebAssemblyReg<"%d."#i>; // f64 -} +// The register allocation framework requires register classes have at least +// one register, so we define a few for the floating point register classes +// since we otherwise don't need a physical register in those classes. +def F32_0 : WebAssemblyReg<"%f32.0">; +def F64_0 : WebAssemblyReg<"%f64.0">; + +// The expression stack "register". This is an opaque entity which serves to +// order uses and defs that must remain in LIFO order. +def EXPR_STACK : WebAssemblyReg<"STACK">; + +// The incoming arguments "register". This is an opaque entity which serves to +// order the ARGUMENT instructions that are emulating live-in registers and +// must not be scheduled below other instructions. 
+def ARGUMENTS : WebAssemblyReg<"ARGUMENTS">; //===----------------------------------------------------------------------===// // Register classes //===----------------------------------------------------------------------===// -def Int32 : WebAssemblyRegClass<[i32], 32, (add (sequence "I%u", 0, 4), SP32)>; -def Int64 : WebAssemblyRegClass<[i64], 64, (add (sequence "L%u", 0, 4), SP64)>; -def Float32 : WebAssemblyRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>; -def Float64 : WebAssemblyRegClass<[f64], 64, (add (sequence "D%u", 0, 4))>; +def I32 : WebAssemblyRegClass<[i32], 32, (add FP32, SP32)>; +def I64 : WebAssemblyRegClass<[i64], 64, (add FP64, SP64)>; +def F32 : WebAssemblyRegClass<[f32], 32, (add F32_0)>; +def F64 : WebAssemblyRegClass<[f64], 64, (add F64_0)>; diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp new file mode 100644 index 0000000..4e08b2b --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyStoreResults.cpp @@ -0,0 +1,124 @@ +//===-- WebAssemblyStoreResults.cpp - Optimize using store result values --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements an optimization pass using store result values. +/// +/// WebAssembly's store instructions return the stored value. This is to enable +/// an optimization wherein uses of the stored value can be replaced by uses of +/// the store's result value, making the stored value register more likely to +/// be single-use, thus more likely to be useful to register stackifying, and +/// potentially also exposing the store to register stackifying. These both can +/// reduce get_local/set_local traffic. 
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssembly.h" +#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" +#include "WebAssemblyMachineFunctionInfo.h" +#include "WebAssemblySubtarget.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-store-results" + +namespace { +class WebAssemblyStoreResults final : public MachineFunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + WebAssemblyStoreResults() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { + return "WebAssembly Store Results"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineBlockFrequencyInfo>(); + AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: +}; +} // end anonymous namespace + +char WebAssemblyStoreResults::ID = 0; +FunctionPass *llvm::createWebAssemblyStoreResults() { + return new WebAssemblyStoreResults(); +} + +bool WebAssemblyStoreResults::runOnMachineFunction(MachineFunction &MF) { + DEBUG({ + dbgs() << "********** Store Results **********\n" + << "********** Function: " << MF.getName() << '\n'; + }); + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>(); + bool Changed = false; + + assert(MRI.isSSA() && "StoreResults depends on SSA form"); + + for (auto &MBB : MF) { + DEBUG(dbgs() << "Basic Block: " << MBB.getName() << '\n'); + for (auto &MI : MBB) + switch (MI.getOpcode()) { + default: + break; + case WebAssembly::STORE8_I32: + case WebAssembly::STORE16_I32: + case WebAssembly::STORE8_I64: + case WebAssembly::STORE16_I64: + case WebAssembly::STORE32_I64: + case WebAssembly::STORE_F32: + case WebAssembly::STORE_F64: + case WebAssembly::STORE_I32: + case WebAssembly::STORE_I64: + unsigned ToReg = MI.getOperand(0).getReg(); + unsigned FromReg = MI.getOperand(3).getReg(); + for (auto I = MRI.use_begin(FromReg), E = MRI.use_end(); I != E;) { + MachineOperand &O = *I++; + MachineInstr *Where = O.getParent(); + if (Where->getOpcode() == TargetOpcode::PHI) { + // PHIs use their operands on their incoming CFG edges rather than + // in their parent blocks. Get the basic block paired with this use + // of FromReg and check that MI's block dominates it. + MachineBasicBlock *Pred = + Where->getOperand(&O - &Where->getOperand(0) + 1).getMBB(); + if (!MDT.dominates(&MBB, Pred)) + continue; + } else { + // For a non-PHI, check that MI dominates the instruction in the + // normal way. + if (&MI == Where || !MDT.dominates(&MI, Where)) + continue; + } + Changed = true; + DEBUG(dbgs() << "Setting operand " << O << " in " << *Where + << " from " << MI << "\n"); + O.setReg(ToReg); + // If the store's def was previously dead, it is no longer. But the + // dead flag shouldn't be set yet. 
+ assert(!MI.getOperand(0).isDead() && "Dead flag set on store result"); + } + } + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp index 3d9e7aa..cb2d5a6 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp @@ -46,3 +46,4 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT, TLInfo(TM, *this) {} bool WebAssemblySubtarget::enableMachineScheduler() const { return true; } +bool WebAssemblySubtarget::useAA() const { return true; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h index 6f17619..f530a29 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h @@ -61,9 +61,15 @@ public: const WebAssemblyTargetLowering *getTargetLowering() const override { return &TLInfo; } + const WebAssemblyInstrInfo *getInstrInfo() const override { + return &InstrInfo; + } + const WebAssemblyRegisterInfo *getRegisterInfo() const override { + return &getInstrInfo()->getRegisterInfo(); + } const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override; - bool useAA() const override { return true; } + bool useAA() const override; // Predicates used by WebAssemblyInstrInfo.td. bool hasAddr64() const { return TargetTriple.isArch64Bit(); } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index 6f93248..e31ea46 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -45,11 +45,16 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, TT.isArch64Bit() - ? "e-p:64:64-i64:64-v128:8:128-n32:64-S128" - : "e-p:32:32-i64:64-v128:8:128-n32:64-S128", + : LLVMTargetMachine(T, TT.isArch64Bit() ? "e-p:64:64-i64:64-n32:64-S128" + : "e-p:32:32-i64:64-n32:64-S128", TT, CPU, FS, Options, RM, CM, OL), TLOF(make_unique<WebAssemblyTargetObjectFile>()) { + // WebAssembly type-checks expressions, but a noreturn function with a return + // type that doesn't match the context will cause a check failure. So we lower + // LLVM 'unreachable' to ISD::TRAP and then lower that to WebAssembly's + // 'unreachable' expression which is meant for that case. + this->Options.TrapUnreachable = true; + initAsmInfo(); // We need a reducible CFG, so disable some optimizations which tend to @@ -77,7 +82,7 @@ WebAssemblyTargetMachine::getSubtargetImpl(const Function &F) const { // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
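For the TrapUnreachable comment above, a concrete C++ case of the situation it addresses: after a call to a noreturn function, LLVM ends the path with an 'unreachable' terminator rather than a return, and lowering that to wasm's own 'unreachable' keeps a non-void function well-typed on every path. The function names here are hypothetical, used only for illustration:

#include <cstdlib>

[[noreturn]] void fatal_error() { std::abort(); }
int load_config() { return 42; }

// The call to fatal_error() never returns, so LLVM models the code after it
// as 'unreachable' instead of a return. On WebAssembly this is lowered (via
// ISD::TRAP) to the 'unreachable' instruction, so the body still type-checks
// as producing an i32 on every path.
int config_or_die(bool have_config) {
  if (have_config)
    return load_config();
  fatal_error();
}

int main() { return config_or_die(true) == 42 ? 0 : 1; }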
resetTargetOptions(F); - I = make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this); + I = llvm::make_unique<WebAssemblySubtarget>(TargetTriple, CPU, FS, *this); } return I.get(); } @@ -94,23 +99,18 @@ public: } FunctionPass *createTargetRegisterAllocator(bool) override; - void addFastRegAlloc(FunctionPass *RegAllocPass) override; - void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override; void addIRPasses() override; - bool addPreISel() override; bool addInstSelector() override; bool addILPOpts() override; void addPreRegAlloc() override; - void addRegAllocPasses(bool Optimized); void addPostRegAlloc() override; - void addPreSched2() override; void addPreEmitPass() override; }; } // end anonymous namespace TargetIRAnalysis WebAssemblyTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(WebAssemblyTTIImpl(this, F)); }); } @@ -124,50 +124,86 @@ FunctionPass *WebAssemblyPassConfig::createTargetRegisterAllocator(bool) { return nullptr; // No reg alloc } -void WebAssemblyPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { - assert(!RegAllocPass && "WebAssembly uses no regalloc!"); - addRegAllocPasses(false); -} - -void WebAssemblyPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { - assert(!RegAllocPass && "WebAssembly uses no regalloc!"); - addRegAllocPasses(true); -} - //===----------------------------------------------------------------------===// // The following functions are called from lib/CodeGen/Passes.cpp to modify // the CodeGen pass sequence. //===----------------------------------------------------------------------===// void WebAssemblyPassConfig::addIRPasses() { - // FIXME: the default for this option is currently POSIX, whereas - // WebAssembly's MVP should default to Single. if (TM->Options.ThreadModel == ThreadModel::Single) + // In "single" mode, atomics get lowered to non-atomics. addPass(createLowerAtomicPass()); else // Expand some atomic operations. WebAssemblyTargetLowering has hooks which // control specifically what gets lowered. addPass(createAtomicExpandPass(TM)); + // Optimize "returned" function attributes. + addPass(createWebAssemblyOptimizeReturned()); + TargetPassConfig::addIRPasses(); } -bool WebAssemblyPassConfig::addPreISel() { return false; } - bool WebAssemblyPassConfig::addInstSelector() { + (void)TargetPassConfig::addInstSelector(); addPass( createWebAssemblyISelDag(getWebAssemblyTargetMachine(), getOptLevel())); + // Run the argument-move pass immediately after the ScheduleDAG scheduler + // so that we can fix up the ARGUMENT instructions before anything else + // sees them in the wrong place. + addPass(createWebAssemblyArgumentMove()); return false; } -bool WebAssemblyPassConfig::addILPOpts() { return true; } +bool WebAssemblyPassConfig::addILPOpts() { + (void)TargetPassConfig::addILPOpts(); + return true; +} + +void WebAssemblyPassConfig::addPreRegAlloc() { + TargetPassConfig::addPreRegAlloc(); -void WebAssemblyPassConfig::addPreRegAlloc() {} + // Prepare store instructions for register stackifying. + addPass(createWebAssemblyStoreResults()); +} -void WebAssemblyPassConfig::addRegAllocPasses(bool Optimized) {} +void WebAssemblyPassConfig::addPostRegAlloc() { + // TODO: The following CodeGen passes don't currently support code containing + // virtual registers. Consider removing their restrictions and re-enabling + // them. 
+ // + // We use our own PrologEpilogInserter which is very slightly modified to + // tolerate virtual registers. + disablePass(&PrologEpilogCodeInserterID); + // Fails with: should be run after register allocation. + disablePass(&MachineCopyPropagationID); + + // Mark registers as representing wasm's expression stack. + addPass(createWebAssemblyRegStackify()); + + // Run the register coloring pass to reduce the total number of registers. + addPass(createWebAssemblyRegColoring()); + + TargetPassConfig::addPostRegAlloc(); + + // Run WebAssembly's version of the PrologEpilogInserter. Target-independent + // PEI runs after PostRegAlloc and after ShrinkWrap. Putting it here will run + // PEI before ShrinkWrap but otherwise in the same position in the order. + addPass(createWebAssemblyPEI()); +} -void WebAssemblyPassConfig::addPostRegAlloc() {} +void WebAssemblyPassConfig::addPreEmitPass() { + TargetPassConfig::addPreEmitPass(); -void WebAssemblyPassConfig::addPreSched2() {} + // Put the CFG in structured form; insert BLOCK and LOOP markers. + addPass(createWebAssemblyCFGStackify()); -void WebAssemblyPassConfig::addPreEmitPass() {} + // Lower br_unless into br_if. + addPass(createWebAssemblyLowerBrUnless()); + + // Create a mapping from LLVM CodeGen virtual registers to wasm registers. + addPass(createWebAssemblyRegNumbering()); + + // Perform the very last peephole optimizations on the code. + addPass(createWebAssemblyPeephole()); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp new file mode 100644 index 0000000..74e33b9 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp @@ -0,0 +1,24 @@ +//===-- WebAssemblyTargetObjectFile.cpp - WebAssembly Object Info ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file defines the functions of the WebAssembly-specific subclass +/// of TargetLoweringObjectFile. 
+/// +//===----------------------------------------------------------------------===// + +#include "WebAssemblyTargetObjectFile.h" +#include "WebAssemblyTargetMachine.h" +using namespace llvm; + +void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + InitializeELF(TM.Options.UseInitArray); +} diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h index ee78b94..39e50c9 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h @@ -16,50 +16,13 @@ #ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETOBJECTFILE_H #define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYTARGETOBJECTFILE_H -#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" namespace llvm { -class GlobalVariable; - -class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFile { +class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileELF { public: - WebAssemblyTargetObjectFile() { - TextSection = nullptr; - DataSection = nullptr; - BSSSection = nullptr; - ReadOnlySection = nullptr; - - StaticCtorSection = nullptr; - StaticDtorSection = nullptr; - LSDASection = nullptr; - EHFrameSection = nullptr; - DwarfAbbrevSection = nullptr; - DwarfInfoSection = nullptr; - DwarfLineSection = nullptr; - DwarfFrameSection = nullptr; - DwarfPubTypesSection = nullptr; - DwarfDebugInlineSection = nullptr; - DwarfStrSection = nullptr; - DwarfLocSection = nullptr; - DwarfARangesSection = nullptr; - DwarfRangesSection = nullptr; - } - - MCSection *getSectionForConstant(SectionKind Kind, - const Constant *C) const override { - return ReadOnlySection; - } - - MCSection *getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind, - Mangler &Mang, - const TargetMachine &TM) const override { - return DataSection; - } - - MCSection *SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind, - Mangler &Mang, - const TargetMachine &TM) const override; + void Initialize(MCContext &Ctx, const TargetMachine &TM) override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index fa88ed5..3566317 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -21,8 +21,7 @@ using namespace llvm; #define DEBUG_TYPE "wasmtti" TargetTransformInfo::PopcntSupportKind -WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) { +WebAssemblyTTIImpl::getPopcntSupport(unsigned TyWidth) const { assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); - // TODO: Make Math.popcount32 happen in WebAssembly. 
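The switch to fast-hardware popcount support below reflects that WebAssembly has native popcount instructions (i32.popcnt / i64.popcnt), so population count should stay a single operation per word rather than expand into a bit-twiddling sequence or a libcall. A trivial C++ illustration of code that benefits; the builtin lowers to llvm.ctpop, which the backend is expected to select directly:

#include <cstdint>
#include <iostream>

// Count set bits across an array; with fast hardware popcount each word
// costs one instruction.
unsigned countBits(const uint32_t *Words, unsigned N) {
  unsigned Total = 0;
  for (unsigned I = 0; I < N; ++I)
    Total += __builtin_popcount(Words[I]);
  return Total;
}

int main() {
  uint32_t Data[] = {0xFF, 0x0F0F0F0F, 0};
  std::cout << countBits(Data, 3) << '\n';  // 8 + 16 + 0 = 24
}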
- return TTI::PSK_Software; + return TargetTransformInfo::PSK_FastHardware; } diff --git a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h index 7ffb604..26dc388 100644 --- a/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h @@ -38,7 +38,7 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> { const WebAssemblyTargetLowering *getTLI() const { return TLI; } public: - WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, Function &F) + WebAssemblyTTIImpl(const WebAssemblyTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -54,7 +54,7 @@ public: // TODO: Implement more Scalar TTI for WebAssembly - TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth); + TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const; /// @} diff --git a/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt new file mode 100644 index 0000000..ee9d060 --- /dev/null +++ b/contrib/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt @@ -0,0 +1,311 @@ +# Tests which are known to fail from the GCC torture test suite. + +# Core dump. +920908-1.c +pr38151.c +va-arg-22.c + +# TargetRegisterInfo.h:315: static unsigned int llvm::TargetRegisterInfo::virtReg2Index(unsigned int): Assertion `isVirtualRegister(Reg) && "Not a virtual register"' failed. +struct-ret-1.c +va-arg-11.c +va-arg-21.c +va-arg-24.c +va-arg-trap-1.c + +# WebAssemblyCFGStackify.cpp:211: void SortBlocks(llvm::MachineFunction&, const llvm::MachineLoopInfo&): Assertion `L->contains( MLI.getLoopFor(&*prev(MachineFunction::iterator(&MBB)))) && "Loop isn't contiguous"' failed. +20000815-1.c +20010129-1.c +930628-1.c +980707-1.c + +# WebAssemblyISelLowering.cpp:316: virtual llvm::SDValue llvm::WebAssemblyTargetLowering::LowerCall(llvm::TargetLowering::CallLoweringInfo&, llvm::SmallVectorImpl<llvm::SDValue>&) const: Assertion `!Out.Flags.isByVal() && "byval is not valid for return values"' failed. +20030914-2.c +20040703-1.c +20081117-1.c +920625-1.c +931004-11.c +931004-13.c +980223.c +bitfld-5.c +complex-7.c +pr38969.c +pr51323.c +pr52129.c +pr57130.c + +# These were previously "Cannot select FrameIndex." Now most of them fail +# because they contain call frame pseudos (e.g. call a vararg func), +# frame pointers, or similar. This list will be updated again soon. 
+20000519-1.c +20000706-4.c +20000706-5.c +20000801-2.c +20000801-4.c +20011126-2.c + +20020529-1.c +20021024-1.c + +20030828-1.c +20030914-1.c + +20040302-1.c +20040625-1.c +20040823-1.c + +20041113-1.c + +20041214-1.c + +20050826-2.c + +20071213-1.c + +20080506-2.c +20080519-1.c + +20081103-1.c +20090113-1.c +20090113-2.c +20090113-3.c + +20090623-1.c + +920501-6.c +920501-8.c +920726-1.c +930518-1.c + +931004-10.c +931004-12.c +931004-14.c +931004-2.c +931004-4.c +931004-6.c +931004-8.c + +980205.c +980608-1.c +980709-1.c +980716-1.c +990127-1.c + +991216-2.c + +#cbrt.c +complex-5.c +complex-6.c + +enum-3.c +fprintf-chk-1.c +frame-address.c +loop-15.c +loop-ivopts-2.c +mayalias-3.c + +multi-ix.c + +pr20466-1.c + + +pr28778.c +pr28982b.c + +pr30778.c +pr31448-2.c +pr31448.c + +pr33870-1.c +pr33870.c + +pr38051.c + +pr39100.c + +pr39339.c +pr40022.c +pr40657.c + +pr43987.c + +pr44575.c + +pr44942.c +pr46309.c +pr47538.c +pr47925.c + +pr49390.c +pr49419.c + +#pr51877.c + +#pr52979-1.c +#pr52979-2.c +pr53645-2.c +pr53645.c + +pr56205.c + +pr56866.c + +pr57876.c +pr58277-1.c + +pr59643.c + +printf-chk-1.c +pta-field-1.c +pta-field-2.c + +stdarg-1.c +stdarg-2.c +stdarg-3.c +stdarg-4.c +strct-stdarg-1.c +strct-varg-1.c + +va-arg-1.c +va-arg-10.c +va-arg-12.c +va-arg-13.c +va-arg-14.c +va-arg-15.c +va-arg-16.c +va-arg-17.c +va-arg-18.c +va-arg-19.c +va-arg-2.c +va-arg-20.c +va-arg-23.c +va-arg-26.c +va-arg-4.c +va-arg-5.c +va-arg-6.c +va-arg-7.c +va-arg-8.c +va-arg-9.c +va-arg-pack-1.c +vfprintf-1.c +vfprintf-chk-1.c +vprintf-1.c +vprintf-chk-1.c + +# Cannot select callseq_end. +20040811-1.c +pr43220.c +vla-dealloc-1.c + +# Cannot select brind. +20071210-1.c +920501-4.c +920501-5.c + +# Cannot select BlockAddress. +comp-goto-1.c +980526-1.c +990208-1.c + +# WebAssembly hasn't implemented byval arguments. +20000412-3.c +20000419-1.c +20000706-1.c +20000706-2.c +20000707-1.c +20000717-1.c +20000717-5.c +20000808-1.c +20010605-2.c +20011113-1.c +20020215-1.c +20020810-1.c +20021118-1.c +20040707-1.c +20040709-1.c +20040709-2.c +20041201-1.c +20050713-1.c +20070614-1.c +920908-2.c +921112-1.c +921117-1.c +921123-2.c +921204-1.c +930126-1.c +930208-1.c +931004-5.c +931004-9.c +931031-1.c +950607-2.c +960416-1.c +990525-1.c +991118-1.c +bf64-1.c +complex-1.c +complex-2.c +pr15262-2.c +pr20621-1.c +pr23135.c +pr30185.c +pr42248.c + +# unimplemented operation lowering. +20010122-1.c +20030323-1.c +20030811-1.c +pr17377.c + +# Error: invalid output constraint '=t' in asm. +990413-2.c +990826-0.c + +# Error: __builtin_setjmp / __builtin_longjmp is not supported for the current target. +built-in-setjmp.c +pr60003.c + +# Error in the program / unsupported by Clang. 
+scal-to-vec1.c +scal-to-vec2.c +scal-to-vec3.c +20000822-1.c +20010209-1.c +20010605-1.c +20030501-1.c +20040520-1.c +20061220-1.c +20090219-1.c +920415-1.c +920428-2.c +920501-7.c +920612-2.c +920721-4.c +921017-1.c +921215-1.c +931002-1.c +comp-goto-2.c +nest-align-1.c +nest-stdar-1.c +nestfunc-1.c +nestfunc-2.c +nestfunc-3.c +nestfunc-5.c +nestfunc-6.c +nestfunc-7.c +pr22061-3.c +pr22061-4.c +pr24135.c +pr51447.c +20020412-1.c +20040308-1.c +20040423-1.c +20041218-2.c +20070919-1.c +align-nest.c +pr41935.c +20050107-1.c +20050119-1.c +20050119-2.c +920302-1.c +920501-3.c +920728-1.c +pr28865.c diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index 9eee4a0..09cc53a 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -10,10 +10,8 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "X86AsmInstrumentation.h" #include "X86Operand.h" -#include "X86RegisterInfo.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" @@ -118,11 +116,6 @@ bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; } bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; } -std::string FuncName(unsigned AccessSize, bool IsWrite) { - return std::string("__asan_report_") + (IsWrite ? "store" : "load") + - utostr(AccessSize); -} - class X86AddressSanitizer : public X86AsmInstrumentation { public: struct RegisterContext { @@ -136,26 +129,26 @@ public: public: RegisterContext(unsigned AddressReg, unsigned ShadowReg, unsigned ScratchReg) { - BusyRegs.push_back(convReg(AddressReg, MVT::i64)); - BusyRegs.push_back(convReg(ShadowReg, MVT::i64)); - BusyRegs.push_back(convReg(ScratchReg, MVT::i64)); + BusyRegs.push_back(convReg(AddressReg, 64)); + BusyRegs.push_back(convReg(ShadowReg, 64)); + BusyRegs.push_back(convReg(ScratchReg, 64)); } - unsigned AddressReg(MVT::SimpleValueType VT) const { - return convReg(BusyRegs[REG_OFFSET_ADDRESS], VT); + unsigned AddressReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_ADDRESS], Size); } - unsigned ShadowReg(MVT::SimpleValueType VT) const { - return convReg(BusyRegs[REG_OFFSET_SHADOW], VT); + unsigned ShadowReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_SHADOW], Size); } - unsigned ScratchReg(MVT::SimpleValueType VT) const { - return convReg(BusyRegs[REG_OFFSET_SCRATCH], VT); + unsigned ScratchReg(unsigned Size) const { + return convReg(BusyRegs[REG_OFFSET_SCRATCH], Size); } void AddBusyReg(unsigned Reg) { if (Reg != X86::NoRegister) - BusyRegs.push_back(convReg(Reg, MVT::i64)); + BusyRegs.push_back(convReg(Reg, 64)); } void AddBusyRegs(const X86Operand &Op) { @@ -163,36 +156,36 @@ public: AddBusyReg(Op.getMemIndexReg()); } - unsigned ChooseFrameReg(MVT::SimpleValueType VT) const { + unsigned ChooseFrameReg(unsigned Size) const { static const MCPhysReg Candidates[] = { X86::RBP, X86::RAX, X86::RBX, X86::RCX, X86::RDX, X86::RDI, X86::RSI }; for (unsigned Reg : Candidates) { if (!std::count(BusyRegs.begin(), BusyRegs.end(), Reg)) - return convReg(Reg, VT); + return convReg(Reg, Size); } return X86::NoRegister; } private: - unsigned convReg(unsigned Reg, MVT::SimpleValueType VT) const { - return Reg == X86::NoRegister ? 
Reg : getX86SubSuperRegister(Reg, VT); + unsigned convReg(unsigned Reg, unsigned Size) const { + return Reg == X86::NoRegister ? Reg : getX86SubSuperRegister(Reg, Size); } std::vector<unsigned> BusyRegs; }; - X86AddressSanitizer(const MCSubtargetInfo &STI) + X86AddressSanitizer(const MCSubtargetInfo *&STI) : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {} - virtual ~X86AddressSanitizer() {} + ~X86AddressSanitizer() override {} // X86AsmInstrumentation implementation: - virtual void InstrumentAndEmitInstruction(const MCInst &Inst, - OperandVector &Operands, - MCContext &Ctx, - const MCInstrInfo &MII, - MCStreamer &Out) override { + void InstrumentAndEmitInstruction(const MCInst &Inst, + OperandVector &Operands, + MCContext &Ctx, + const MCInstrInfo &MII, + MCStreamer &Out) override { InstrumentMOVS(Inst, Operands, Ctx, MII, Out); if (RepPrefix) EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX)); @@ -240,17 +233,16 @@ public: protected: void EmitLabel(MCStreamer &Out, MCSymbol *Label) { Out.EmitLabel(Label); } - void EmitLEA(X86Operand &Op, MVT::SimpleValueType VT, unsigned Reg, - MCStreamer &Out) { - assert(VT == MVT::i32 || VT == MVT::i64); + void EmitLEA(X86Operand &Op, unsigned Size, unsigned Reg, MCStreamer &Out) { + assert(Size == 32 || Size == 64); MCInst Inst; - Inst.setOpcode(VT == MVT::i32 ? X86::LEA32r : X86::LEA64r); - Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, VT))); + Inst.setOpcode(Size == 32 ? X86::LEA32r : X86::LEA64r); + Inst.addOperand(MCOperand::createReg(getX86SubSuperRegister(Reg, Size))); Op.addMemOperands(Inst, 5); EmitInstruction(Out, Inst); } - void ComputeMemOperandAddress(X86Operand &Op, MVT::SimpleValueType VT, + void ComputeMemOperandAddress(X86Operand &Op, unsigned Size, unsigned Reg, MCContext &Ctx, MCStreamer &Out); // Creates new memory operand with Displacement added to an original @@ -261,13 +253,13 @@ protected: MCContext &Ctx, int64_t *Residue); bool is64BitMode() const { - return STI.getFeatureBits()[X86::Mode64Bit]; + return STI->getFeatureBits()[X86::Mode64Bit]; } bool is32BitMode() const { - return STI.getFeatureBits()[X86::Mode32Bit]; + return STI->getFeatureBits()[X86::Mode32Bit]; } bool is16BitMode() const { - return STI.getFeatureBits()[X86::Mode16Bit]; + return STI->getFeatureBits()[X86::Mode16Bit]; } unsigned getPointerWidth() { @@ -437,7 +429,7 @@ void X86AddressSanitizer::InstrumentMOV(const MCInst &Inst, } void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, - MVT::SimpleValueType VT, + unsigned Size, unsigned Reg, MCContext &Ctx, MCStreamer &Out) { int64_t Displacement = 0; @@ -450,14 +442,14 @@ void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, // Emit Op as is. 
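The MVT-to-bit-width change above means register variants are now requested by their size in bits. As a reminder of what that selection means for the frame-register candidates, a small standalone table lookup, plain C++ and not the real getX86SubSuperRegister:

#include <cassert>
#include <string>

// The same architectural register viewed at different widths, e.g. the
// accumulator as RAX/EAX/AX/AL.
struct X86RegFamily {
  std::string Name64, Name32, Name16, Name8;
};

// Pick the sub- or super-register of a family with the requested size in
// bits, mirroring how the instrumentation now passes 64, 32, 16 or 8.
std::string regForSize(const X86RegFamily &R, unsigned Size) {
  switch (Size) {
  case 64: return R.Name64;
  case 32: return R.Name32;
  case 16: return R.Name16;
  case 8:  return R.Name8;
  }
  return "<invalid>";
}

int main() {
  X86RegFamily Acc{"RAX", "EAX", "AX", "AL"};
  assert(regForSize(Acc, 32) == "EAX");
  assert(regForSize(Acc, 8) == "AL");
}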
if (Displacement == 0) { - EmitLEA(Op, VT, Reg, Out); + EmitLEA(Op, Size, Reg, Out); return; } int64_t Residue; std::unique_ptr<X86Operand> NewOp = AddDisplacement(Op, Displacement, Ctx, &Residue); - EmitLEA(*NewOp, VT, Reg, Out); + EmitLEA(*NewOp, Size, Reg, Out); while (Residue != 0) { const MCConstantExpr *Disp = @@ -465,7 +457,7 @@ void X86AddressSanitizer::ComputeMemOperandAddress(X86Operand &Op, std::unique_ptr<X86Operand> DispOp = X86Operand::CreateMem(getPointerWidth(), 0, Disp, Reg, 0, 1, SMLoc(), SMLoc()); - EmitLEA(*DispOp, VT, Reg, Out); + EmitLEA(*DispOp, Size, Reg, Out); Residue -= Disp->getValue(); } } @@ -503,16 +495,16 @@ class X86AddressSanitizer32 : public X86AddressSanitizer { public: static const long kShadowOffset = 0x20000000; - X86AddressSanitizer32(const MCSubtargetInfo &STI) + X86AddressSanitizer32(const MCSubtargetInfo *&STI) : X86AddressSanitizer(STI) {} - virtual ~X86AddressSanitizer32() {} + ~X86AddressSanitizer32() override {} unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { unsigned FrameReg = GetFrameRegGeneric(Ctx, Out); if (FrameReg == X86::NoRegister) return FrameReg; - return getX86SubSuperRegister(FrameReg, MVT::i32); + return getX86SubSuperRegister(FrameReg, 32); } void SpillReg(MCStreamer &Out, unsigned Reg) { @@ -535,10 +527,10 @@ public: OrigSPOffset += 4; } - virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32); + void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); assert(LocalFrameReg != X86::NoRegister); const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); @@ -558,24 +550,24 @@ public: MRI->getDwarfRegNum(LocalFrameReg, true /* IsEH */)); } - SpillReg(Out, RegCtx.AddressReg(MVT::i32)); - SpillReg(Out, RegCtx.ShadowReg(MVT::i32)); - if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister) - SpillReg(Out, RegCtx.ScratchReg(MVT::i32)); + SpillReg(Out, RegCtx.AddressReg(32)); + SpillReg(Out, RegCtx.ShadowReg(32)); + if (RegCtx.ScratchReg(32) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(32)); StoreFlags(Out); } - virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i32); + void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(32); assert(LocalFrameReg != X86::NoRegister); RestoreFlags(Out); - if (RegCtx.ScratchReg(MVT::i32) != X86::NoRegister) - RestoreReg(Out, RegCtx.ScratchReg(MVT::i32)); - RestoreReg(Out, RegCtx.ShadowReg(MVT::i32)); - RestoreReg(Out, RegCtx.AddressReg(MVT::i32)); + if (RegCtx.ScratchReg(32) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(32)); + RestoreReg(Out, RegCtx.ShadowReg(32)); + RestoreReg(Out, RegCtx.AddressReg(32)); unsigned FrameReg = GetFrameReg(Ctx, Out); if (Ctx.getRegisterInfo() && FrameReg != X86::NoRegister) { @@ -586,18 +578,18 @@ public: } } - virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMOVSImpl(unsigned 
AccessSize, MCContext &Ctx, - MCStreamer &Out) override; + void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; private: void EmitCallAsanReport(unsigned AccessSize, bool IsWrite, MCContext &Ctx, @@ -610,10 +602,11 @@ private: .addReg(X86::ESP) .addImm(-16)); EmitInstruction( - Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(MVT::i32))); + Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32))); - const std::string &Fn = FuncName(AccessSize, IsWrite); - MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn)); + MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + (IsWrite ? "store" : "load") + + llvm::Twine(AccessSize)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr)); @@ -623,14 +616,14 @@ private: void X86AddressSanitizer32::InstrumentMemOperandSmall( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); - unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); - unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(8); - assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister); - unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32); + assert(RegCtx.ScratchReg(32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(32); - ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out); + ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( AddressRegI32)); @@ -673,7 +666,7 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc())); - EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); + EmitLEA(*Op, 32, ScratchRegI32, Out); break; } case 4: @@ -698,10 +691,10 @@ void X86AddressSanitizer32::InstrumentMemOperandSmall( void X86AddressSanitizer32::InstrumentMemOperandLarge( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); - unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); - ComputeMemOperandAddress(Op, MVT::i32, AddressRegI32, Ctx, Out); + ComputeMemOperandAddress(Op, 32, AddressRegI32, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV32rr).addReg(ShadowRegI32).addReg( AddressRegI32)); @@ -760,16 +753,16 @@ class X86AddressSanitizer64 : public X86AddressSanitizer { public: static const long kShadowOffset = 0x7fff8000; - X86AddressSanitizer64(const MCSubtargetInfo &STI) + X86AddressSanitizer64(const MCSubtargetInfo *&STI) : X86AddressSanitizer(STI) {} - virtual ~X86AddressSanitizer64() {} + ~X86AddressSanitizer64() override {} unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) { unsigned 
FrameReg = GetFrameRegGeneric(Ctx, Out); if (FrameReg == X86::NoRegister) return FrameReg; - return getX86SubSuperRegister(FrameReg, MVT::i64); + return getX86SubSuperRegister(FrameReg, 64); } void SpillReg(MCStreamer &Out, unsigned Reg) { @@ -792,10 +785,10 @@ public: OrigSPOffset += 8; } - virtual void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64); + void InstrumentMemOperandPrologue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); assert(LocalFrameReg != X86::NoRegister); const MCRegisterInfo *MRI = Ctx.getRegisterInfo(); @@ -816,24 +809,24 @@ public: } EmitAdjustRSP(Ctx, Out, -128); - SpillReg(Out, RegCtx.ShadowReg(MVT::i64)); - SpillReg(Out, RegCtx.AddressReg(MVT::i64)); - if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister) - SpillReg(Out, RegCtx.ScratchReg(MVT::i64)); + SpillReg(Out, RegCtx.ShadowReg(64)); + SpillReg(Out, RegCtx.AddressReg(64)); + if (RegCtx.ScratchReg(64) != X86::NoRegister) + SpillReg(Out, RegCtx.ScratchReg(64)); StoreFlags(Out); } - virtual void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override { - unsigned LocalFrameReg = RegCtx.ChooseFrameReg(MVT::i64); + void InstrumentMemOperandEpilogue(const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override { + unsigned LocalFrameReg = RegCtx.ChooseFrameReg(64); assert(LocalFrameReg != X86::NoRegister); RestoreFlags(Out); - if (RegCtx.ScratchReg(MVT::i64) != X86::NoRegister) - RestoreReg(Out, RegCtx.ScratchReg(MVT::i64)); - RestoreReg(Out, RegCtx.AddressReg(MVT::i64)); - RestoreReg(Out, RegCtx.ShadowReg(MVT::i64)); + if (RegCtx.ScratchReg(64) != X86::NoRegister) + RestoreReg(Out, RegCtx.ScratchReg(64)); + RestoreReg(Out, RegCtx.AddressReg(64)); + RestoreReg(Out, RegCtx.ShadowReg(64)); EmitAdjustRSP(Ctx, Out, 128); unsigned FrameReg = GetFrameReg(Ctx, Out); @@ -845,18 +838,18 @@ public: } } - virtual void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, - bool IsWrite, - const RegisterContext &RegCtx, - MCContext &Ctx, - MCStreamer &Out) override; - virtual void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, - MCStreamer &Out) override; + void InstrumentMemOperandSmall(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMemOperandLarge(X86Operand &Op, unsigned AccessSize, + bool IsWrite, + const RegisterContext &RegCtx, + MCContext &Ctx, + MCStreamer &Out) override; + void InstrumentMOVSImpl(unsigned AccessSize, MCContext &Ctx, + MCStreamer &Out) override; private: void EmitAdjustRSP(MCContext &Ctx, MCStreamer &Out, long Offset) { @@ -864,7 +857,7 @@ private: std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc())); - EmitLEA(*Op, MVT::i64, X86::RSP, Out); + EmitLEA(*Op, 64, X86::RSP, Out); OrigSPOffset += Offset; } @@ -878,12 +871,13 @@ private: .addReg(X86::RSP) .addImm(-16)); - if (RegCtx.AddressReg(MVT::i64) != X86::RDI) { + if (RegCtx.AddressReg(64) != X86::RDI) { EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg( - RegCtx.AddressReg(MVT::i64))); + RegCtx.AddressReg(64))); } 
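Both EmitCallAsanReport helpers now build the report callee name inline instead of through the removed FuncName helper. A standalone sketch of that naming, and of where the two shadow offsets shown above (0x20000000 for 32-bit, 0x7fff8000 for 64-bit) fit, assuming the usual ASan shadow mapping of one shadow byte per eight application bytes:

#include <cstdint>
#include <iostream>
#include <string>

// Name of the compiler-rt routine called on a failed check,
// e.g. "__asan_report_load4" for a 4-byte read.
std::string asanReportName(unsigned AccessSize, bool IsWrite) {
  return std::string("__asan_report_") + (IsWrite ? "store" : "load") +
         std::to_string(AccessSize);
}

// Shadow address computation with the offsets used by the 32- and 64-bit
// instrumenters above, assuming the usual shadow scale of 3.
uint64_t shadowAddr(uint64_t Addr, uint64_t ShadowOffset) {
  return (Addr >> 3) + ShadowOffset;
}

int main() {
  std::cout << asanReportName(4, /*IsWrite=*/false) << '\n';  // __asan_report_load4
  std::cout << std::hex << shadowAddr(0x1000, 0x7fff8000) << '\n';
}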
- const std::string &Fn = FuncName(AccessSize, IsWrite); - MCSymbol *FnSym = Ctx.getOrCreateSymbol(StringRef(Fn)); + MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") + + (IsWrite ? "store" : "load") + + llvm::Twine(AccessSize)); const MCSymbolRefExpr *FnExpr = MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx); EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr)); @@ -893,16 +887,16 @@ private: void X86AddressSanitizer64::InstrumentMemOperandSmall( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64); - unsigned AddressRegI32 = RegCtx.AddressReg(MVT::i32); - unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64); - unsigned ShadowRegI32 = RegCtx.ShadowReg(MVT::i32); - unsigned ShadowRegI8 = RegCtx.ShadowReg(MVT::i8); + unsigned AddressRegI64 = RegCtx.AddressReg(64); + unsigned AddressRegI32 = RegCtx.AddressReg(32); + unsigned ShadowRegI64 = RegCtx.ShadowReg(64); + unsigned ShadowRegI32 = RegCtx.ShadowReg(32); + unsigned ShadowRegI8 = RegCtx.ShadowReg(8); - assert(RegCtx.ScratchReg(MVT::i32) != X86::NoRegister); - unsigned ScratchRegI32 = RegCtx.ScratchReg(MVT::i32); + assert(RegCtx.ScratchReg(32) != X86::NoRegister); + unsigned ScratchRegI32 = RegCtx.ScratchReg(32); - ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out); + ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( AddressRegI64)); @@ -944,7 +938,7 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( std::unique_ptr<X86Operand> Op( X86Operand::CreateMem(getPointerWidth(), 0, Disp, ScratchRegI32, 0, 1, SMLoc(), SMLoc())); - EmitLEA(*Op, MVT::i32, ScratchRegI32, Out); + EmitLEA(*Op, 32, ScratchRegI32, Out); break; } case 4: @@ -969,10 +963,10 @@ void X86AddressSanitizer64::InstrumentMemOperandSmall( void X86AddressSanitizer64::InstrumentMemOperandLarge( X86Operand &Op, unsigned AccessSize, bool IsWrite, const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) { - unsigned AddressRegI64 = RegCtx.AddressReg(MVT::i64); - unsigned ShadowRegI64 = RegCtx.ShadowReg(MVT::i64); + unsigned AddressRegI64 = RegCtx.AddressReg(64); + unsigned ShadowRegI64 = RegCtx.ShadowReg(64); - ComputeMemOperandAddress(Op, MVT::i64, AddressRegI64, Ctx, Out); + ComputeMemOperandAddress(Op, 64, AddressRegI64, Ctx, Out); EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(ShadowRegI64).addReg( AddressRegI64)); @@ -1030,7 +1024,7 @@ void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize, } // End anonymous namespace -X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo &STI) +X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI) : STI(STI), InitialFrameReg(0) {} X86AsmInstrumentation::~X86AsmInstrumentation() {} @@ -1043,7 +1037,7 @@ void X86AsmInstrumentation::InstrumentAndEmitInstruction( void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, const MCInst &Inst) { - Out.EmitInstruction(Inst, STI); + Out.EmitInstruction(Inst, *STI); } unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, @@ -1067,17 +1061,17 @@ unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, const MCSubtargetInfo &STI) { - Triple T(STI.getTargetTriple()); + const MCContext &Ctx, const MCSubtargetInfo *&STI) { + Triple 
T(STI->getTargetTriple()); const bool hasCompilerRTSupport = T.isOSLinux(); if (ClAsanInstrumentAssembly && hasCompilerRTSupport && MCOptions.SanitizeAddress) { - if (STI.getFeatureBits()[X86::Mode32Bit] != 0) + if (STI->getFeatureBits()[X86::Mode32Bit] != 0) return new X86AddressSanitizer32(STI); - if (STI.getFeatureBits()[X86::Mode64Bit] != 0) + if (STI->getFeatureBits()[X86::Mode64Bit] != 0) return new X86AddressSanitizer64(STI); } return new X86AsmInstrumentation(STI); } -} // End llvm namespace +} // end llvm namespace diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index 19ebcc4..470cead 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -28,7 +28,8 @@ class X86AsmInstrumentation; X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, const MCSubtargetInfo &STI); + const MCContext &Ctx, + const MCSubtargetInfo *&STI); class X86AsmInstrumentation { public: @@ -48,15 +49,16 @@ public: protected: friend X86AsmInstrumentation * CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions, - const MCContext &Ctx, const MCSubtargetInfo &STI); + const MCContext &Ctx, + const MCSubtargetInfo *&STI); - X86AsmInstrumentation(const MCSubtargetInfo &STI); + X86AsmInstrumentation(const MCSubtargetInfo *&STI); unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out); void EmitInstruction(MCStreamer &Out, const MCInst &Inst); - const MCSubtargetInfo &STI; + const MCSubtargetInfo *&STI; unsigned InitialFrameReg; }; diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index bca059d..4d8ffac 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -11,7 +11,6 @@ #include "X86AsmInstrumentation.h" #include "X86AsmParserCommon.h" #include "X86Operand.h" -#include "X86ISelLowering.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" @@ -26,6 +25,7 @@ #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSection.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" @@ -57,10 +57,10 @@ static const char OpPrecedence[] = { }; class X86AsmParser : public MCTargetAsmParser { - MCSubtargetInfo &STI; const MCInstrInfo &MII; ParseInstructionInfo *InstInfo; std::unique_ptr<X86AsmInstrumentation> Instrumentation; + private: SMLoc consumeToken() { MCAsmParser &Parser = getParser(); @@ -154,6 +154,7 @@ private: // Push the new operator. InfixOperatorStack.push_back(Op); } + int64_t execute() { // Push any remaining operators onto the postfix stack. 
while (!InfixOperatorStack.empty()) { @@ -268,6 +269,7 @@ private: bool StopOnLBrac, AddImmPrefix; InfixCalculator IC; InlineAsmIdentifierInfo Info; + public: IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) : State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0), @@ -712,10 +714,10 @@ private: SMLoc End, unsigned Size, StringRef Identifier, InlineAsmIdentifierInfo &Info); + bool parseDirectiveEven(SMLoc L); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); - bool validateInstruction(MCInst &Inst, const OperandVector &Ops); bool processInstruction(MCInst &Inst, const OperandVector &Ops); /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds @@ -758,23 +760,24 @@ private: bool is64BitMode() const { // FIXME: Can tablegen auto-generate this? - return STI.getFeatureBits()[X86::Mode64Bit]; + return getSTI().getFeatureBits()[X86::Mode64Bit]; } bool is32BitMode() const { // FIXME: Can tablegen auto-generate this? - return STI.getFeatureBits()[X86::Mode32Bit]; + return getSTI().getFeatureBits()[X86::Mode32Bit]; } bool is16BitMode() const { // FIXME: Can tablegen auto-generate this? - return STI.getFeatureBits()[X86::Mode16Bit]; + return getSTI().getFeatureBits()[X86::Mode16Bit]; } void SwitchMode(unsigned mode) { + MCSubtargetInfo &STI = copySTI(); FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit}); FeatureBitset OldMode = STI.getFeatureBits() & AllModes; unsigned FB = ComputeAvailableFeatures( STI.ToggleFeature(OldMode.flip(mode))); setAvailableFeatures(FB); - + assert(FeatureBitset({mode}) == (STI.getFeatureBits() & AllModes)); } @@ -798,12 +801,12 @@ private: /// } public: - X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &Parser, + X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, const MCInstrInfo &mii, const MCTargetOptions &Options) - : MCTargetAsmParser(), STI(sti), MII(mii), InstInfo(nullptr) { + : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr) { // Initialize the set of available features. - setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits())); Instrumentation.reset( CreateX86AsmInstrumentation(Options, Parser.getContext(), STI)); } @@ -912,6 +915,11 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, if (RegNo == 0) RegNo = MatchRegisterName(Tok.getString().lower()); + // The "flags" register cannot be referenced directly. + // Treat it as an identifier instead. + if (isParsingInlineAsm() && isParsingIntelSyntax() && RegNo == X86::EFLAGS) + RegNo = 0; + if (!is64BitMode()) { // FIXME: This should be done using Requires<Not64BitMode> and // Requires<In64BitMode> so "eiz" usage in 64-bit instructions can be also @@ -1042,8 +1050,11 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) { .Cases("BYTE", "byte", 8) .Cases("WORD", "word", 16) .Cases("DWORD", "dword", 32) + .Cases("FWORD", "fword", 48) .Cases("QWORD", "qword", 64) + .Cases("MMWORD","mmword", 64) .Cases("XWORD", "xword", 80) + .Cases("TBYTE", "tbyte", 80) .Cases("XMMWORD", "xmmword", 128) .Cases("YMMWORD", "ymmword", 256) .Cases("ZMMWORD", "zmmword", 512) @@ -1062,8 +1073,8 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( // Insert an explicit size if the user didn't have one. 
if (!Size) { Size = getPointerWidth(); - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start, - /*Len=*/0, Size)); + InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, + /*Len=*/0, Size); } // Create an absolute memory reference in order to match against @@ -1082,8 +1093,8 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( if (!Size) { Size = Info.Type * 8; // Size is in terms of bits in this context. if (Size) - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_SizeDirective, Start, - /*Len=*/0, Size)); + InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, + /*Len=*/0, Size); } } @@ -1097,13 +1108,13 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( } static void -RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, +RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites, StringRef SymName, int64_t ImmDisp, int64_t FinalImmDisp, SMLoc &BracLoc, SMLoc &StartInBrac, SMLoc &End) { // Remove the '[' and ']' from the IR string. - AsmRewrites->push_back(AsmRewrite(AOK_Skip, BracLoc, 1)); - AsmRewrites->push_back(AsmRewrite(AOK_Skip, End, 1)); + AsmRewrites.emplace_back(AOK_Skip, BracLoc, 1); + AsmRewrites.emplace_back(AOK_Skip, End, 1); // If ImmDisp is non-zero, then we parsed a displacement before the // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp]) @@ -1114,15 +1125,14 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, // We have an immediate displacement before the bracketed expression. // Adjust this to match the final immediate displacement. bool Found = false; - for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(), - E = AsmRewrites->end(); I != E; ++I) { - if ((*I).Loc.getPointer() > BracLoc.getPointer()) + for (AsmRewrite &AR : AsmRewrites) { + if (AR.Loc.getPointer() > BracLoc.getPointer()) continue; - if ((*I).Kind == AOK_ImmPrefix || (*I).Kind == AOK_Imm) { + if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) { assert (!Found && "ImmDisp already rewritten."); - (*I).Kind = AOK_Imm; - (*I).Len = BracLoc.getPointer() - (*I).Loc.getPointer(); - (*I).Val = FinalImmDisp; + AR.Kind = AOK_Imm; + AR.Len = BracLoc.getPointer() - AR.Loc.getPointer(); + AR.Val = FinalImmDisp; Found = true; break; } @@ -1133,28 +1143,27 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites, // We have a symbolic and an immediate displacement, but no displacement // before the bracketed expression. Put the immediate displacement // before the bracketed expression. - AsmRewrites->push_back(AsmRewrite(AOK_Imm, BracLoc, 0, FinalImmDisp)); + AsmRewrites.emplace_back(AOK_Imm, BracLoc, 0, FinalImmDisp); } } // Remove all the ImmPrefix rewrites within the brackets. - for (SmallVectorImpl<AsmRewrite>::iterator I = AsmRewrites->begin(), - E = AsmRewrites->end(); I != E; ++I) { - if ((*I).Loc.getPointer() < StartInBrac.getPointer()) + for (AsmRewrite &AR : AsmRewrites) { + if (AR.Loc.getPointer() < StartInBrac.getPointer()) continue; - if ((*I).Kind == AOK_ImmPrefix) - (*I).Kind = AOK_Delete; + if (AR.Kind == AOK_ImmPrefix) + AR.Kind = AOK_Delete; } const char *SymLocPtr = SymName.data(); // Skip everything before the symbol. if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) { assert(Len > 0 && "Expected a non-negative length."); - AsmRewrites->push_back(AsmRewrite(AOK_Skip, StartInBrac, Len)); + AsmRewrites.emplace_back(AOK_Skip, StartInBrac, Len); } // Skip everything after the symbol. 
if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) { SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size()); assert(Len > 0 && "Expected a non-negative length."); - AsmRewrites->push_back(AsmRewrite(AOK_Skip, Loc, Len)); + AsmRewrites.emplace_back(AOK_Skip, Loc, Len); } } @@ -1162,6 +1171,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); + AsmToken::TokenKind PrevTK = AsmToken::Error; bool Done = false; while (!Done) { bool UpdateLocLex = true; @@ -1205,7 +1215,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return Error(Tok.getLoc(), "Unexpected identifier!"); } else { // This is a dot operator, not an adjacent identifier. - if (Identifier.find('.') != StringRef::npos) { + if (Identifier.find('.') != StringRef::npos && + PrevTK == AsmToken::RBrac) { return false; } else { InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); @@ -1223,8 +1234,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { case AsmToken::Integer: { StringRef ErrMsg; if (isParsingInlineAsm() && SM.getAddImmPrefix()) - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, - Tok.getLoc())); + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Tok.getLoc()); // Look for 'b' or 'f' following an Integer as a directional label SMLoc Loc = getTok().getLoc(); int64_t IntVal = getTok().getIntVal(); @@ -1237,7 +1247,7 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { getContext().getDirectionalLocalSymbol(IntVal, IDVal == "b"); MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; const MCExpr *Val = - MCSymbolRefExpr::create(Sym, Variant, getContext()); + MCSymbolRefExpr::create(Sym, Variant, getContext()); if (IDVal == "b" && Sym->isUndefined()) return Error(Loc, "invalid reference to undefined symbol"); StringRef Identifier = Sym->getName(); @@ -1275,6 +1285,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (!Done && UpdateLocLex) End = consumeToken(); + + PrevTK = TK; } return false; } @@ -1302,7 +1314,7 @@ X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, // A symbolic displacement. Disp = Sym; if (isParsingInlineAsm()) - RewriteIntelBracExpression(InstInfo->AsmRewrites, SM.getSymName(), + RewriteIntelBracExpression(*InstInfo->AsmRewrites, SM.getSymName(), ImmDisp, SM.getImm(), BracLoc, StartInBrac, End); } @@ -1359,7 +1371,7 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, InlineAsmIdentifierInfo &Info, bool IsUnevaluatedOperand, SMLoc &End) { MCAsmParser &Parser = getParser(); - assert (isParsingInlineAsm() && "Expected to be parsing inline assembly."); + assert(isParsingInlineAsm() && "Expected to be parsing inline assembly."); Val = nullptr; StringRef LineBuf(Identifier.data()); @@ -1372,15 +1384,17 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, // Advance the token stream until the end of the current token is // after the end of what the frontend claimed. const char *EndPtr = Tok.getLoc().getPointer() + LineBuf.size(); - while (true) { + do { End = Tok.getEndLoc(); getLexer().Lex(); - - assert(End.getPointer() <= EndPtr && "frontend claimed part of a token?"); - if (End.getPointer() == EndPtr) break; - } + } while (End.getPointer() < EndPtr); Identifier = LineBuf; + // The frontend should end parsing on an assembler token boundary, unless it + // failed parsing. 
+ assert((End.getPointer() == EndPtr || !Result) && + "frontend claimed part of a token?"); + // If the identifier lookup was unsuccessful, assume that we are dealing with // a label. if (!Result) { @@ -1389,9 +1403,8 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, Loc, false); assert(InternalName.size() && "We should have an internal name here."); // Push a rewrite for replacing the identifier name with the internal name. - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Label, Loc, - Identifier.size(), - InternalName)); + InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(), + InternalName); } // Create the symbol reference. @@ -1418,8 +1431,7 @@ X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, AsmToken ImmDispToken = Parser.Lex(); // Eat the integer. if (isParsingInlineAsm()) - InstInfo->AsmRewrites->push_back( - AsmRewrite(AOK_ImmPrefix, ImmDispToken.getLoc())); + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, ImmDispToken.getLoc()); if (getLexer().isNot(AsmToken::LBrac)) { // An immediate following a 'segment register', 'colon' token sequence can @@ -1588,8 +1600,7 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); unsigned Len = DotDispStr.size(); unsigned Val = OrigDispVal + DotDispVal; - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_DotOperator, Loc, Len, - Val)); + InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, Val); } NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext()); @@ -1613,7 +1624,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { return nullptr; // Don't emit the offset operator. - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7)); + InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7); // The offset operator will have an 'r' constraint, thus we need to create // register operand to ensure proper matching. Just pick a GPR based on @@ -1664,7 +1675,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) { // Rewrite the type operator and the C or C++ type or variable in terms of an // immediate. E.g. TYPE foo -> $$4 unsigned Len = End.getPointer() - TypeLoc.getPointer(); - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal)); + InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal); const MCExpr *Imm = MCConstantExpr::create(CVal, getContext()); return X86Operand::CreateImm(Imm, Start, End); @@ -1688,12 +1699,14 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { return ParseIntelOperator(IOK_TYPE); } + bool PtrInOperand = false; unsigned Size = getIntelMemOperandSize(Tok.getString()); if (Size) { Parser.Lex(); // Eat operand size (e.g., byte, word). if (Tok.getString() != "PTR" && Tok.getString() != "ptr") return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!"); Parser.Lex(); // Eat ptr. + PtrInOperand = true; } Start = Tok.getLoc(); @@ -1711,10 +1724,10 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { unsigned Len = Tok.getLoc().getPointer() - Start.getPointer(); if (StartTok.getString().size() == Len) // Just add a prefix if this wasn't a complex immediate expression. - InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix, Start)); + InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start); else // Otherwise, rewrite the complex expression as a single immediate. 
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, Start, Len, Imm)); + InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm); } if (getLexer().isNot(AsmToken::LBrac)) { @@ -1740,7 +1753,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { } // rounding mode token - if (STI.getFeatureBits()[X86::FeatureAVX512] && + if (getSTI().getFeatureBits()[X86::FeatureAVX512] && getLexer().is(AsmToken::LCurly)) return ParseRoundingModeOp(Start, End); @@ -1749,9 +1762,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { if (!ParseRegister(RegNo, Start, End)) { // If this is a segment register followed by a ':', then this is the start // of a segment override, otherwise this is a normal register reference. - if (getLexer().isNot(AsmToken::Colon)) + // In case it is a normal register and there is ptr in the operand this + // is an error + if (getLexer().isNot(AsmToken::Colon)){ + if (PtrInOperand){ + return ErrorOperand(Start, "expected memory operand after " + "'ptr', found register operand instead"); + } return X86Operand::CreateReg(RegNo, Start, End); - + } + return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size); } @@ -1798,7 +1818,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { } case AsmToken::LCurly:{ SMLoc Start = Parser.getTok().getLoc(), End; - if (STI.getFeatureBits()[X86::FeatureAVX512]) + if (getSTI().getFeatureBits()[X86::FeatureAVX512]) return ParseRoundingModeOp(Start, End); return ErrorOperand(Start, "unknown token in expression"); } @@ -1808,7 +1828,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, const MCParsedAsmOperand &Op) { MCAsmParser &Parser = getParser(); - if(STI.getFeatureBits()[X86::FeatureAVX512]) { + if(getSTI().getFeatureBits()[X86::FeatureAVX512]) { if (getLexer().is(AsmToken::LCurly)) { // Eat "{" and mark the current place. const SMLoc consumedToken = consumeToken(); @@ -1983,12 +2003,13 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, } // Validate the scale amount. - if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && + if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) && ScaleVal != 1) { Error(Loc, "scale factor in 16-bit address must be 1"); return nullptr; - } - if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){ + } + if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && + ScaleVal != 8) { Error(Loc, "scale factor in address must be 1, 2, 4 or 8"); return nullptr; } @@ -2175,7 +2196,6 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, Name == "repne" || Name == "repnz" || Name == "rex64" || Name == "data16"; - // This does the actual operand parsing. Don't parse any more if we have a // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we // just want to parse the "lock" as the first instruction and the "incl" as @@ -2213,6 +2233,20 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, (isPrefix && getLexer().is(AsmToken::Slash))) Parser.Lex(); + // This is for gas compatibility and cannot be done in td. + // Adding "p" for some floating point with no argument. 
+ // For example: fsub --> fsubp + bool IsFp = + Name == "fsub" || Name == "fdiv" || Name == "fsubr" || Name == "fdivr"; + if (IsFp && Operands.size() == 1) { + const char *Repl = StringSwitch<const char *>(Name) + .Case("fsub", "fsubp") + .Case("fdiv", "fdivp") + .Case("fsubr", "fsubrp") + .Case("fdivr", "fdivrp"); + static_cast<X86Operand &>(*Operands[0]).setTokenValue(Repl); + } + // This is a terrible hack to handle "out[bwl]? %al, (%dx)" -> // "outb %al, %dx". Out doesn't take a memory form, but this is a widely // documented form in various unofficial manuals, so a lot of code uses it. @@ -2242,9 +2276,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // Append default arguments to "ins[bwld]" if (Name.startswith("ins") && Operands.size() == 1 && - (Name == "insb" || Name == "insw" || Name == "insl" || - Name == "insd" )) { - AddDefaultSrcDestOperands(Operands, + (Name == "insb" || Name == "insw" || Name == "insl" || Name == "insd")) { + AddDefaultSrcDestOperands(Operands, X86Operand::CreateReg(X86::DX, NameLoc, NameLoc), DefaultMemDIOperand(NameLoc)); } @@ -2346,98 +2379,21 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, // instalias with an immediate operand yet. if (Name == "int" && Operands.size() == 2) { X86Operand &Op1 = static_cast<X86Operand &>(*Operands[1]); - if (Op1.isImm() && isa<MCConstantExpr>(Op1.getImm()) && - cast<MCConstantExpr>(Op1.getImm())->getValue() == 3) { - Operands.erase(Operands.begin() + 1); - static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3"); - } + if (Op1.isImm()) + if (auto *CE = dyn_cast<MCConstantExpr>(Op1.getImm())) + if (CE->getValue() == 3) { + Operands.erase(Operands.begin() + 1); + static_cast<X86Operand &>(*Operands[0]).setTokenValue("int3"); + } } return false; } -static bool convertToSExti8(MCInst &Inst, unsigned Opcode, unsigned Reg, - bool isCmp) { - MCInst TmpInst; - TmpInst.setOpcode(Opcode); - if (!isCmp) - TmpInst.addOperand(MCOperand::createReg(Reg)); - TmpInst.addOperand(MCOperand::createReg(Reg)); - TmpInst.addOperand(Inst.getOperand(0)); - Inst = TmpInst; - return true; -} - -static bool convert16i16to16ri8(MCInst &Inst, unsigned Opcode, - bool isCmp = false) { - if (!Inst.getOperand(0).isImm() || - !isImmSExti16i8Value(Inst.getOperand(0).getImm())) - return false; - - return convertToSExti8(Inst, Opcode, X86::AX, isCmp); -} - -static bool convert32i32to32ri8(MCInst &Inst, unsigned Opcode, - bool isCmp = false) { - if (!Inst.getOperand(0).isImm() || - !isImmSExti32i8Value(Inst.getOperand(0).getImm())) - return false; - - return convertToSExti8(Inst, Opcode, X86::EAX, isCmp); -} - -static bool convert64i32to64ri8(MCInst &Inst, unsigned Opcode, - bool isCmp = false) { - if (!Inst.getOperand(0).isImm() || - !isImmSExti64i8Value(Inst.getOperand(0).getImm())) - return false; - - return convertToSExti8(Inst, Opcode, X86::RAX, isCmp); -} - -bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { - switch (Inst.getOpcode()) { - default: return true; - case X86::INT: - X86Operand &Op = static_cast<X86Operand &>(*Ops[1]); - assert(Op.isImm() && "expected immediate"); - int64_t Res; - if (!Op.getImm()->evaluateAsAbsolute(Res) || Res > 255) { - Error(Op.getStartLoc(), "interrupt vector must be in range [0-255]"); - return false; - } - return true; - } - llvm_unreachable("handle the instruction appropriately"); -} - bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { switch (Inst.getOpcode()) { default: return 
false; - case X86::AND16i16: return convert16i16to16ri8(Inst, X86::AND16ri8); - case X86::AND32i32: return convert32i32to32ri8(Inst, X86::AND32ri8); - case X86::AND64i32: return convert64i32to64ri8(Inst, X86::AND64ri8); - case X86::XOR16i16: return convert16i16to16ri8(Inst, X86::XOR16ri8); - case X86::XOR32i32: return convert32i32to32ri8(Inst, X86::XOR32ri8); - case X86::XOR64i32: return convert64i32to64ri8(Inst, X86::XOR64ri8); - case X86::OR16i16: return convert16i16to16ri8(Inst, X86::OR16ri8); - case X86::OR32i32: return convert32i32to32ri8(Inst, X86::OR32ri8); - case X86::OR64i32: return convert64i32to64ri8(Inst, X86::OR64ri8); - case X86::CMP16i16: return convert16i16to16ri8(Inst, X86::CMP16ri8, true); - case X86::CMP32i32: return convert32i32to32ri8(Inst, X86::CMP32ri8, true); - case X86::CMP64i32: return convert64i32to64ri8(Inst, X86::CMP64ri8, true); - case X86::ADD16i16: return convert16i16to16ri8(Inst, X86::ADD16ri8); - case X86::ADD32i32: return convert32i32to32ri8(Inst, X86::ADD32ri8); - case X86::ADD64i32: return convert64i32to64ri8(Inst, X86::ADD64ri8); - case X86::SUB16i16: return convert16i16to16ri8(Inst, X86::SUB16ri8); - case X86::SUB32i32: return convert32i32to32ri8(Inst, X86::SUB32ri8); - case X86::SUB64i32: return convert64i32to64ri8(Inst, X86::SUB64ri8); - case X86::ADC16i16: return convert16i16to16ri8(Inst, X86::ADC16ri8); - case X86::ADC32i32: return convert32i32to32ri8(Inst, X86::ADC32ri8); - case X86::ADC64i32: return convert64i32to64ri8(Inst, X86::ADC64ri8); - case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8); - case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8); - case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8); + case X86::VMOVZPQILo2PQIrr: case X86::VMOVAPDrr: case X86::VMOVAPDYrr: case X86::VMOVAPSrr: @@ -2457,18 +2413,19 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { unsigned NewOpc; switch (Inst.getOpcode()) { default: llvm_unreachable("Invalid opcode"); - case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; - case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; - case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; - case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; - case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; - case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; - case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; - case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; - case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; - case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; - case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; - case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; } Inst.setOpcode(NewOpc); return true; 
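(A hedged aside, not part of the patch text: the X86AsmParserCommon.h hunk just below replaces the open-coded hexadecimal range checks with the isInt<N>/isUInt<N> templates. The self-contained C++ sketch here, with invented helper names, illustrates the predicate the 16-bit case computes: an immediate may use the sign-extended 8-bit form if it is already a signed 8-bit value, or if it is a 16-bit pattern whose truncation to int16_t fits in 8 bits.)

#include <cassert>
#include <cstdint>

// Stand-ins for llvm::isInt<8> / llvm::isUInt<16>; names invented for this sketch.
static bool fitsInt8(uint64_t V) {
  const int64_t S = static_cast<int64_t>(V);
  return S >= -128 && S <= 127;
}
static bool fitsUInt16(uint64_t V) { return V <= 0xFFFFu; }

// Mirrors the simplified isImmSExti16i8Value: accept values that are already
// signed 8-bit, or 16-bit patterns whose low byte sign-extends back to the
// same 16-bit value.
static bool immSExti16i8(uint64_t V) {
  return fitsInt8(V) ||
         (fitsUInt16(V) &&
          fitsInt8(static_cast<uint64_t>(static_cast<int16_t>(V))));
}

int main() {
  assert(immSExti16i8(0x007F));   // 127 fits an imm8 directly
  assert(immSExti16i8(0xFF80));   // 0xFF80 is -128 when read as int16_t
  assert(!immSExti16i8(0x0080));  // 128 needs the full 16-bit immediate
  return 0;
}

Read this way, the old explicit ranges (<= 0x7F, 0xFF80..0xFFFF, 0xFFFFFFFFFFFFFF80 and above) and the template form accept the same set of 64-bit values; only the notation changes.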
@@ -2573,9 +2530,6 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, isParsingIntelSyntax())) { default: llvm_unreachable("Unexpected match result!"); case Match_Success: - if (!validateInstruction(Inst, Operands)) - return true; - // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the // individual transformations can chain off each other. @@ -2819,9 +2773,6 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, unsigned NumSuccessfulMatches = std::count(std::begin(Match), std::end(Match), Match_Success); if (NumSuccessfulMatches == 1) { - if (!validateInstruction(Inst, Operands)) - return true; - // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the individual // transformations can chain off each other. @@ -2898,10 +2849,29 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { "a '%' prefix in .intel_syntax"); } return false; - } + } else if (IDVal == ".even") + return parseDirectiveEven(DirectiveID.getLoc()); return true; } +/// parseDirectiveEven +/// ::= .even +bool X86AsmParser::parseDirectiveEven(SMLoc L) { + const MCSection *Section = getStreamer().getCurrentSection().first; + if (getLexer().isNot(AsmToken::EndOfStatement)) { + TokError("unexpected token in directive"); + return false; + } + if (!Section) { + getStreamer().InitSections(false); + Section = getStreamer().getCurrentSection().first; + } + if (Section->UseCodeAlign()) + getStreamer().EmitCodeAlignment(2, 0); + else + getStreamer().EmitValueToAlignment(2, 0, 1, 0); + return false; +} /// ParseDirectiveWord /// ::= .word [ expression (, expression)* ] bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { @@ -2909,10 +2879,19 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) { if (getLexer().isNot(AsmToken::EndOfStatement)) { for (;;) { const MCExpr *Value; + SMLoc ExprLoc = getLexer().getLoc(); if (getParser().parseExpression(Value)) return false; - getParser().getStreamer().EmitValue(Value, Size); + if (const auto *MCE = dyn_cast<MCConstantExpr>(Value)) { + assert(Size <= 8 && "Invalid size"); + uint64_t IntValue = MCE->getValue(); + if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue)) + return Error(ExprLoc, "literal value out of range for directive"); + getStreamer().EmitIntValue(IntValue, Size); + } else { + getStreamer().EmitValue(Value, Size, ExprLoc); + } if (getLexer().is(AsmToken::EndOfStatement)) break; diff --git a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h index 7610806..54538c8 100644 --- a/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h +++ b/contrib/llvm/lib/Target/X86/AsmParser/X86AsmParserCommon.h @@ -13,30 +13,25 @@ namespace llvm { inline bool isImmSExti16i8Value(uint64_t Value) { - return (( Value <= 0x000000000000007FULL)|| - (0x000000000000FF80ULL <= Value && Value <= 0x000000000000FFFFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<8>(Value) || + (isUInt<16>(Value) && isInt<8>(static_cast<int16_t>(Value))); } inline bool isImmSExti32i8Value(uint64_t Value) { - return (( Value <= 0x000000000000007FULL)|| - (0x00000000FFFFFF80ULL <= Value && Value <= 0x00000000FFFFFFFFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<8>(Value) || + (isUInt<32>(Value) && 
isInt<8>(static_cast<int32_t>(Value))); } inline bool isImmSExti64i8Value(uint64_t Value) { - return (( Value <= 0x000000000000007FULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<8>(Value); } inline bool isImmSExti64i32Value(uint64_t Value) { - return (( Value <= 0x000000007FFFFFFFULL)|| - (0xFFFFFFFF80000000ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isInt<32>(Value); } inline bool isImmUnsignedi8Value(uint64_t Value) { - return (( Value <= 0x00000000000000FFULL)|| - (0xFFFFFFFFFFFFFF80ULL <= Value && Value <= 0xFFFFFFFFFFFFFFFFULL)); + return isUInt<8>(Value) || isInt<8>(Value); } } // End of namespace llvm diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp index cfc3ee2..ce8fcf1 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -95,11 +95,13 @@ X86GenericDisassembler::X86GenericDisassembler( llvm_unreachable("Invalid CPU mode"); } +namespace { struct Region { ArrayRef<uint8_t> Bytes; uint64_t Base; Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {} }; +} // end anonymous namespace /// A callback function that wraps the readByte method from Region. /// @@ -831,8 +833,12 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_XMM256: case TYPE_XMM512: case TYPE_VK1: + case TYPE_VK2: + case TYPE_VK4: case TYPE_VK8: case TYPE_VK16: + case TYPE_VK32: + case TYPE_VK64: case TYPE_DEBUGREG: case TYPE_CONTROLREG: case TYPE_BNDR: @@ -962,6 +968,7 @@ static bool translateInstruction(MCInst &mcInst, return true; } + mcInst.clear(); mcInst.setOpcode(insn.instructionID); // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3 // prefix bytes should be disassembled as xrelease and xacquire then set the diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index f73fa75..040143b 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -361,7 +361,7 @@ static int readPrefixes(struct InternalInstruction* insn) { * then it should be disassembled as a xacquire/xrelease not repne/rep. */ if ((byte == 0xf2 || byte == 0xf3) && - ((nextByte == 0xf0) | + ((nextByte == 0xf0) || ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) insn->xAcquireRelease = true; /* @@ -980,6 +980,47 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { insn->opcode == 0xE3) attrMask ^= ATTR_ADSIZE; + /* + * In 64-bit mode all f64 superscripted opcodes ignore opcode size prefix + * CALL/JMP/JCC instructions need to ignore 0x66 and consume 4 bytes + */ + + if (insn->mode == MODE_64BIT && + isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) { + switch (insn->opcode) { + case 0xE8: + case 0xE9: + // Take care of psubsb and other mmx instructions. + if (insn->opcodeType == ONEBYTE) { + attrMask ^= ATTR_OPSIZE; + insn->immediateSize = 4; + insn->displacementSize = 4; + } + break; + case 0x82: + case 0x83: + case 0x84: + case 0x85: + case 0x86: + case 0x87: + case 0x88: + case 0x89: + case 0x8A: + case 0x8B: + case 0x8C: + case 0x8D: + case 0x8E: + case 0x8F: + // Take care of lea and three byte ops. 
+ if (insn->opcodeType == TWOBYTE) { + attrMask ^= ATTR_OPSIZE; + insn->immediateSize = 4; + insn->displacementSize = 4; + } + break; + } + } + if (getIDWithAttrMask(&instructionID, insn, attrMask)) return -1; @@ -1447,8 +1488,12 @@ static int readModRM(struct InternalInstruction* insn) { case TYPE_XMM: \ return prefix##_XMM0 + index; \ case TYPE_VK1: \ + case TYPE_VK2: \ + case TYPE_VK4: \ case TYPE_VK8: \ case TYPE_VK16: \ + case TYPE_VK32: \ + case TYPE_VK64: \ if (index > 7) \ *valid = 0; \ return prefix##_K0 + index; \ diff --git a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index a79a923..28a628e 100644 --- a/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/contrib/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -572,8 +572,6 @@ struct InternalInstruction { // The last byte of the opcode, not counting any ModR/M extension uint8_t opcode; - // The ModR/M byte of the instruction, if it is an opcode extension - uint8_t modRMExtension; // decode state diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index ea727e6..b4c0bc4 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h index 62b6b73..bbb3090 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h @@ -15,12 +15,9 @@ #define LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H #include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCSubtargetInfo.h" namespace llvm { -class MCOperand; - class X86ATTInstPrinter final : public MCInstPrinter { public: X86ATTInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp index 91b144a..82f0ee5 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -21,6 +21,27 @@ using namespace llvm; +static unsigned getVectorRegSize(unsigned RegNo) { + if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31) + return 512; + if (X86::YMM0 <= RegNo && RegNo <= X86::YMM31) + return 256; + if (X86::XMM0 <= RegNo && RegNo <= X86::XMM31) + return 128; + if (X86::MM0 <= RegNo && RegNo <= X86::MM7) + return 64; + + llvm_unreachable("Unknown vector reg!"); + return 0; +} + +static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT, + unsigned OperandIndex) { + unsigned OpReg = MI->getOperand(OperandIndex).getReg(); + return MVT::getVectorVT(ScalarVT, + getVectorRegSize(OpReg)/ScalarVT.getSizeInBits()); +} + /// \brief Extracts the src/dst types for a given zero extension instruction. 
/// \note While the number of elements in DstVT type correct, the /// number in the SrcVT type is expanded to fill the src xmm register and the @@ -107,6 +128,75 @@ static void getZeroExtensionTypes(const MCInst *MI, MVT &SrcVT, MVT &DstVT) { } } +#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: \ + case X86::V##Inst##Suffix##src##k: \ + case X86::V##Inst##Suffix##src##kz: + +#define CASE_SSE_INS_COMMON(Inst, src) \ + case X86::Inst##src: + +#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: + +#define CASE_MOVDUP(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) \ + +#define CASE_UNPCK(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) \ + +#define CASE_SHUF(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src##i) \ + CASE_AVX_INS_COMMON(Inst, , r##src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src##i) \ + CASE_SSE_INS_COMMON(Inst, r##src##i) \ + +#define CASE_VPERM(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z256, src##i) \ + CASE_MASK_INS_COMMON(Inst, Z128, src##i) \ + CASE_AVX_INS_COMMON(Inst, , src##i) \ + CASE_AVX_INS_COMMON(Inst, Y, src##i) \ + +#define CASE_VSHUF(Inst, src) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i) \ + +/// \brief Extracts the types and if it has memory operand for a given +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction. +static void getVSHUF64x2FamilyInfo(const MCInst *MI, MVT &VT, bool &HasMemOp) { + HasMemOp = false; + switch (MI->getOpcode()) { + default: + llvm_unreachable("Unknown VSHUF64x2 family instructions."); + break; + CASE_VSHUF(64X2, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF(64X2, r) + VT = getRegOperandVectorVT(MI, MVT::i64, 0); + break; + CASE_VSHUF(32X4, m) + HasMemOp = true; // FALL THROUGH. + CASE_VSHUF(32X4, r) + VT = getRegOperandVectorVT(MI, MVT::i32, 0); + break; + } +} + //===----------------------------------------------------------------------===// // Top Level Entrypoint //===----------------------------------------------------------------------===// @@ -127,23 +217,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::BLENDPDrri: case X86::VBLENDPDrri: + case X86::VBLENDPDYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::BLENDPDrmi: case X86::VBLENDPDrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VBLENDPDYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
case X86::VBLENDPDYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v4f64, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f64, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -152,23 +233,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::BLENDPSrri: case X86::VBLENDPSrri: + case X86::VBLENDPSYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::BLENDPSrmi: case X86::VBLENDPSrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VBLENDPSYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. case X86::VBLENDPSYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v8f32, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::f32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -177,23 +249,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::PBLENDWrri: case X86::VPBLENDWrri: + case X86::VPBLENDWYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PBLENDWrmi: case X86::VPBLENDWrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPBLENDWYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. case X86::VPBLENDWYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v16i16, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i16, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -201,23 +264,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, break; case X86::VPBLENDDrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPBLENDDrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v4i32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPBLENDDYrri: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
+ case X86::VPBLENDDrmi: case X86::VPBLENDDYrmi: if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeBLENDMask(MVT::v8i32, + DecodeBLENDMask(getRegOperandVectorVT(MI, MVT::i32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -239,6 +292,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MOVLHPSrr: case X86::VMOVLHPSrr: + case X86::VMOVLHPSZrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); @@ -247,569 +301,327 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MOVHLPSrr: case X86::VMOVHLPSrr: + case X86::VMOVHLPSZrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); DecodeMOVHLPSMask(2, ShuffleMask); break; - case X86::MOVSLDUPrr: - case X86::VMOVSLDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::MOVSLDUPrm: - case X86::VMOVSLDUPrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask); - break; - - case X86::VMOVSHDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VMOVSHDUPYrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask); - break; - - case X86::VMOVSLDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_MOVDUP(MOVSLDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::VMOVSLDUPYrm: + CASE_MOVDUP(MOVSLDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask); + DecodeMOVSLDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); break; - case X86::MOVSHDUPrr: - case X86::VMOVSHDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_MOVDUP(MOVSHDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::MOVSHDUPrm: - case X86::VMOVSHDUPrm: + CASE_MOVDUP(MOVSHDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask); + DecodeMOVSHDUPMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); break; - case X86::VMOVDDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_MOVDUP(MOVDDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::VMOVDDUPYrm: + CASE_MOVDUP(MOVDDUP, m) DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVDDUPMask(MVT::v4f64, ShuffleMask); - break; - - case X86::MOVDDUPrr: - case X86::VMOVDDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. 
- case X86::MOVDDUPrm: - case X86::VMOVDDUPrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVDDUPMask(MVT::v2f64, ShuffleMask); + DecodeMOVDDUPMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); break; case X86::PSLLDQri: case X86::VPSLLDQri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSLLDQMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSLLDQYri: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSLLDQMask(MVT::v32i8, + DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PSRLDQri: case X86::VPSRLDQri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSRLDQMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSRLDQYri: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSRLDQMask(MVT::v32i8, + DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PALIGNR128rr: case X86::VPALIGNR128rr: + case X86::VPALIGNR256rr: Src1Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::PALIGNR128rm: case X86::VPALIGNR128rm: - Src2Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePALIGNRMask(MVT::v16i8, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPALIGNR256rr: - Src1Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. case X86::VPALIGNR256rm: Src2Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePALIGNRMask(MVT::v32i8, + DecodePALIGNRMask(getRegOperandVectorVT(MI, MVT::i8, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PSHUFDri: case X86::VPSHUFDri: + case X86::VPSHUFDYri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::PSHUFDmi: case X86::VPSHUFDmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v4i32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFDYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. case X86::VPSHUFDYmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v8i32, + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::i32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; case X86::PSHUFHWri: case X86::VPSHUFHWri: + case X86::VPSHUFHWYri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. 
case X86::PSHUFHWmi: case X86::VPSHUFHWmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFHWMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFHWYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. case X86::VPSHUFHWYmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFHWMask(MVT::v16i16, + DecodePSHUFHWMask(getRegOperandVectorVT(MI, MVT::i16, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; + case X86::PSHUFLWri: case X86::VPSHUFLWri: + case X86::VPSHUFLWYri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::PSHUFLWmi: case X86::VPSHUFLWmi: - DestName = getRegName(MI->getOperand(0).getReg()); - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFLWMask(MVT::v8i16, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - break; - case X86::VPSHUFLWYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. case X86::VPSHUFLWYmi: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFLWMask(MVT::v16i16, + DecodePSHUFLWMask(getRegOperandVectorVT(MI, MVT::i16, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); break; - case X86::PUNPCKHBWrr: - case X86::VPUNPCKHBWrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHBWrm: - case X86::VPUNPCKHBWrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i8, ShuffleMask); - break; - case X86::VPUNPCKHBWYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHBWYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v32i8, ShuffleMask); - break; - case X86::PUNPCKHWDrr: - case X86::VPUNPCKHWDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHWDrm: - case X86::VPUNPCKHWDrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i16, ShuffleMask); - break; - case X86::VPUNPCKHWDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHWDYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i16, ShuffleMask); - break; - case X86::PUNPCKHDQrr: - case X86::VPUNPCKHDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHDQrm: - case X86::VPUNPCKHDQrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v4i32, ShuffleMask); - break; - case X86::VPUNPCKHDQYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHDQYrm: + case X86::MMX_PSHUFWri: Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i32, ShuffleMask); - break; - case X86::VPUNPCKHDQZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
- case X86::VPUNPCKHDQZrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v16i32, ShuffleMask); - break; - case X86::PUNPCKHQDQrr: - case X86::VPUNPCKHQDQrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKHQDQrm: - case X86::VPUNPCKHQDQrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); + case X86::MMX_PSHUFWmi: DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v2i64, ShuffleMask); - break; - case X86::VPUNPCKHQDQYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHQDQYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v4i64, ShuffleMask); - break; - case X86::VPUNPCKHQDQZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKHQDQZrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKHMask(MVT::v8i64, ShuffleMask); + if (MI->getOperand(MI->getNumOperands() - 1).isImm()) + DecodePSHUFMask(MVT::v4i16, + MI->getOperand(MI->getNumOperands() - 1).getImm(), + ShuffleMask); break; - case X86::PUNPCKLBWrr: - case X86::VPUNPCKLBWrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::PUNPCKLBWrm: - case X86::VPUNPCKLBWrm: + case X86::PSWAPDrr: Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i8, ShuffleMask); - break; - case X86::VPUNPCKLBWYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VPUNPCKLBWYrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v32i8, ShuffleMask); - break; - case X86::PUNPCKLWDrr: - case X86::VPUNPCKLWDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::PUNPCKLWDrm: - case X86::VPUNPCKLWDrm: - Src1Name = getRegName(MI->getOperand(1).getReg()); + case X86::PSWAPDrm: DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i16, ShuffleMask); + DecodePSWAPMask(MVT::v2i32, ShuffleMask); break; - case X86::VPUNPCKLWDYrr: + + CASE_UNPCK(PUNPCKHBW, r) + case X86::MMX_PUNPCKHBWirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLWDYrm: + CASE_UNPCK(PUNPCKHBW, m) + case X86::MMX_PUNPCKHBWirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i16, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask); break; - case X86::PUNPCKLDQrr: - case X86::VPUNPCKLDQrr: + + CASE_UNPCK(PUNPCKHWD, r) + case X86::MMX_PUNPCKHWDirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::PUNPCKLDQrm: - case X86::VPUNPCKLDQrm: + CASE_UNPCK(PUNPCKHWD, m) + case X86::MMX_PUNPCKHWDirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v4i32, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask); break; - case X86::VPUNPCKLDQYrr: + + CASE_UNPCK(PUNPCKHDQ, r) + case X86::MMX_PUNPCKHDQirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
- case X86::VPUNPCKLDQYrm: + CASE_UNPCK(PUNPCKHDQ, m) + case X86::MMX_PUNPCKHDQirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i32, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask); break; - case X86::VPUNPCKLDQZrr: + + CASE_UNPCK(PUNPCKHQDQ, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLDQZrm: + CASE_UNPCK(PUNPCKHQDQ, m) Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v16i32, ShuffleMask); + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask); break; - case X86::PUNPCKLQDQrr: - case X86::VPUNPCKLQDQrr: + + CASE_UNPCK(PUNPCKLBW, r) + case X86::MMX_PUNPCKLBWirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::PUNPCKLQDQrm: - case X86::VPUNPCKLQDQrm: + CASE_UNPCK(PUNPCKLBW, m) + case X86::MMX_PUNPCKLBWirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v2i64, ShuffleMask); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i8, 0), ShuffleMask); break; - case X86::VPUNPCKLQDQYrr: + + CASE_UNPCK(PUNPCKLWD, r) + case X86::MMX_PUNPCKLWDirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLQDQYrm: + CASE_UNPCK(PUNPCKLWD, m) + case X86::MMX_PUNPCKLWDirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v4i64, ShuffleMask); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i16, 0), ShuffleMask); break; - case X86::VPUNPCKLQDQZrr: + + CASE_UNPCK(PUNPCKLDQ, r) + case X86::MMX_PUNPCKLDQirr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VPUNPCKLQDQZrm: + CASE_UNPCK(PUNPCKLDQ, m) + case X86::MMX_PUNPCKLDQirm: Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeUNPCKLMask(MVT::v8i64, ShuffleMask); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i32, 0), ShuffleMask); break; - case X86::SHUFPDrri: - case X86::VSHUFPDrri: + CASE_UNPCK(PUNPCKLQDQ, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::SHUFPDrmi: - case X86::VSHUFPDrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v2f64, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VSHUFPDYrri: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VSHUFPDYrmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v4f64, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); + CASE_UNPCK(PUNPCKLQDQ, m) Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::i64, 0), ShuffleMask); break; - case X86::SHUFPSrri: - case X86::VSHUFPSrri: + CASE_SHUF(SHUFPD, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
- case X86::SHUFPSrmi: - case X86::VSHUFPSrmi: + CASE_SHUF(SHUFPD, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v4f32, + DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f64, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VSHUFPSYrri: + + CASE_SHUF(SHUFPS, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VSHUFPSYrmi: + CASE_SHUF(SHUFPS, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodeSHUFPMask(MVT::v8f32, + DecodeSHUFPMask(getRegOperandVectorVT(MI, MVT::f32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::UNPCKLPDrr: - case X86::VUNPCKLPDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKLPDrm: - case X86::VUNPCKLPDrm: - DecodeUNPCKLMask(MVT::v2f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPDYrm: - DecodeUNPCKLMask(MVT::v4f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPDZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPDZrm: - DecodeUNPCKLMask(MVT::v8f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::UNPCKLPSrr: - case X86::VUNPCKLPSrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKLPSrm: - case X86::VUNPCKLPSrm: - DecodeUNPCKLMask(MVT::v4f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPSYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPSYrm: - DecodeUNPCKLMask(MVT::v8f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKLPSZrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::VUNPCKLPSZrm: - DecodeUNPCKLMask(MVT::v16f32, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::UNPCKHPDrr: - case X86::VUNPCKHPDrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. - case X86::UNPCKHPDrm: - case X86::VUNPCKHPDrm: - DecodeUNPCKHMask(MVT::v2f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VUNPCKHPDYrr: - Src2Name = getRegName(MI->getOperand(2).getReg()); - // FALL THROUGH. 
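
The SHUFPD/SHUFPS cases now pass a type derived from the register operand, so one decode covers XMM through ZMM. For reference, the immediate semantics on a single 128-bit lane of SHUFPS, written as a standalone sketch (illustrative names; the wider forms repeat this per lane):

#include <array>
#include <cstdint>
#include <cstdio>

// SHUFPS on one lane: result[0..1] are chosen from the first source and
// result[2..3] from the second, two immediate bits per element.
std::array<int, 4> shufpsLaneMask(uint8_t Imm) {
  return {
      (Imm >> 0) & 0x3,           // from src1
      (Imm >> 2) & 0x3,           // from src1
      ((Imm >> 4) & 0x3) + 4,     // from src2 (indices offset by NumElts)
      ((Imm >> 6) & 0x3) + 4};    // from src2
}

int main() {
  // shufps with immediate 0xe4 keeps each half in order: mask [0, 1, 6, 7].
  for (int M : shufpsLaneMask(0xe4)) printf("%d ", M);
  printf("\n");
  return 0;
}
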
- case X86::VUNPCKHPDYrm: - DecodeUNPCKHMask(MVT::v4f64, ShuffleMask); - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_VSHUF(64X2, r) + CASE_VSHUF(64X2, m) + CASE_VSHUF(32X4, r) + CASE_VSHUF(32X4, m) { + MVT VT; + bool HasMemOp; + unsigned NumOp = MI->getNumOperands(); + getVSHUF64x2FamilyInfo(MI, VT, HasMemOp); + decodeVSHUF64x2FamilyMask(VT, MI->getOperand(NumOp - 1).getImm(), + ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); + if (HasMemOp) { + assert((NumOp >= 8) && "Expected at least 8 operands!"); + Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg()); + } else { + assert((NumOp >= 4) && "Expected at least 4 operands!"); + Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg()); + Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg()); + } break; - case X86::VUNPCKHPDZrr: + } + + CASE_UNPCK(UNPCKLPD, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VUNPCKHPDZrm: - DecodeUNPCKHMask(MVT::v8f64, ShuffleMask); + CASE_UNPCK(UNPCKLPD, m) + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::UNPCKHPSrr: - case X86::VUNPCKHPSrr: + + CASE_UNPCK(UNPCKLPS, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::UNPCKHPSrm: - case X86::VUNPCKHPSrm: - DecodeUNPCKHMask(MVT::v4f32, ShuffleMask); + CASE_UNPCK(UNPCKLPS, m) + DecodeUNPCKLMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VUNPCKHPSYrr: + + CASE_UNPCK(UNPCKHPD, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VUNPCKHPSYrm: - DecodeUNPCKHMask(MVT::v8f32, ShuffleMask); + CASE_UNPCK(UNPCKHPD, m) + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f64, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VUNPCKHPSZrr: + + CASE_UNPCK(UNPCKHPS, r) Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. - case X86::VUNPCKHPSZrm: - DecodeUNPCKHMask(MVT::v16f32, ShuffleMask); + CASE_UNPCK(UNPCKHPS, m) + DecodeUNPCKHMask(getRegOperandVectorVT(MI, MVT::f32, 0), ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VPERMILPSri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPERMILPSmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v4f32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERMILPSYri: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VPERMILPSYmi: - if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v8f32, - MI->getOperand(MI->getNumOperands() - 1).getImm(), - ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::VPERMILPDri: + + CASE_VPERM(PERMILPS, r) Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. 
- case X86::VPERMILPDmi: + CASE_VPERM(PERMILPS, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v2f64, + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f32, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::VPERMILPDYri: + + CASE_VPERM(PERMILPD, r) Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. - case X86::VPERMILPDYmi: + CASE_VPERM(PERMILPD, m) if (MI->getOperand(MI->getNumOperands() - 1).isImm()) - DecodePSHUFMask(MVT::v4f64, + DecodePSHUFMask(getRegOperandVectorVT(MI, MVT::f64, 0), MI->getOperand(MI->getNumOperands() - 1).getImm(), ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VPERM2F128rr: case X86::VPERM2I128rr: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -824,6 +636,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::VPERMQYri: case X86::VPERMPDYri: Src1Name = getRegName(MI->getOperand(1).getReg()); @@ -846,6 +659,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeScalarMoveMask(MVT::v2f64, nullptr == Src2Name, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::MOVSSrr: case X86::VMOVSSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -861,6 +675,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::MOVZPQILo2PQIrr: case X86::VMOVPQI2QIrr: case X86::VMOVZPQILo2PQIrr: + case X86::VMOVZPQILo2PQIZrr: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::MOVQI2PQIrm: @@ -869,9 +684,11 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VMOVQI2PQIrm: case X86::VMOVZQI2PQIrm: case X86::VMOVZPQILo2PQIrm: + case X86::VMOVZPQILo2PQIZrm: DecodeZeroMoveLowMask(MVT::v2i64, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + case X86::MOVDI2PDIrm: case X86::VMOVDI2PDIrm: DecodeZeroMoveLowMask(MVT::v4i32, ShuffleMask); diff --git a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h index 6e371da..20cd7ff 100644 --- a/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h +++ b/contrib/llvm/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h @@ -19,8 +19,6 @@ namespace llvm { -class MCOperand; - class X86IntelInstPrinter final : public MCInstPrinter { public: X86IntelInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 629802f..133bd0e 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -69,15 +69,19 @@ public: class X86AsmBackend : public MCAsmBackend { const StringRef CPU; bool HasNopl; - const uint64_t MaxNopLength; + uint64_t MaxNopLength; public: - X86AsmBackend(const Target &T, StringRef CPU) - : MCAsmBackend(), CPU(CPU), MaxNopLength(CPU == "slm" ? 
7 : 15) { + X86AsmBackend(const Target &T, StringRef CPU) : MCAsmBackend(), CPU(CPU) { HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" && CPU != "c3" && CPU != "c3-2"; + // Max length of true long nop instruction is 15 bytes. + // Max length of long nop replacement instruction is 7 bytes. + // Taking into account SilverMont architecture features max length of nops + // is reduced for it to achieve better performance. + MaxNopLength = (!HasNopl || CPU == "slm") ? 7 : 15; } unsigned getNumFixupKinds() const override { @@ -200,6 +204,14 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) { case X86::ADD64ri8: return X86::ADD64ri32; case X86::ADD64mi8: return X86::ADD64mi32; + // ADC + case X86::ADC16ri8: return X86::ADC16ri; + case X86::ADC16mi8: return X86::ADC16mi; + case X86::ADC32ri8: return X86::ADC32ri; + case X86::ADC32mi8: return X86::ADC32mi; + case X86::ADC64ri8: return X86::ADC64ri32; + case X86::ADC64mi8: return X86::ADC64mi32; + // SUB case X86::SUB16ri8: return X86::SUB16ri; case X86::SUB16mi8: return X86::SUB16mi; @@ -208,6 +220,14 @@ static unsigned getRelaxedOpcodeArith(unsigned Op) { case X86::SUB64ri8: return X86::SUB64ri32; case X86::SUB64mi8: return X86::SUB64mi32; + // SBB + case X86::SBB16ri8: return X86::SBB16ri; + case X86::SBB16mi8: return X86::SBB16mi; + case X86::SBB32ri8: return X86::SBB32ri; + case X86::SBB32mi8: return X86::SBB32mi; + case X86::SBB64ri8: return X86::SBB64ri32; + case X86::SBB64mi8: return X86::SBB64mi32; + // CMP case X86::CMP16ri8: return X86::CMP16ri; case X86::CMP16mi8: return X86::CMP16mi; @@ -279,7 +299,7 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { /// bytes. /// \return - true on success, false on failure bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { - static const uint8_t Nops[10][10] = { + static const uint8_t TrueNops[10][10] = { // nop {0x90}, // xchg %ax,%ax @@ -302,17 +322,31 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { {0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}, }; - // This CPU doesn't support long nops. If needed add more. - // FIXME: Can we get this from the subtarget somehow? - // FIXME: We could generated something better than plain 0x90. - if (!HasNopl) { - for (uint64_t i = 0; i < Count; ++i) - OW->write8(0x90); - return true; - } + // Alternative nop instructions for CPUs which don't support long nops. + static const uint8_t AltNops[7][10] = { + // nop + {0x90}, + // xchg %ax,%ax + {0x66, 0x90}, + // lea 0x0(%esi),%esi + {0x8d, 0x76, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0x74, 0x26, 0x00}, + // nop + lea 0x0(%esi),%esi + {0x90, 0x8d, 0x74, 0x26, 0x00}, + // lea 0x0(%esi),%esi + {0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00 }, + // lea 0x0(%esi),%esi + {0x8d, 0xb4, 0x26, 0x00, 0x00, 0x00, 0x00}, + }; + + // Select the right NOP table. + // FIXME: Can we get if CPU supports long nops from the subtarget somehow? + const uint8_t (*Nops)[10] = HasNopl ? TrueNops : AltNops; + assert(HasNopl || MaxNopLength <= 7); - // 15 is the longest single nop instruction. Emit as many 15-byte nops as - // needed, then emit a nop of the remaining length. + // Emit as many largest nops as needed, then emit a nop of the remaining + // length. 
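
With the alternative table in place, the emission loop below only ever asks for nops of at most MaxNopLength bytes (7 without NOPL or on SLM, 15 otherwise). A minimal sketch of just that size-splitting, ignoring the 0x66-prefix handling the real loop also performs (illustrative names):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Split `Count` padding bytes into nop sizes no larger than MaxNopLength.
std::vector<uint64_t> nopChunks(uint64_t Count, uint64_t MaxNopLength) {
  std::vector<uint64_t> Sizes;
  while (Count != 0) {
    uint64_t This = std::min(Count, MaxNopLength);
    Sizes.push_back(This);
    Count -= This;
  }
  return Sizes;
}

int main() {
  for (uint64_t S : nopChunks(20, 15)) printf("%llu ", (unsigned long long)S); // 15 5
  printf("\n");
  for (uint64_t S : nopChunks(20, 7))  printf("%llu ", (unsigned long long)S); // 7 7 6
  printf("\n");
  return 0;
}
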
do { const uint8_t ThisNopLength = (uint8_t) std::min(Count, MaxNopLength); const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10; @@ -359,6 +393,17 @@ public: } }; +class ELFX86_IAMCUAsmBackend : public ELFX86AsmBackend { +public: + ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) + : ELFX86AsmBackend(T, OSABI, CPU) {} + + MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, + ELF::EM_IAMCU); + } +}; + class ELFX86_64AsmBackend : public ELFX86AsmBackend { public: ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) @@ -610,13 +655,13 @@ private: /// \brief Get the compact unwind number for a given register. The number /// corresponds to the enum lists in compact_unwind_encoding.h. int getCompactUnwindRegNum(unsigned Reg) const { - static const uint16_t CU32BitRegs[7] = { + static const MCPhysReg CU32BitRegs[7] = { X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 }; - static const uint16_t CU64BitRegs[] = { + static const MCPhysReg CU64BitRegs[] = { X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 }; - const uint16_t *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; + const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs; for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; @@ -780,6 +825,10 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, return new WindowsX86AsmBackend(T, false, CPU); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); + + if (TheTriple.isOSIAMCU()) + return new ELFX86_IAMCUAsmBackend(T, OSABI, CPU); + return new ELFX86_32AsmBackend(T, OSABI, CPU); } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index f0d00b0..9ff85b9 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -41,6 +41,16 @@ namespace X86 { /// AddrNumOperands - Total number of operands in a memory reference. AddrNumOperands = 5 }; + + /// AVX512 static rounding constants. These need to match the values in + /// avx512fintrin.h. + enum STATIC_ROUNDING { + TO_NEAREST_INT = 0, + TO_NEG_INF = 1, + TO_POS_INF = 2, + TO_ZERO = 3, + CUR_DIRECTION = 4 + }; } // end namespace X86; /// X86II - This namespace holds all of the target specific flags that @@ -675,7 +685,7 @@ namespace X86II { case X86II::RawFrmSrc: case X86II::RawFrmDst: case X86II::RawFrmDstSrc: - return -1; + return -1; case X86II::MRMDestMem: return 0; case X86II::MRMSrcMem: @@ -696,23 +706,27 @@ namespace X86II { // Start from 0, skip registers encoded in VEX_VVVV or a mask register. 
return 0 + HasVEX_4V + HasEVEX_K; case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2: - case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C8: + case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5: + case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8: case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB: + case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE: case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1: - case X86II::MRM_D4: case X86II::MRM_D5: case X86II::MRM_D6: - case X86II::MRM_D7: case X86II::MRM_D8: case X86II::MRM_D9: - case X86II::MRM_DA: case X86II::MRM_DB: case X86II::MRM_DC: - case X86II::MRM_DD: case X86II::MRM_DE: case X86II::MRM_DF: - case X86II::MRM_E0: case X86II::MRM_E1: case X86II::MRM_E2: - case X86II::MRM_E3: case X86II::MRM_E4: case X86II::MRM_E5: - case X86II::MRM_E8: case X86II::MRM_E9: case X86II::MRM_EA: - case X86II::MRM_EB: case X86II::MRM_EC: case X86II::MRM_ED: - case X86II::MRM_EE: case X86II::MRM_F0: case X86II::MRM_F1: - case X86II::MRM_F2: case X86II::MRM_F3: case X86II::MRM_F4: - case X86II::MRM_F5: case X86II::MRM_F6: case X86II::MRM_F7: - case X86II::MRM_F8: case X86II::MRM_F9: case X86II::MRM_FA: - case X86II::MRM_FB: case X86II::MRM_FC: case X86II::MRM_FD: - case X86II::MRM_FE: case X86II::MRM_FF: + case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4: + case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7: + case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA: + case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD: + case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0: + case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3: + case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6: + case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9: + case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC: + case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF: + case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2: + case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5: + case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8: + case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB: + case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE: + case X86II::MRM_FF: return -1; } } @@ -740,7 +754,7 @@ namespace X86II { case X86::R12B: case X86::R13B: case X86::R14B: case X86::R15B: case X86::CR8: case X86::CR9: case X86::CR10: case X86::CR11: case X86::CR12: case X86::CR13: case X86::CR14: case X86::CR15: - return true; + return true; } return false; } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index a33468d..736c39d 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -32,9 +32,11 @@ namespace { X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine) - : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine, - // Only i386 uses Rel instead of RelA. - /*HasRelocationAddend*/ EMachine != ELF::EM_386) {} + : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine, + // Only i386 and IAMCU use Rel instead of RelA. 
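
The IAMCU change here mirrors i386: both are 32-bit ELF targets that use REL relocations, where the addend is stored in the relocated field itself, while x86-64 uses RELA records that carry the addend explicitly. The record layouts that comment refers to, per the ELF specification (declared locally here just for illustration):

#include <cstdint>

struct Elf32_Rel {        // used by i386 / IAMCU objects
  uint32_t r_offset;      // where to apply the relocation
  uint32_t r_info;        // symbol index and relocation type
};                        // implicit addend: read from the patched word

struct Elf64_Rela {       // used by x86-64 objects
  uint64_t r_offset;
  uint64_t r_info;
  int64_t  r_addend;      // explicit addend carried in the record
};

static_assert(sizeof(Elf32_Rel) == 8, "REL entries are 8 bytes");
static_assert(sizeof(Elf64_Rela) == 24, "RELA entries are 24 bytes");
int main() { return 0; }
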
+ /*HasRelocationAddend*/ + (EMachine != ELF::EM_386) && + (EMachine != ELF::EM_IAMCU)) {} X86ELFObjectWriter::~X86ELFObjectWriter() {} @@ -246,7 +248,8 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target, if (getEMachine() == ELF::EM_X86_64) return getRelocType64(Modifier, Type, IsPCRel); - assert(getEMachine() == ELF::EM_386 && "Unsupported ELF machine type."); + assert((getEMachine() == ELF::EM_386 || getEMachine() == ELF::EM_IAMCU) && + "Unsupported ELF machine type."); return getRelocType32(Modifier, getType32(Type), IsPCRel); } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h index deaad2a..30d5c80 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h @@ -20,39 +20,42 @@ #include "llvm/MC/MCAsmInfoELF.h" namespace llvm { - class Triple; - - class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { - virtual void anchor(); - - public: - explicit X86MCAsmInfoDarwin(const Triple &Triple); - }; - - struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { - explicit X86_64MCAsmInfoDarwin(const Triple &Triple); - const MCExpr * - getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, - MCStreamer &Streamer) const override; - }; - - class X86ELFMCAsmInfo : public MCAsmInfoELF { - void anchor() override; - public: - explicit X86ELFMCAsmInfo(const Triple &Triple); - }; - - class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { - void anchor() override; - public: - explicit X86MCAsmInfoMicrosoft(const Triple &Triple); - }; - - class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { - void anchor() override; - public: - explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); - }; +class Triple; + +class X86MCAsmInfoDarwin : public MCAsmInfoDarwin { + virtual void anchor(); + +public: + explicit X86MCAsmInfoDarwin(const Triple &Triple); +}; + +struct X86_64MCAsmInfoDarwin : public X86MCAsmInfoDarwin { + explicit X86_64MCAsmInfoDarwin(const Triple &Triple); + const MCExpr * + getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding, + MCStreamer &Streamer) const override; +}; + +class X86ELFMCAsmInfo : public MCAsmInfoELF { + void anchor() override; + +public: + explicit X86ELFMCAsmInfo(const Triple &Triple); +}; + +class X86MCAsmInfoMicrosoft : public MCAsmInfoMicrosoft { + void anchor() override; + +public: + explicit X86MCAsmInfoMicrosoft(const Triple &Triple); +}; + +class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF { + void anchor() override; + +public: + explicit X86MCAsmInfoGNUCOFF(const Triple &Triple); +}; } // namespace llvm #endif diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 10c434c..dfab6ec 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -510,8 +510,8 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op, // Otherwise, emit the most general non-SIB encoding: [REG+disp32] EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS); - EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), CurByte, OS, - Fixups); + EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte), + CurByte, OS, Fixups); return; } @@ -988,6 +988,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, static unsigned DetermineREXPrefix(const MCInst 
&MI, uint64_t TSFlags, const MCInstrDesc &Desc) { unsigned REX = 0; + bool UsesHighByteReg = false; + if (TSFlags & X86II::REX_W) REX |= 1 << 3; // set REX.W @@ -1004,6 +1006,8 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, const MCOperand &MO = MI.getOperand(i); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); + if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) + UsesHighByteReg = true; if (!X86II::isX86_64NonExtLowByteReg(Reg)) continue; // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything // that returns non-zero. @@ -1073,6 +1077,9 @@ static unsigned DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags, } break; } + if (REX && UsesHighByteReg) + report_fatal_error("Cannot encode high byte register in REX-prefixed instruction"); + return REX; } diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 83b4091..53a6550 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -122,7 +122,8 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, } else if (TheTriple.isOSBinFormatELF()) { // Force the use of an ELF container. MAI = new X86ELFMCAsmInfo(TheTriple); - } else if (TheTriple.isWindowsMSVCEnvironment()) { + } else if (TheTriple.isWindowsMSVCEnvironment() || + TheTriple.isWindowsCoreCLREnvironment()) { MAI = new X86MCAsmInfoMicrosoft(TheTriple); } else if (TheTriple.isOSCygMing() || TheTriple.isWindowsItaniumEnvironment()) { @@ -267,3 +268,184 @@ extern "C" void LLVMInitializeX86TargetMC() { TargetRegistry::RegisterMCAsmBackend(TheX86_64Target, createX86_64AsmBackend); } + +unsigned llvm::getX86SubSuperRegisterOrZero(unsigned Reg, unsigned Size, + bool High) { + switch (Size) { + default: return 0; + case 8: + if (High) { + switch (Reg) { + default: return getX86SubSuperRegisterOrZero(Reg, 64); + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AH; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DH; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CH; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BH; + } + } else { + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AL; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DL; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CL; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BL; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SIL; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DIL; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BPL; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SPL; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8B; + case X86::R9B: case X86::R9W: case X86::R9D: case 
X86::R9: + return X86::R9B; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10B; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11B; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12B; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13B; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14B; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15B; + } + } + case 16: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8W; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9W; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10W; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11W; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12W; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13W; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14W; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15W; + } + case 32: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::EAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::EDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::ECX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::EBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::ESI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::EDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::EBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::ESP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8D; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9D; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10D; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11D; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12D; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13D; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14D; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15D; + } + case 64: + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::RAX; + case X86::DH: case X86::DL: case X86::DX: 
case X86::EDX: case X86::RDX: + return X86::RDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::RCX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::RBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::RSI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::RDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::RBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::RSP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15; + } + } +} + +unsigned llvm::getX86SubSuperRegister(unsigned Reg, unsigned Size, bool High) { + unsigned Res = getX86SubSuperRegisterOrZero(Reg, Size, High); + assert(Res != 0 && "Unexpected register or VT"); + return Res; +} + + diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index 6221bab..2d2836f 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -79,7 +79,7 @@ MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, /// Takes ownership of \p AB and \p CE. MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, raw_pwrite_stream &OS, MCCodeEmitter *CE, - bool RelaxAll); + bool RelaxAll, bool IncrementalLinkerCompatible); /// Construct an X86 Mach-O object writer. MCObjectWriter *createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, @@ -98,6 +98,17 @@ MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx); /// Construct X86-64 ELF relocation info. MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx); + +/// Returns the sub or super register of a specific X86 register. +/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX. +/// Aborts on error. +unsigned getX86SubSuperRegister(unsigned, unsigned, bool High=false); + +/// Returns the sub or super register of a specific X86 register. +/// Like getX86SubSuperRegister() but returns 0 on error. +unsigned getX86SubSuperRegisterOrZero(unsigned, unsigned, + bool High = false); + } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 9e801fc..191ebea 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -149,14 +149,19 @@ void X86MachObjectWriter::RecordX86_64Relocation( // Neither symbol can be modified. 
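
The new getX86SubSuperRegister helpers declared above map any GPR alias to the sub- or super-register of the requested width, with the OrZero variant failing softly. A hedged usage sketch, assuming it is built inside the LLVM tree where the X86 MCTargetDesc header and generated register enums are visible:

#include "MCTargetDesc/X86MCTargetDesc.h"   // in-tree include path (assumed)
#include <cassert>

void subSuperRegisterExamples() {
  using namespace llvm;
  assert(getX86SubSuperRegister(X86::EAX, 16) == X86::AX);             // narrow
  assert(getX86SubSuperRegister(X86::AX, 64) == X86::RAX);             // widen
  assert(getX86SubSuperRegister(X86::RAX, 8, /*High=*/true) == X86::AH);
  // The OrZero form returns 0 for registers the tables do not cover
  // instead of asserting.
  assert(getX86SubSuperRegisterOrZero(X86::XMM0, 32) == 0);
}
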
if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None || - Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported relocation of modified symbol", false); + Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of modified symbol"); + return; + } // We don't support PCrel relocations of differences. Darwin 'as' doesn't // implement most of these correctly. - if (IsPCRel) - report_fatal_error("unsupported pc-relative relocation of difference", - false); + if (IsPCRel) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported pc-relative relocation of difference"); + return; + } // The support for the situation where one or both of the symbols would // require a local relocation is handled just like if the symbols were @@ -168,16 +173,20 @@ void X86MachObjectWriter::RecordX86_64Relocation( // Darwin 'as' doesn't emit correct relocations for this (it ends up with a // single SIGNED relocation); reject it for now. Except the case where both // symbols don't have a base, equal but both NULL. - if (A_Base == B_Base && A_Base) - report_fatal_error("unsupported relocation with identical base", false); + if (A_Base == B_Base && A_Base) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation with identical base"); + return; + } // A subtraction expression where either symbol is undefined is a // non-relocatable expression. if (A->isUndefined() || B->isUndefined()) { StringRef Name = A->isUndefined() ? A->getName() : B->getName(); - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), "unsupported relocation with subtraction expression, symbol '" + Name + "' can not be undefined in a subtraction expression"); + return; } Value += Writer->getSymbolAddress(*A, Layout) - @@ -244,12 +253,16 @@ void X86MachObjectWriter::RecordX86_64Relocation( FixedValue = Res; return; } else { - report_fatal_error("unsupported relocation of variable '" + - Symbol->getName() + "'", false); + Asm.getContext().reportError(Fixup.getLoc(), + "unsupported relocation of variable '" + + Symbol->getName() + "'"); + return; } } else { - report_fatal_error("unsupported relocation of undefined symbol '" + - Symbol->getName() + "'", false); + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported relocation of undefined symbol '" + + Symbol->getName() + "'"); + return; } MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind(); @@ -266,8 +279,9 @@ void X86MachObjectWriter::RecordX86_64Relocation( } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { Type = MachO::X86_64_RELOC_TLV; } else if (Modifier != MCSymbolRefExpr::VK_None) { - report_fatal_error("unsupported symbol modifier in relocation", - false); + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; } else { Type = MachO::X86_64_RELOC_SIGNED; @@ -292,9 +306,12 @@ void X86MachObjectWriter::RecordX86_64Relocation( } } } else { - if (Modifier != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported symbol modifier in branch " - "relocation", false); + if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), + "unsupported symbol modifier in branch relocation"); + return; + } Type = MachO::X86_64_RELOC_BRANCH; } @@ -309,16 +326,22 @@ void X86MachObjectWriter::RecordX86_64Relocation( Type = MachO::X86_64_RELOC_GOT; IsPCRel = 1; } else if (Modifier == MCSymbolRefExpr::VK_TLVP) { - 
report_fatal_error("TLVP symbol modifier should have been rip-rel", - false); - } else if (Modifier != MCSymbolRefExpr::VK_None) - report_fatal_error("unsupported symbol modifier in relocation", false); - else { + Asm.getContext().reportError( + Fixup.getLoc(), "TLVP symbol modifier should have been rip-rel"); + return; + } else if (Modifier != MCSymbolRefExpr::VK_None) { + Asm.getContext().reportError( + Fixup.getLoc(), "unsupported symbol modifier in relocation"); + return; + } else { Type = MachO::X86_64_RELOC_UNSIGNED; unsigned Kind = Fixup.getKind(); - if (Kind == X86::reloc_signed_4byte) - report_fatal_error("32-bit absolute addressing is not supported in " - "64-bit mode", false); + if (Kind == X86::reloc_signed_4byte) { + Asm.getContext().reportError( + Fixup.getLoc(), + "32-bit absolute addressing is not supported in 64-bit mode"); + return; + } } } } @@ -350,10 +373,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, // See <reloc.h>. const MCSymbol *A = &Target.getSymA()->getSymbol(); - if (!A->getFragment()) - report_fatal_error("symbol '" + A->getName() + - "' can not be undefined in a subtraction expression", - false); + if (!A->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + A->getName() + + "' can not be undefined in a subtraction expression"); + return false; + } uint32_t Value = Writer->getSymbolAddress(*A, Layout); uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent()); @@ -363,10 +389,13 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, if (const MCSymbolRefExpr *B = Target.getSymB()) { const MCSymbol *SB = &B->getSymbol(); - if (!SB->getFragment()) - report_fatal_error("symbol '" + B->getSymbol().getName() + - "' can not be undefined in a subtraction expression", - false); + if (!SB->getFragment()) { + Asm.getContext().reportError( + Fixup.getLoc(), + "symbol '" + B->getSymbol().getName() + + "' can not be undefined in a subtraction expression"); + return false; + } // Select the appropriate difference relocation type. 
// @@ -387,12 +416,12 @@ bool X86MachObjectWriter::recordScatteredRelocation(MachObjectWriter *Writer, if (FixupOffset > 0xffffff) { char Buffer[32]; format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer)); - Asm.getContext().reportFatalError(Fixup.getLoc(), + Asm.getContext().reportError(Fixup.getLoc(), Twine("Section too large, can't encode " "r_address (") + Buffer + ") into 24 bits of scattered " "relocation entry."); - llvm_unreachable("fatal error returned?!"); + return false; } MachO::any_relocation_info MRE; diff --git a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index 92f42b6..d045118 100644 --- a/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/contrib/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -50,9 +50,11 @@ void X86WinCOFFStreamer::FinishImpl() { MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, raw_pwrite_stream &OS, - MCCodeEmitter *CE, bool RelaxAll) { + MCCodeEmitter *CE, bool RelaxAll, + bool IncrementalLinkerCompatible) { X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS); S->getAssembler().setRelaxAll(RelaxAll); + S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); return S; } diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp index cae865a..4fdd527 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -140,13 +140,14 @@ void DecodePALIGNRMask(MVT VT, unsigned Imm, } } -/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*. +/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*. /// VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0) NumLanes = 1; // Handle MMX unsigned NumLaneElts = NumElts / NumLanes; unsigned NewImm = Imm; @@ -191,6 +192,16 @@ void DecodePSHUFLWMask(MVT VT, unsigned Imm, } } +void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumHalfElts = NumElts / 2; + + for (unsigned l = 0; l != NumHalfElts; ++l) + ShuffleMask.push_back(l + NumHalfElts); + for (unsigned h = 0; h != NumHalfElts; ++h) + ShuffleMask.push_back(h); +} + /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. @@ -222,7 +233,7 @@ void DecodeUNPCKHMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate // independently on 128-bit lanes. unsigned NumLanes = VT.getSizeInBits() / 128; - if (NumLanes == 0 ) NumLanes = 1; // Handle MMX + if (NumLanes == 0) NumLanes = 1; // Handle MMX unsigned NumLaneElts = NumElts / NumLanes; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { @@ -253,6 +264,26 @@ void DecodeUNPCKLMask(MVT VT, SmallVectorImpl<int> &ShuffleMask) { } } +/// \brief Decode a shuffle packed values at 128-bit granularity +/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) +/// immediate mask into a shuffle mask. 
+void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask) { + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits(); + unsigned ControlBitsMask = NumLanes - 1; + unsigned NumControlBits = NumLanes / 2; + + for (unsigned l = 0; l != NumLanes; ++l) { + unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask; + // We actually need the other source. + if (l >= NumLanes / 2) + LaneMask += NumLanes; + for (unsigned i = 0; i != NumElementsInLane; ++i) + ShuffleMask.push_back(LaneMask * NumElementsInLane + i); + } +} + void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { unsigned HalfSize = VT.getVectorNumElements() / 2; @@ -277,10 +308,10 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { // <4 x i32> <i32 -2147483648, i32 -2147483648, // i32 -2147483648, i32 -2147483648> +#ifndef NDEBUG unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); - - if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. - return; + assert(MaskTySize == 128 || MaskTySize == 256 || MaskTySize == 512); +#endif // This is a straightforward byte vector. if (MaskTy->isVectorTy() && MaskTy->getVectorElementType()->isIntegerTy(8)) { @@ -290,7 +321,7 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { for (int i = 0; i < NumElements; ++i) { // For AVX vectors with 32 bytes the base of the shuffle is the 16-byte // lane of the vector we're inside. - int Base = i < 16 ? 0 : 16; + int Base = i & ~0xf; Constant *COp = C->getAggregateElement(i); if (!COp) { ShuffleMask.clear(); @@ -357,44 +388,66 @@ void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask) { } } -void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) { +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask) { Type *MaskTy = C->getType(); - assert(MaskTy->isVectorTy() && "Expected a vector constant mask!"); - assert(MaskTy->getVectorElementType()->isIntegerTy() && - "Expected integer constant mask elements!"); - int ElementBits = MaskTy->getScalarSizeInBits(); - int NumElements = MaskTy->getVectorNumElements(); + // It is not an error for the PSHUFB mask to not be a vector of i8 because the + // constant pool uniques constants by their bit representation. + // e.g. the following take up the same space in the constant pool: + // i128 -170141183420855150465331762880109871104 + // + // <2 x i64> <i64 -9223372034707292160, i64 -9223372034707292160> + // + // <4 x i32> <i32 -2147483648, i32 -2147483648, + // i32 -2147483648, i32 -2147483648> + + unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits(); + + if (MaskTySize != 128 && MaskTySize != 256) // FIXME: Add support for AVX-512. + return; + + // Only support vector types. + if (!MaskTy->isVectorTy()) + return; + + // Make sure its an integer type. + Type *VecEltTy = MaskTy->getVectorElementType(); + if (!VecEltTy->isIntegerTy()) + return; + + // Support any element type from byte up to element size. + // This is necesary primarily because 64-bit elements get split to 32-bit + // in the constant pool on 32-bit target. 
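
decodeVSHUF64x2FamilyMask, added above, turns the SHUF{F,I}{32x4,64x2} immediate into a lane-granular mask: each destination 128-bit lane selects a source lane, and destination lanes in the upper half read from the second source. A standalone restatement with a worked 512-bit example (illustrative names):

#include <cstdio>
#include <vector>

std::vector<int> vshuf64x2StyleMask(unsigned VecBits, unsigned EltBits,
                                    unsigned Imm) {
  unsigned NumLanes = VecBits / 128;
  unsigned EltsPerLane = 128 / EltBits;
  unsigned SelBits = NumLanes / 2;   // 1 selector bit at 256 bits, 2 at 512
  unsigned SelMask = NumLanes - 1;
  std::vector<int> Mask;
  for (unsigned l = 0; l != NumLanes; ++l) {
    unsigned Lane = (Imm >> (l * SelBits)) & SelMask;
    if (l >= NumLanes / 2)
      Lane += NumLanes;              // upper destination lanes use source 2
    for (unsigned i = 0; i != EltsPerLane; ++i)
      Mask.push_back(Lane * EltsPerLane + i);
  }
  return Mask;
}

int main() {
  // 512-bit, 64-bit elements, Imm = 0x4e: the low half of the result takes
  // lanes 2 and 3 of the first source, the high half takes lanes 0 and 1 of
  // the second source -> mask [4,5,6,7, 8,9,10,11].
  for (int M : vshuf64x2StyleMask(512, 64, 0x4e)) printf("%d ", M);
  printf("\n");
  return 0;
}
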
+ unsigned EltTySize = VecEltTy->getIntegerBitWidth(); + if (EltTySize < 8 || EltTySize > ElSize) + return; + + unsigned NumElements = MaskTySize / ElSize; assert((NumElements == 2 || NumElements == 4 || NumElements == 8) && "Unexpected number of vector elements."); ShuffleMask.reserve(NumElements); - if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { - assert((unsigned)NumElements == CDS->getNumElements() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - int Base = (i * ElementBits / 128) * (128 / ElementBits); - uint64_t Element = CDS->getElementAsInteger(i); - // Only the least significant 2 bits of the integer are used. - int Index = Base + (Element & 0x3); - ShuffleMask.push_back(Index); - } - } else if (auto *CV = dyn_cast<ConstantVector>(C)) { - assert((unsigned)NumElements == C->getNumOperands() && - "Constant mask has a different number of elements!"); - - for (int i = 0; i < NumElements; ++i) { - int Base = (i * ElementBits / 128) * (128 / ElementBits); - Constant *COp = CV->getOperand(i); - if (isa<UndefValue>(COp)) { - ShuffleMask.push_back(SM_SentinelUndef); - continue; - } - uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); - // Only the least significant 2 bits of the integer are used. - int Index = Base + (Element & 0x3); - ShuffleMask.push_back(Index); + unsigned NumElementsPerLane = 128 / ElSize; + unsigned Factor = ElSize / EltTySize; + + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i * Factor); + if (!COp) { + ShuffleMask.clear(); + return; + } else if (isa<UndefValue>(COp)) { + ShuffleMask.push_back(SM_SentinelUndef); + continue; } + int Index = i & ~(NumElementsPerLane - 1); + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + if (ElSize == 64) + Index += (Element >> 1) & 0x1; + else + Index += Element & 0x3; + ShuffleMask.push_back(Index); } + + // TODO: Handle funny-looking vectors too. 
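
The rewritten DecodeVPERMILPMask above reads the constant at whatever element width it was uniqued to and keeps only the selector bits that matter: two bits for 32-bit elements, the second bit for 64-bit elements, always clamped to the element's own 128-bit lane. A worked sketch of the 32-bit (VPERMILPS) case (illustrative names):

#include <cstdio>
#include <vector>

// Each element may only move within its own 128-bit lane, so only the low
// two bits of each selector are used and the lane base is added back.
std::vector<int> vpermilpsStyleMask(const std::vector<unsigned> &Selectors) {
  const unsigned EltsPerLane = 4;  // 32-bit elements per 128-bit lane
  std::vector<int> Mask;
  for (unsigned i = 0; i != Selectors.size(); ++i) {
    unsigned LaneBase = i & ~(EltsPerLane - 1);
    Mask.push_back(LaneBase + (Selectors[i] & 0x3));
  }
  return Mask;
}

int main() {
  // 256-bit vpermilps with constant <8 x i32> <3,2,1,0,7,6,5,4>: selectors
  // wrap inside each lane -> mask [3,2,1,0, 7,6,5,4].
  for (int M : vpermilpsStyleMask({3, 2, 1, 0, 7, 6, 5, 4})) printf("%d ", M);
  printf("\n");
  return 0;
}
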
} void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, SmallVectorImpl<int> &Mask) { @@ -503,4 +556,74 @@ void DecodeINSERTQIMask(int Len, int Idx, ShuffleMask.push_back(SM_SentinelUndef); } +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + ShuffleMask.push_back((int)M); + } +} + +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask) { + for (int i = 0, e = RawMask.size(); i < e; ++i) { + uint64_t M = RawMask[i]; + ShuffleMask.push_back((int)M); + } +} + +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + if (MaskTy->isVectorTy()) { + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp))) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements) - 1; + ShuffleMask.push_back(Element); + } + } + } + return; + } + // Scalar value; just broadcast it + if (!isa<ConstantInt>(C)) + return; + uint64_t Element = cast<ConstantInt>(C)->getZExtValue(); + int NumElements = VT.getVectorNumElements(); + Element &= (1 << NumElements) - 1; + for (int i = 0; i < NumElements; ++i) + ShuffleMask.push_back(Element); +} + +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask) { + Type *MaskTy = C->getType(); + unsigned NumElements = MaskTy->getVectorNumElements(); + if (NumElements == VT.getVectorNumElements()) { + for (unsigned i = 0; i < NumElements; ++i) { + Constant *COp = C->getAggregateElement(i); + if (!COp) { + ShuffleMask.clear(); + return; + } + if (isa<UndefValue>(COp)) + ShuffleMask.push_back(SM_SentinelUndef); + else { + uint64_t Element = cast<ConstantInt>(COp)->getZExtValue(); + Element &= (1 << NumElements*2) - 1; + ShuffleMask.push_back(Element); + } + } + } +} } // llvm namespace diff --git a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h index 3d10d18..ab18e64 100644 --- a/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/contrib/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -54,6 +54,9 @@ void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodePSHUFLWMask(MVT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decodes a PSWAPD 3DNow! instruction. +void DecodePSWAPMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); + /// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates /// the type of the vector allowing it to handle different datatypes and vector /// widths. @@ -83,12 +86,18 @@ void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); void DecodeVPERM2X128Mask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask); +/// \brief Decode a shuffle packed values at 128-bit granularity +/// immediate mask into a shuffle mask. +void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm, + SmallVectorImpl<int> &ShuffleMask); + /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD. /// No VT provided since it only works on 256-bit, 4 element vectors. 
void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask); /// \brief Decode a VPERMILP variable mask from an IR-level vector constant. -void DecodeVPERMILPMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask); +void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, + SmallVectorImpl<int> &ShuffleMask); /// \brief Decode a zero extension instruction as a shuffle mask. void DecodeZeroExtendMask(MVT SrcVT, MVT DstVT, @@ -108,6 +117,22 @@ void DecodeEXTRQIMask(int Len, int Idx, /// \brief Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask. void DecodeINSERTQIMask(int Len, int Idx, SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERM W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMVMask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMVMask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from an IR-level vector constant. +void DecodeVPERMV3Mask(const Constant *C, MVT VT, + SmallVectorImpl<int> &ShuffleMask); + +/// \brief Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants. +void DecodeVPERMV3Mask(ArrayRef<uint64_t> RawMask, + SmallVectorImpl<int> &ShuffleMask); } // llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h index 8403ae6..fbec662 100644 --- a/contrib/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -23,56 +23,47 @@ class FunctionPass; class ImmutablePass; class X86TargetMachine; -/// createX86ISelDag - This pass converts a legalized DAG into a -/// X86-specific DAG, ready for instruction scheduling. -/// +/// This pass converts a legalized DAG into a X86-specific DAG, ready for +/// instruction scheduling. FunctionPass *createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel); -/// createX86GlobalBaseRegPass - This pass initializes a global base -/// register for PIC on x86-32. +/// This pass initializes a global base register for PIC on x86-32. FunctionPass* createX86GlobalBaseRegPass(); -/// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses -/// to local-dynamic TLS variables so that the TLS base address for the module -/// is only fetched once per execution path through the function. +/// This pass combines multiple accesses to local-dynamic TLS variables so that +/// the TLS base address for the module is only fetched once per execution path +/// through the function. FunctionPass *createCleanupLocalDynamicTLSPass(); -/// createX86FloatingPointStackifierPass - This function returns a pass which -/// converts floating point register references and pseudo instructions into -/// floating point stack references and physical instructions. -/// +/// This function returns a pass which converts floating-point register +/// references and pseudo instructions into floating-point stack references and +/// physical instructions. FunctionPass *createX86FloatingPointStackifierPass(); -/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions -/// before each call to avoid transition penalty between functions encoded with -/// AVX and SSE. +/// This pass inserts AVX vzeroupper instructions before each call to avoid +/// transition penalty between functions encoded with AVX and SSE. 
FunctionPass *createX86IssueVZeroUpperPass(); -/// createX86EmitCodeToMemory - Returns a pass that converts a register -/// allocated function into raw machine code in a dynamically -/// allocated chunk of memory. -/// -FunctionPass *createEmitX86CodeToMemory(); - -/// createX86PadShortFunctions - Return a pass that pads short functions -/// with NOOPs. This will prevent a stall when returning on the Atom. +/// Return a pass that pads short functions with NOOPs. +/// This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); -/// createX86FixupLEAs - Return a a pass that selectively replaces -/// certain instructions (like add, sub, inc, dec, some shifts, -/// and some multiplies) by equivalent LEA instructions, in order -/// to eliminate execution delays in some Atom processors. + +/// Return a a pass that selectively replaces certain instructions (like add, +/// sub, inc, dec, some shifts, and some multiplies) by equivalent LEA +/// instructions, in order to eliminate execution delays in some processors. FunctionPass *createX86FixupLEAs(); -/// createX86CallFrameOptimization - Return a pass that optimizes -/// the code-size of x86 call sequences. This is done by replacing -/// esp-relative movs with pushes. +/// Return a pass that removes redundant address recalculations. +FunctionPass *createX86OptimizeLEAs(); + +/// Return a pass that optimizes the code-size of x86 call sequences. This is +/// done by replacing esp-relative movs with pushes. FunctionPass *createX86CallFrameOptimization(); -/// createX86WinEHStatePass - Return an IR pass that inserts EH registration -/// stack objects and explicit EH state updates. This pass must run after EH -/// preparation, which does Windows-specific but architecture-neutral -/// preparation. +/// Return an IR pass that inserts EH registration stack objects and explicit +/// EH state updates. This pass must run after EH preparation, which does +/// Windows-specific but architecture-neutral preparation. FunctionPass *createX86WinEHStatePass(); /// Return a Machine IR pass that expands X86-specific pseudo diff --git a/contrib/llvm/lib/Target/X86/X86.td b/contrib/llvm/lib/Target/X86/X86.td index 8522674..8902a85 100644 --- a/contrib/llvm/lib/Target/X86/X86.td +++ b/contrib/llvm/lib/Target/X86/X86.td @@ -37,14 +37,26 @@ def FeatureCMOV : SubtargetFeature<"cmov","HasCMov", "true", def FeaturePOPCNT : SubtargetFeature<"popcnt", "HasPOPCNT", "true", "Support POPCNT instruction">; +def FeatureFXSR : SubtargetFeature<"fxsr", "HasFXSR", "true", + "Support fxsave/fxrestore instructions">; + +def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true", + "Support xsave instructions">; + +def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true", + "Support xsaveopt instructions">; + +def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true", + "Support xsavec instructions">; + +def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true", + "Support xsaves instructions">; -def FeatureMMX : SubtargetFeature<"mmx","X86SSELevel", "MMX", - "Enable MMX instructions">; def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", "Enable SSE instructions", // SSE codegen depends on cmovs, and all // SSE1+ processors support them. 
- [FeatureMMX, FeatureCMOV]>; + [FeatureCMOV]>; def FeatureSSE2 : SubtargetFeature<"sse2", "X86SSELevel", "SSE2", "Enable SSE2 instructions", [FeatureSSE1]>; @@ -60,6 +72,11 @@ def FeatureSSE41 : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41", def FeatureSSE42 : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42", "Enable SSE 4.2 instructions", [FeatureSSE41]>; +// The MMX subtarget feature is separate from the rest of the SSE features +// because it's important (for odd compatibility reasons) to be able to +// turn it off explicitly while allowing SSE+ to be on. +def FeatureMMX : SubtargetFeature<"mmx","X863DNowLevel", "MMX", + "Enable MMX instructions">; def Feature3DNow : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow", "Enable 3DNow! instructions", [FeatureMMX]>; @@ -79,16 +96,13 @@ def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", "Bit testing of memory is slow">; def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; -// FIXME: This is a 16-byte (SSE/AVX) feature; we should rename it to make that -// explicit. Also, it seems this would be the default state for most chips -// going forward, so it would probably be better to negate the logic and -// match the 32-byte "slow mem" feature below. -def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem", - "IsUAMemFast", "true", - "Fast unaligned memory access">; +// FIXME: This should not apply to CPUs that do not have SSE. +def FeatureSlowUAMem16 : SubtargetFeature<"slow-unaligned-mem-16", + "IsUAMem16Slow", "true", + "Slow unaligned 16-byte memory access">; def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32", - "IsUAMem32Slow", "true", - "Slow unaligned 32-byte memory access">; + "IsUAMem32Slow", "true", + "Slow unaligned 32-byte memory access">; def FeatureSSE4A : SubtargetFeature<"sse4a", "HasSSE4A", "true", "Support SSE 4a instructions", [FeatureSSE3]>; @@ -120,6 +134,8 @@ def FeatureBWI : SubtargetFeature<"avx512bw", "HasBWI", "true", def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true", "Enable AVX-512 Vector Length eXtensions", [FeatureAVX512]>; +def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true", + "Enable protection keys">; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; @@ -168,9 +184,11 @@ def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", "Support RDSEED instruction">; +def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true", + "Support LAHF and SAHF instructions">; def FeatureMPX : SubtargetFeature<"mpx", "HasMPX", "true", "Support MPX instructions">; -def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", +def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true", "Use LEA for adjusting the stack pointer">; def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb", "HasSlowDivide32", "true", @@ -181,6 +199,11 @@ def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divw", def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions", "PadShortFunctions", "true", "Pad short functions">; +// TODO: This feature ought to be renamed. +// What it really refers to are CPUs for which certain instructions +// (which ones besides the example below?) are microcoded. 
+// The best examples of this are the memory forms of CALL and PUSH +// instructions, which should be avoided in favor of a MOV + register CALL/PUSH. def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect", "CallRegIndirect", "true", "Call register indirect">; @@ -208,278 +231,473 @@ def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM", class Proc<string Name, list<SubtargetFeature> Features> : ProcessorModel<Name, GenericModel, Features>; -def : Proc<"generic", []>; -def : Proc<"i386", []>; -def : Proc<"i486", []>; -def : Proc<"i586", []>; -def : Proc<"pentium", []>; -def : Proc<"pentium-mmx", [FeatureMMX]>; -def : Proc<"i686", []>; -def : Proc<"pentiumpro", [FeatureCMOV]>; -def : Proc<"pentium2", [FeatureMMX, FeatureCMOV]>; -def : Proc<"pentium3", [FeatureSSE1]>; -def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; -def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; -def : Proc<"pentium4", [FeatureSSE2]>; -def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; +def : Proc<"generic", [FeatureSlowUAMem16]>; +def : Proc<"i386", [FeatureSlowUAMem16]>; +def : Proc<"i486", [FeatureSlowUAMem16]>; +def : Proc<"i586", [FeatureSlowUAMem16]>; +def : Proc<"pentium", [FeatureSlowUAMem16]>; +def : Proc<"pentium-mmx", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"i686", [FeatureSlowUAMem16]>; +def : Proc<"pentiumpro", [FeatureSlowUAMem16, FeatureCMOV]>; +def : Proc<"pentium2", [FeatureSlowUAMem16, FeatureMMX, FeatureCMOV, + FeatureFXSR]>; +def : Proc<"pentium3", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR]>; +def : Proc<"pentium3m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium-m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureSlowBTMem]>; +def : Proc<"pentium4", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR]>; +def : Proc<"pentium4m", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, + FeatureFXSR, FeatureSlowBTMem]>; // Intel Core Duo. def : ProcessorModel<"yonah", SandyBridgeModel, - [FeatureSSE3, FeatureSlowBTMem]>; + [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, + FeatureSlowBTMem]>; // NetBurst. -def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; -def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; +def : Proc<"prescott", + [FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, + FeatureSlowBTMem]>; +def : Proc<"nocona", [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem +]>; // Intel Core 2 Solo/Duo. -def : ProcessorModel<"core2", SandyBridgeModel, - [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; -def : ProcessorModel<"penryn", SandyBridgeModel, - [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; +def : ProcessorModel<"core2", SandyBridgeModel, [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; +def : ProcessorModel<"penryn", SandyBridgeModel, [ + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSE41, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; // Atom CPUs. 
class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [ - ProcIntelAtom, - FeatureSSSE3, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeatureSlowBTMem, - FeatureLeaForSP, - FeatureSlowDivide32, - FeatureSlowDivide64, - FeatureCallRegIndirect, - FeatureLEAUsesAG, - FeaturePadShortFunctions - ]>; + ProcIntelAtom, + FeatureSlowUAMem16, + FeatureMMX, + FeatureSSSE3, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeatureSlowBTMem, + FeatureLEAForSP, + FeatureSlowDivide32, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeatureLEAUsesAG, + FeaturePadShortFunctions, + FeatureLAHFSAHF +]>; def : BonnellProc<"bonnell">; def : BonnellProc<"atom">; // Pin the generic name to the baseline. class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ - ProcIntelSLM, - FeatureSSE42, - FeatureCMPXCHG16B, - FeatureMOVBE, - FeaturePOPCNT, - FeaturePCLMUL, - FeatureAES, - FeatureSlowDivide64, - FeatureCallRegIndirect, - FeaturePRFCHW, - FeatureSlowLEA, - FeatureSlowIncDec, - FeatureSlowBTMem, - FeatureFastUAMem - ]>; + ProcIntelSLM, + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureMOVBE, + FeaturePOPCNT, + FeaturePCLMUL, + FeatureAES, + FeatureSlowDivide64, + FeatureCallRegIndirect, + FeaturePRFCHW, + FeatureSlowLEA, + FeatureSlowIncDec, + FeatureSlowBTMem, + FeatureLAHFSAHF +]>; def : SilvermontProc<"silvermont">; def : SilvermontProc<"slm">; // Legacy alias. // "Arrandale" along with corei3 and corei5 class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureSSE42, - FeatureCMPXCHG16B, - FeatureSlowBTMem, - FeatureFastUAMem, - FeaturePOPCNT - ]>; + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureLAHFSAHF +]>; def : NehalemProc<"nehalem">; def : NehalemProc<"corei7">; // Westmere is a similar machine to nehalem with some additional features. // Westmere is the corei3/i5/i7 path from nehalem to sandybridge class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureSSE42, - FeatureCMPXCHG16B, - FeatureSlowBTMem, - FeatureFastUAMem, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL - ]>; + FeatureMMX, + FeatureSSE42, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureLAHFSAHF +]>; def : WestmereProc<"westmere">; // SSE is not listed here since llvm treats AVX as a reimplementation of SSE, // rather than a superset. class SandyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureAVX, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeatureSlowUAMem32, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL - ]>; + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureLAHFSAHF +]>; def : SandyBridgeProc<"sandybridge">; def : SandyBridgeProc<"corei7-avx">; // Legacy alias. 
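The per-CPU feature lists being rewritten above only take effect because TableGen lowers each SubtargetFeature into a boolean member of X86Subtarget with a generated predicate accessor, and the C++ hunks later in this diff (X86FastISel, X86CallFrameOptimization, FixupLEAs) branch on those accessors. The following is a minimal illustrative sketch of that pattern, not part of the commit; the helper names are invented, and only the accessors, opcodes, and headers it uses appear elsewhere in this diff.

// Sketch only: shows how subtarget feature bits defined in X86.td are
// consumed from C++ code. Helper names are hypothetical.
#include "MCTargetDesc/X86MCTargetDesc.h" // X86:: opcode enums
#include "X86Subtarget.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Pick a scalar f32 store opcode the way X86FastEmitStore does later in this
// diff: use the SSE4A non-temporal form only when the subtarget advertises
// it, and otherwise prefer the AVX encoding over plain SSE.
static unsigned pickF32StoreOpc(const X86Subtarget &STI, bool IsNonTemporal) {
  if (IsNonTemporal && STI.hasSSE4A())
    return X86::MOVNTSS;
  return STI.hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr;
}

// Decide whether the LEA -> INC/DEC rewrite from FixupLEAs should run,
// mirroring the OptIncDec condition added by this patch: either INC/DEC are
// not slow on this CPU, or the function is optimized for minimum size.
static bool wantIncDecRewrite(const X86Subtarget &STI, const Function &F) {
  return !STI.slowIncDec() || F.optForMinSize();
}
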
class IvyBridgeProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ - FeatureAVX, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeatureSlowUAMem32, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase - ]>; + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeatureSlowUAMem32, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; def : IvyBridgeProc<"ivybridge">; def : IvyBridgeProc<"core-avx-i">; // Legacy alias. class HaswellProc<string Name> : ProcessorModel<Name, HaswellModel, [ - FeatureAVX2, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase, - FeatureMOVBE, - FeatureLZCNT, - FeatureBMI, - FeatureBMI2, - FeatureFMA, - FeatureRTM, - FeatureHLE, - FeatureSlowIncDec - ]>; + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureRDRAND, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec, + FeatureLAHFSAHF +]>; def : HaswellProc<"haswell">; def : HaswellProc<"core-avx2">; // Legacy alias. class BroadwellProc<string Name> : ProcessorModel<Name, HaswellModel, [ - FeatureAVX2, - FeatureCMPXCHG16B, - FeatureFastUAMem, - FeaturePOPCNT, - FeatureAES, - FeaturePCLMUL, - FeatureRDRAND, - FeatureF16C, - FeatureFSGSBase, - FeatureMOVBE, - FeatureLZCNT, - FeatureBMI, - FeatureBMI2, - FeatureFMA, - FeatureRTM, - FeatureHLE, - FeatureADX, - FeatureRDSEED, - FeatureSlowIncDec - ]>; + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSlowIncDec, + FeatureLAHFSAHF +]>; def : BroadwellProc<"broadwell">; // FIXME: define KNL model -class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, - [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI, - FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, - FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, - FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, - FeatureSlowIncDec, FeatureMPX]>; +class KnightsLandingProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX512, + FeatureFXSR, + FeatureERI, + FeatureCDI, + FeaturePFI, + FeatureCMPXCHG16B, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureSlowIncDec, + FeatureMPX, + FeatureLAHFSAHF +]>; def : KnightsLandingProc<"knl">; // FIXME: define SKX model -class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, - [FeatureAVX512, FeatureCDI, - FeatureDQI, FeatureBWI, FeatureVLX, - FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT, - FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C, - FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI, - FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE, - 
FeatureSlowIncDec, FeatureMPX]>; +class SkylakeProc<string Name> : ProcessorModel<Name, HaswellModel, [ + FeatureMMX, + FeatureAVX512, + FeatureFXSR, + FeatureCDI, + FeatureDQI, + FeatureBWI, + FeatureVLX, + FeaturePKU, + FeatureCMPXCHG16B, + FeatureSlowBTMem, + FeaturePOPCNT, + FeatureAES, + FeaturePCLMUL, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureRDRAND, + FeatureF16C, + FeatureFSGSBase, + FeatureMOVBE, + FeatureLZCNT, + FeatureBMI, + FeatureBMI2, + FeatureFMA, + FeatureRTM, + FeatureHLE, + FeatureADX, + FeatureRDSEED, + FeatureSlowIncDec, + FeatureMPX, + FeatureXSAVEC, + FeatureXSAVES, + FeatureLAHFSAHF +]>; def : SkylakeProc<"skylake">; def : SkylakeProc<"skx">; // Legacy alias. // AMD CPUs. -def : Proc<"k6", [FeatureMMX]>; -def : Proc<"k6-2", [Feature3DNow]>; -def : Proc<"k6-3", [Feature3DNow]>; -def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem, +def : Proc<"k6", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"k6-2", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"k6-3", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"athlon", [FeatureSlowUAMem16, Feature3DNowA, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-tbird", [FeatureSlowUAMem16, Feature3DNowA, + FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-4", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-xp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"athlon-mp", [FeatureSlowUAMem16, FeatureSSE1, Feature3DNowA, + FeatureFXSR, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"k8", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem, +def : Proc<"opteron", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"athlon64", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"athlon-fx", [FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem, +def : Proc<"k8-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"amdfam10", [FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowBTMem, +def : Proc<"opteron-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + 
FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"barcelona", [FeatureSSE4A, - Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowBTMem, +def : Proc<"athlon64-sse3", [FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowBTMem, FeatureSlowSHLD]>; +def : Proc<"amdfam10", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; +def : Proc<"barcelona", [FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowBTMem, FeatureSlowSHLD, FeatureLAHFSAHF]>; + // Bobcat -def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B, - FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT, - FeatureSlowSHLD]>; +def : Proc<"btver1", [ + FeatureMMX, + FeatureSSSE3, + FeatureSSE4A, + FeatureFXSR, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Jaguar -def : ProcessorModel<"btver2", BtVer2Model, - [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B, - FeaturePRFCHW, FeatureAES, FeaturePCLMUL, - FeatureBMI, FeatureF16C, FeatureMOVBE, - FeatureLZCNT, FeaturePOPCNT, FeatureFastUAMem, - FeatureSlowSHLD]>; - -// TODO: We should probably add 'FeatureFastUAMem' to all of the AMD chips. +def : ProcessorModel<"btver2", BtVer2Model, [ + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureCMPXCHG16B, + FeaturePRFCHW, + FeatureAES, + FeaturePCLMUL, + FeatureBMI, + FeatureF16C, + FeatureMOVBE, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Bulldozer -def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureAVX, FeatureSSE4A, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowSHLD]>; +def : Proc<"bdver1", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Piledriver -def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureAVX, FeatureSSE4A, FeatureF16C, - FeatureLZCNT, FeaturePOPCNT, FeatureBMI, - FeatureTBM, FeatureFMA, FeatureSlowSHLD]>; +def : Proc<"bdver2", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureFMA, + FeatureSlowSHLD, + FeatureLAHFSAHF +]>; // Steamroller -def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B, - FeatureAES, FeaturePRFCHW, FeaturePCLMUL, - FeatureAVX, FeatureSSE4A, FeatureF16C, - FeatureLZCNT, FeaturePOPCNT, FeatureBMI, - FeatureTBM, FeatureFMA, FeatureSlowSHLD, - FeatureFSGSBase]>; +def : Proc<"bdver3", [ + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureMMX, + FeatureAVX, + FeatureFXSR, + FeatureSSE4A, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureTBM, + FeatureFMA, + FeatureXSAVEOPT, + FeatureSlowSHLD, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; // Excavator -def : Proc<"bdver4", [FeatureAVX2, FeatureXOP, FeatureFMA4, - FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW, - 
FeaturePCLMUL, FeatureF16C, FeatureLZCNT, - FeaturePOPCNT, FeatureBMI, FeatureBMI2, - FeatureTBM, FeatureFMA, FeatureSSE4A, - FeatureFSGSBase]>; - -def : Proc<"geode", [Feature3DNowA]>; - -def : Proc<"winchip-c6", [FeatureMMX]>; -def : Proc<"winchip2", [Feature3DNow]>; -def : Proc<"c3", [Feature3DNow]>; -def : Proc<"c3-2", [FeatureSSE1]>; +def : Proc<"bdver4", [ + FeatureMMX, + FeatureAVX2, + FeatureFXSR, + FeatureXOP, + FeatureFMA4, + FeatureCMPXCHG16B, + FeatureAES, + FeaturePRFCHW, + FeaturePCLMUL, + FeatureF16C, + FeatureLZCNT, + FeaturePOPCNT, + FeatureXSAVE, + FeatureBMI, + FeatureBMI2, + FeatureTBM, + FeatureFMA, + FeatureXSAVEOPT, + FeatureFSGSBase, + FeatureLAHFSAHF +]>; + +def : Proc<"geode", [FeatureSlowUAMem16, Feature3DNowA]>; + +def : Proc<"winchip-c6", [FeatureSlowUAMem16, FeatureMMX]>; +def : Proc<"winchip2", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3", [FeatureSlowUAMem16, Feature3DNow]>; +def : Proc<"c3-2", [FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, FeatureFXSR]>; // We also provide a generic 64-bit specific x86 processor model which tries to // be good for modern chips without enabling instruction set encodings past the @@ -492,8 +710,8 @@ def : Proc<"c3-2", [FeatureSSE1]>; // knobs which need to be tuned differently for AMD chips, we might consider // forming a common base for them. def : ProcessorModel<"x86-64", SandyBridgeModel, - [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, - FeatureFastUAMem]>; + [FeatureMMX, FeatureSSE2, FeatureFXSR, Feature64Bit, + FeatureSlowBTMem ]>; //===----------------------------------------------------------------------===// // Register File Description @@ -520,10 +738,6 @@ include "X86CallingConv.td" // Assembly Parser //===----------------------------------------------------------------------===// -def ATTAsmParser : AsmParser { - string AsmParserClassName = "AsmParser"; -} - def ATTAsmParserVariant : AsmParserVariant { int Variant = 0; @@ -568,7 +782,6 @@ def IntelAsmWriter : AsmWriter { def X86 : Target { // Information about the instructions... let InstructionSet = X86InstrInfo; - let AssemblyParsers = [ATTAsmParser]; let AssemblyParserVariants = [ATTAsmParserVariant, IntelAsmParserVariant]; let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter]; } diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp index ba33248..2170e62 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -217,10 +217,10 @@ static void printOperand(X86AsmPrinter &P, const MachineInstr *MI, if (AsmVariant == 0) O << '%'; unsigned Reg = MO.getReg(); if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) { - MVT::SimpleValueType VT = (strcmp(Modifier+6,"64") == 0) ? - MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 : - ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8)); - Reg = getX86SubSuperRegister(Reg, VT); + unsigned Size = (strcmp(Modifier+6,"64") == 0) ? 64 : + (strcmp(Modifier+6,"32") == 0) ? 32 : + (strcmp(Modifier+6,"16") == 0) ? 16 : 8; + Reg = getX86SubSuperRegister(Reg, Size); } O << X86ATTInstPrinter::getRegisterName(Reg); return; @@ -361,22 +361,21 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO, switch (Mode) { default: return true; // Unknown mode. 
case 'b': // Print QImode register - Reg = getX86SubSuperRegister(Reg, MVT::i8); + Reg = getX86SubSuperRegister(Reg, 8); break; case 'h': // Print QImode high register - Reg = getX86SubSuperRegister(Reg, MVT::i8, true); + Reg = getX86SubSuperRegister(Reg, 8, true); break; case 'w': // Print HImode register - Reg = getX86SubSuperRegister(Reg, MVT::i16); + Reg = getX86SubSuperRegister(Reg, 16); break; case 'k': // Print SImode register - Reg = getX86SubSuperRegister(Reg, MVT::i32); + Reg = getX86SubSuperRegister(Reg, 32); break; case 'q': // Print 64-bit register names if 64-bit integer registers are available. // Otherwise, print 32-bit register names. - MVT::SimpleValueType Ty = P.getSubtarget().is64Bit() ? MVT::i64 : MVT::i32; - Reg = getX86SubSuperRegister(Reg, Ty); + Reg = getX86SubSuperRegister(Reg, P.getSubtarget().is64Bit() ? 64 : 32); break; } @@ -535,6 +534,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) { S, MCConstantExpr::create(int64_t(1), MMI->getContext())); } } + OutStreamer->EmitSyntaxDirective(); } static void @@ -565,10 +565,11 @@ MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const { const MachineConstantPoolEntry &CPE = MF->getConstantPool()->getConstants()[CPID]; if (!CPE.isMachineConstantPoolEntry()) { - SectionKind Kind = CPE.getSectionKind(TM.getDataLayout()); + const DataLayout &DL = MF->getDataLayout(); + SectionKind Kind = CPE.getSectionKind(&DL); const Constant *C = CPE.Val.ConstVal; if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>( - getObjFileLowering().getSectionForConstant(Kind, C))) { + getObjFileLowering().getSectionForConstant(DL, Kind, C))) { if (MCSymbol *Sym = S->getCOMDATSymbol()) { if (Sym->isUndefined()) OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global); diff --git a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h index 7f5d127..9c8bd98 100644 --- a/contrib/llvm/lib/Target/X86/X86AsmPrinter.h +++ b/contrib/llvm/lib/Target/X86/X86AsmPrinter.h @@ -78,8 +78,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { // outputting it to the OutStream. This allows the shadow tracker to minimise // the number of NOPs used for stackmap padding. 
void EmitAndCountInstruction(MCInst &Inst); - - void InsertStackMapShadows(MachineFunction &MF); void LowerSTACKMAP(const MachineInstr &MI); void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL); diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp index 031ba4b..fc6ee17 100644 --- a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" @@ -53,10 +54,13 @@ private: // Information we know about a particular call site struct CallContext { CallContext() - : Call(nullptr), SPCopy(nullptr), ExpectedDist(0), - MovVector(4, nullptr), NoStackParams(false), UsePush(false){}; + : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0), + MovVector(4, nullptr), NoStackParams(false), UsePush(false){} - // Actuall call instruction + // Iterator referring to the frame setup instruction + MachineBasicBlock::iterator FrameSetup; + + // Actual call instruction MachineInstr *Call; // A copy of the stack pointer @@ -75,17 +79,16 @@ private: bool UsePush; }; - typedef DenseMap<MachineInstr *, CallContext> ContextMap; + typedef SmallVector<CallContext, 8> ContextVector; bool isLegal(MachineFunction &MF); - bool isProfitable(MachineFunction &MF, ContextMap &CallSeqMap); + bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap); void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, CallContext &Context); - bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock::iterator I, - const CallContext &Context); + bool adjustCallSequence(MachineFunction &MF, const CallContext &Context); MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup, unsigned Reg); @@ -100,7 +103,8 @@ private: const char *getPassName() const override { return "X86 Optimize Call Frame"; } const TargetInstrInfo *TII; - const TargetFrameLowering *TFL; + const X86FrameLowering *TFL; + const X86Subtarget *STI; const MachineRegisterInfo *MRI; static char ID; }; @@ -124,8 +128,15 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // No point in running this in 64-bit mode, since some arguments are // passed in-register in all common calling conventions, so the pattern // we're looking for will never match. - const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); - if (STI.is64Bit()) + if (STI->is64Bit()) + return false; + + // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset + // in the compact unwind encoding that Darwin uses. So, bail if there + // is a danger of that being generated. + if (STI->isTargetDarwin() && + (!MF.getMMI().getLandingPads().empty() || + (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF)))) return false; // You would expect straight-line code between call-frame setup and @@ -161,7 +172,7 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // Check whether this trasnformation is profitable for a particular // function - in terms of code size. 
bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, - ContextMap &CallSeqMap) { + ContextVector &CallSeqVector) { // This transformation is always a win when we do not expect to have // a reserved call frame. Under other circumstances, it may be either // a win or a loss, and requires a heuristic. @@ -170,24 +181,20 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, return true; // Don't do this when not optimizing for size. - bool OptForSize = - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || - MF.getFunction()->hasFnAttribute(Attribute::MinSize); - - if (!OptForSize) + if (!MF.getFunction()->optForSize()) return false; unsigned StackAlign = TFL->getStackAlignment(); int64_t Advantage = 0; - for (auto CC : CallSeqMap) { + for (auto CC : CallSeqVector) { // Call sites where no parameters are passed on the stack // do not affect the cost, since there needs to be no // stack adjustment. - if (CC.second.NoStackParams) + if (CC.NoStackParams) continue; - if (!CC.second.UsePush) { + if (!CC.UsePush) { // If we don't use pushes for a particular call site, // we pay for not having a reserved call frame with an // additional sub/add esp pair. The cost is ~3 bytes per instruction, @@ -200,11 +207,11 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, // We'll need a add after the call. Advantage -= 3; // If we have to realign the stack, we'll also need and sub before - if (CC.second.ExpectedDist % StackAlign) + if (CC.ExpectedDist % StackAlign) Advantage -= 3; // Now, for each push, we save ~3 bytes. For small constants, we actually, // save more (up to 5 bytes), but 3 should be a good approximation. - Advantage += (CC.second.ExpectedDist / 4) * 3; + Advantage += (CC.ExpectedDist / 4) * 3; } } @@ -212,8 +219,9 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, } bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { - TII = MF.getSubtarget().getInstrInfo(); - TFL = MF.getSubtarget().getFrameLowering(); + STI = &MF.getSubtarget<X86Subtarget>(); + TII = STI->getInstrInfo(); + TFL = STI->getFrameLowering(); MRI = &MF.getRegInfo(); if (!isLegal(MF)) @@ -223,21 +231,22 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - ContextMap CallSeqMap; + ContextVector CallSeqVector; for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) if (I->getOpcode() == FrameSetupOpcode) { - CallContext &Context = CallSeqMap[I]; + CallContext Context; collectCallInfo(MF, *BB, I, Context); + CallSeqVector.push_back(Context); } - if (!isProfitable(MF, CallSeqMap)) + if (!isProfitable(MF, CallSeqVector)) return false; - for (auto CC : CallSeqMap) - if (CC.second.UsePush) - Changed |= adjustCallSequence(MF, CC.first, CC.second); + for (auto CC : CallSeqVector) + if (CC.UsePush) + Changed |= adjustCallSequence(MF, CC); return Changed; } @@ -307,13 +316,13 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, // Check that this particular call sequence is amenable to the // transformation. 
const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>( - MF.getSubtarget().getRegisterInfo()); - unsigned StackPtr = RegInfo.getStackRegister(); + STI->getRegisterInfo()); unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); // We expect to enter this at the beginning of a call sequence assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); MachineBasicBlock::iterator FrameSetup = I++; + Context.FrameSetup = FrameSetup; // How much do we adjust the stack? This puts an upper bound on // the number of parameters actually passed on it. @@ -338,7 +347,8 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, if (!I->isCopy() || !I->getOperand(0).isReg()) return; Context.SPCopy = I++; - StackPtr = Context.SPCopy->getOperand(0).getReg(); + + unsigned StackPtr = Context.SPCopy->getOperand(0).getReg(); // Scan the call setup sequence for the pattern we're looking for. // We only handle a simple case - a sequence of MOV32mi or MOV32mr @@ -434,22 +444,22 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, } bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, - MachineBasicBlock::iterator I, const CallContext &Context) { // Ok, we can in fact do the transformation for this call. // Do not remove the FrameSetup instruction, but adjust the parameters. // PEI will end up finalizing the handling of this. - MachineBasicBlock::iterator FrameSetup = I; - MachineBasicBlock &MBB = *(I->getParent()); + MachineBasicBlock::iterator FrameSetup = Context.FrameSetup; + MachineBasicBlock &MBB = *(FrameSetup->getParent()); FrameSetup->getOperand(1).setImm(Context.ExpectedDist); - DebugLoc DL = I->getDebugLoc(); + DebugLoc DL = FrameSetup->getDebugLoc(); // Now, iterate through the vector in reverse order, and replace the movs // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to // replace uses. for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) { MachineBasicBlock::iterator MOV = *Context.MovVector[Idx]; MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); + MachineBasicBlock::iterator Push = nullptr; if (MOV->getOpcode() == X86::MOV32mi) { unsigned PushOpcode = X86::PUSHi32; // If the operand is a small (8-bit) immediate, we can use a @@ -461,21 +471,20 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, if (isInt<8>(Val)) PushOpcode = X86::PUSH32i8; } - BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).addOperand(PushOp); + Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)) + .addOperand(PushOp); } else { unsigned int Reg = PushOp.getReg(); // If PUSHrmm is not slow on this target, try to fold the source of the // push into the instruction. - const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); - bool SlowPUSHrmm = ST.isAtom() || ST.isSLM(); + bool SlowPUSHrmm = STI->isAtom() || STI->isSLM(); // Check that this is legal to fold. Right now, we're extremely // conservative about that. 
MachineInstr *DefMov = nullptr; if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) { - MachineInstr *Push = - BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm)); + Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm)); unsigned NumOps = DefMov->getDesc().getNumOperands(); for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) @@ -483,12 +492,19 @@ bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, DefMov->eraseFromParent(); } else { - BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r)) + Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r)) .addReg(Reg) .getInstr(); } } + // For debugging, when using SP-based CFA, we need to adjust the CFA + // offset after each push. + // TODO: This is needed only if we require precise CFA. + if (!TFL->hasFP(MF)) + TFL->BuildCFI(MBB, std::next(Push), DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, 4)); + MBB.erase(MOV); } @@ -532,13 +548,10 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( DefMI->getParent() != FrameSetup->getParent()) return nullptr; - // Now, make sure everything else up until the ADJCALLSTACK is a sequence - // of MOVs. To be less conservative would require duplicating a lot of the - // logic from PeepholeOptimizer. - // FIXME: A possibly better approach would be to teach the PeepholeOptimizer - // to be smarter about folding into pushes. + // Make sure we don't have any instructions between DefMI and the + // push that make folding the load illegal. for (auto I = DefMI; I != FrameSetup; ++I) - if (I->getOpcode() != X86::MOV32rm) + if (I->isLoadFoldBarrier()) return nullptr; return DefMI; diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.h b/contrib/llvm/lib/Target/X86/X86CallingConv.h index 0eb2494..a08160f 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.h +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86CALLINGCONV_H #define LLVM_LIB_TARGET_X86_X86CALLINGCONV_H +#include "MCTargetDesc/X86MCTargetDesc.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/IR/CallingConv.h" @@ -42,6 +43,64 @@ inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &, return false; } +inline bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, + MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, + CCState &State) { + // This is similar to CCAssignToReg<[EAX, EDX, ECX]>, but makes sure + // not to split i64 and double between a register and stack + static const MCPhysReg RegList[] = {X86::EAX, X86::EDX, X86::ECX}; + static const unsigned NumRegs = sizeof(RegList)/sizeof(RegList[0]); + + SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs(); + + // If this is the first part of an double/i64/i128, or if we're already + // in the middle of a split, add to the pending list. If this is not + // the end of the split, return, otherwise go on to process the pending + // list + if (ArgFlags.isSplit() || !PendingMembers.empty()) { + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + if (!ArgFlags.isSplitEnd()) + return true; + } + + // If there are no pending members, we are not in the middle of a split, + // so do the usual inreg stuff. 
+ if (PendingMembers.empty()) { + if (unsigned Reg = State.AllocateReg(RegList)) { + State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return true; + } + return false; + } + + assert(ArgFlags.isSplitEnd()); + + // We now have the entire original argument in PendingMembers, so decide + // whether to use registers or the stack. + // Per the MCU ABI: + // a) To use registers, we need to have enough of them free to contain + // the entire argument. + // b) We never want to use more than 2 registers for a single argument. + + unsigned FirstFree = State.getFirstUnallocated(RegList); + bool UseRegs = PendingMembers.size() <= std::min(2U, NumRegs - FirstFree); + + for (auto &It : PendingMembers) { + if (UseRegs) + It.convertToReg(State.AllocateReg(RegList[FirstFree++])); + else + It.convertToMem(State.AllocateStack(4, 4)); + State.addLoc(It); + } + + PendingMembers.clear(); + + return true; +} + } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86CallingConv.td b/contrib/llvm/lib/Target/X86/X86CallingConv.td index 8f88888..54d88cb 100644 --- a/contrib/llvm/lib/Target/X86/X86CallingConv.td +++ b/contrib/llvm/lib/Target/X86/X86CallingConv.td @@ -158,6 +158,7 @@ def RetCC_X86_64_C : CallingConv<[ // The X86-64 calling convention always returns FP values in XMM0. CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, + CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, // MMX vector types are always returned in XMM0. CCIfType<[x86mmx], CCAssignToReg<[XMM0, XMM1]>>, @@ -202,6 +203,16 @@ def RetCC_X86_64_AnyReg : CallingConv<[ CCCustom<"CC_X86_AnyReg_Error"> ]>; +// X86-64 HHVM return-value convention. +def RetCC_X86_64_HHVM: CallingConv<[ + // Promote all types to i64 + CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, + + // Return: could return in any GP register save RSP and R12. + CCIfType<[i64], CCAssignToReg<[RBX, RBP, RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14, R15]>> +]>; + // This is the root return-value convention for the X86-32 backend. def RetCC_X86_32 : CallingConv<[ // If FastCC, use RetCC_X86_32_Fast. @@ -227,6 +238,9 @@ def RetCC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>, + // Handle HHVM calls. + CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>, + // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>, @@ -280,7 +294,7 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[v64i1], CCPromoteToType<v64i8>>, // The first 8 FP/Vector arguments are passed in XMM registers. - CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, @@ -305,7 +319,7 @@ def CC_X86_64_C : CallingConv<[ // Long doubles get stack slots whose size and alignment depends on the // subtarget. - CCIfType<[f80], CCAssignToStack<0, 0>>, + CCIfType<[f80, f128], CCAssignToStack<0, 0>>, // Vectors get 16-byte stack slots that are 16-byte aligned. CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, @@ -319,6 +333,23 @@ def CC_X86_64_C : CallingConv<[ CCAssignToStack<64, 64>> ]>; +// Calling convention for X86-64 HHVM. +def CC_X86_64_HHVM : CallingConv<[ + // Use all/any GP registers for args, except RSP. 
+ CCIfType<[i64], CCAssignToReg<[RBX, R12, RBP, R15, + RDI, RSI, RDX, RCX, R8, R9, + RAX, R10, R11, R13, R14]>> +]>; + +// Calling convention for helper functions in HHVM. +def CC_X86_64_HHVM_C : CallingConv<[ + // Pass the first argument in RBP. + CCIfType<[i64], CCAssignToReg<[RBP]>>, + + // Otherwise it's the same as the regular C calling convention. + CCDelegateTo<CC_X86_64_C> +]>; + // Calling convention used on Win64 def CC_X86_Win64_C : CallingConv<[ // FIXME: Handle byval stuff. @@ -561,6 +592,23 @@ def CC_X86_32_C : CallingConv<[ CCDelegateTo<CC_X86_32_Common> ]>; +def CC_X86_32_MCU : CallingConv<[ + // Handles byval parameters. Note that, like FastCC, we can't rely on + // the delegation to CC_X86_32_Common because that happens after code that + // puts arguments in registers. + CCIfByVal<CCPassByVal<4, 4>>, + + // Promote i1/i8/i16 arguments to i32. + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, + + // If the call is not a vararg call, some arguments may be passed + // in integer registers. + CCIfNotVarArg<CCIfType<[i32], CCCustom<"CC_X86_32_MCUInReg">>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; + def CC_X86_32_FastCall : CallingConv<[ // Promote i1/i8/i16 arguments to i32. CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, @@ -708,18 +756,28 @@ def CC_Intel_OCL_BI : CallingConv<[ CCDelegateTo<CC_X86_32_C> ]>; +def CC_X86_32_Intr : CallingConv<[ + CCAssignToStack<4, 4> +]>; + +def CC_X86_64_Intr : CallingConv<[ + CCAssignToStack<8, 8> +]>; + //===----------------------------------------------------------------------===// // X86 Root Argument Calling Conventions //===----------------------------------------------------------------------===// // This is the root argument convention for the X86-32 backend. def CC_X86_32 : CallingConv<[ + CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>, CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>, CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>, CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>, CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>, CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>, + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>, // Otherwise, drop to normal X86-32 CC CCDelegateTo<CC_X86_32_C> @@ -734,6 +792,9 @@ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>, + CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>, + CCIfCC<"CallingConv::HHVM_C", CCDelegateTo<CC_X86_64_HHVM_C>>, + CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_64_Intr>>, // Mingw64 and native Win64 use Win64 CC CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, @@ -764,6 +825,12 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, (sequence "XMM%u", 6, 15))>; +// The function used by Darwin to obtain the address of a thread-local variable +// uses rdi to pass a single parameter and rax for the return value. All other +// GPRs are preserved. 
+def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI, + R8, R9, R10, R11)>; + // All GPRs - except r11 def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI, R8, R9, R10, RSP)>; @@ -778,6 +845,11 @@ def CSR_64_MostRegs : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, R12, R13, R14, R15, RBP, (sequence "XMM%u", 0, 15))>; +def CSR_32_AllRegs : CalleeSavedRegs<(add EAX, EBX, ECX, EDX, EBP, ESI, + EDI, ESP)>; +def CSR_32_AllRegs_SSE : CalleeSavedRegs<(add CSR_32_AllRegs, + (sequence "XMM%u", 0, 7))>; + def CSR_64_AllRegs : CalleeSavedRegs<(add CSR_64_MostRegs, RAX, RSP, (sequence "XMM%u", 16, 31))>; def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX, RSP, @@ -804,3 +876,6 @@ def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15, (sequence "ZMM%u", 16, 31), K4, K5, K6, K7)>; + +// Only R12 is preserved for PHP calls in HHVM. +def CSR_64_HHVM : CalleeSavedRegs<(add R12)>; diff --git a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 6a5a28e..a09d065 100644 --- a/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -19,9 +19,10 @@ #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" -#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. #include "llvm/IR/GlobalValue.h" using namespace llvm; @@ -141,6 +142,24 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // The EH_RETURN pseudo is really removed during the MC Lowering. return true; } + case X86::IRET: { + // Adjust stack to erase error code + int64_t StackAdj = MBBI->getOperand(0).getImm(); + X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true); + // Replace pseudo with machine iret + BuildMI(MBB, MBBI, DL, + TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32)); + MBB.erase(MBBI); + return true; + } + case X86::EH_RESTORE: { + // Restore ESP and EBP, and optionally ESI if required. + bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality( + MBB.getParent()->getFunction()->getPersonalityFn())); + X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH); + MBBI->eraseFromParent(); + return true; + } } llvm_unreachable("Previous switch has a fallthrough?"); } diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index b4319c8..de94a13 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -298,8 +298,8 @@ bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, return false; // Make sure nothing is in the way - BasicBlock::const_iterator Start = I; - BasicBlock::const_iterator End = II; + BasicBlock::const_iterator Start(I); + BasicBlock::const_iterator End(II); for (auto Itr = std::prev(Start); Itr != End; --Itr) { // We only expect extractvalue instructions between the intrinsic and the // instruction to be selected. 
@@ -433,6 +433,11 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, X86AddressMode &AM, MachineMemOperand *MMO, bool Aligned) { + bool HasSSE2 = Subtarget->hasSSE2(); + bool HasSSE4A = Subtarget->hasSSE4A(); + bool HasAVX = Subtarget->hasAVX(); + bool IsNonTemporal = MMO && MMO->isNonTemporal(); + // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { @@ -449,35 +454,59 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, // FALLTHROUGH, handling i1 as i8. case MVT::i8: Opc = X86::MOV8mr; break; case MVT::i16: Opc = X86::MOV16mr; break; - case MVT::i32: Opc = X86::MOV32mr; break; - case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode. + case MVT::i32: + Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTImr : X86::MOV32mr; + break; + case MVT::i64: + // Must be in x86-64 mode. + Opc = (IsNonTemporal && HasSSE2) ? X86::MOVNTI_64mr : X86::MOV64mr; + break; case MVT::f32: - Opc = X86ScalarSSEf32 ? - (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m; + if (X86ScalarSSEf32) { + if (IsNonTemporal && HasSSE4A) + Opc = X86::MOVNTSS; + else + Opc = HasAVX ? X86::VMOVSSmr : X86::MOVSSmr; + } else + Opc = X86::ST_Fp32m; break; case MVT::f64: - Opc = X86ScalarSSEf64 ? - (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m; + if (X86ScalarSSEf32) { + if (IsNonTemporal && HasSSE4A) + Opc = X86::MOVNTSD; + else + Opc = HasAVX ? X86::VMOVSDmr : X86::MOVSDmr; + } else + Opc = X86::ST_Fp64m; break; case MVT::v4f32: - if (Aligned) - Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr; - else - Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr; + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTPSmr : X86::MOVNTPSmr; + else + Opc = HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr; + } else + Opc = HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr; break; case MVT::v2f64: - if (Aligned) - Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr; - else - Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr; + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTPDmr : X86::MOVNTPDmr; + else + Opc = HasAVX ? X86::VMOVAPDmr : X86::MOVAPDmr; + } else + Opc = HasAVX ? X86::VMOVUPDmr : X86::MOVUPDmr; break; case MVT::v4i32: case MVT::v2i64: case MVT::v8i16: case MVT::v16i8: - if (Aligned) - Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr; - else + if (Aligned) { + if (IsNonTemporal) + Opc = HasAVX ? X86::VMOVNTDQmr : X86::MOVNTDQmr; + else + Opc = HasAVX ? X86::VMOVDQAmr : X86::MOVDQAmr; + } else Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr; break; } @@ -1069,12 +1098,11 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { RetRegs.push_back(VA.getLocReg()); } - // The x86-64 ABI for returning structs by value requires that we copy - // the sret argument into %rax for the return. We saved the argument into - // a virtual register in the entry block, so now we copy the value out - // and into %rax. We also do the same with %eax for Win32. - if (F.hasStructRetAttr() && - (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { + // All x86 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // We saved the argument into a virtual register in the entry block,
+ // so now we copy the value out and into %rax/%eax. + if (F.hasStructRetAttr()) { unsigned Reg = X86MFInfo->getSRetReturnReg(); assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()!"); @@ -1431,17 +1459,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { .addMBB(TrueMBB); } - // Obtain the branch weight and add the TrueBB to the successor list. - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); - - // Emits an unconditional branch to the FalseBB, obtains the branch - // weight, and adds it to the successor list. - fastEmitBranch(FalseMBB, DbgLoc); - + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) { @@ -1472,12 +1490,8 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc)) .addMBB(TrueMBB); - fastEmitBranch(FalseMBB, DbgLoc); - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } } @@ -1492,12 +1506,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) .addMBB(TrueMBB); - fastEmitBranch(FalseMBB, DbgLoc); - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } @@ -1511,12 +1520,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { .addReg(OpReg).addImm(1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1)) .addMBB(TrueMBB); - fastEmitBranch(FalseMBB, DbgLoc); - uint32_t BranchWeight = 0; - if (FuncInfo.BPI) - BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), - TrueMBB->getBasicBlock()); - FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + finishCondBranch(BI->getParent(), TrueMBB, FalseMBB); return true; } @@ -1945,6 +1949,9 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { unsigned ResultReg; if (Subtarget->hasAVX()) { + const TargetRegisterClass *FR32 = &X86::FR32RegClass; + const TargetRegisterClass *VR128 = &X86::VR128RegClass; + // If we have AVX, create 1 blendv instead of 3 logic instructions. // Blendv was introduced with SSE 4.1, but the 2 register form implicitly // uses XMM0 as the selection register. That may need just as many @@ -1955,10 +1962,13 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { unsigned BlendOpcode = (RetVT.SimpleTy == MVT::f32) ? 
X86::VBLENDVPSrr : X86::VBLENDVPDrr; - unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill, + unsigned CmpReg = fastEmitInst_rri(CmpOpcode, FR32, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); - ResultReg = fastEmitInst_rrr(BlendOpcode, RC, RHSReg, RHSIsKill, - LHSReg, LHSIsKill, CmpReg, true); + unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill, + LHSReg, LHSIsKill, CmpReg, true); + ResultReg = createResultReg(RC); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(VBlendReg); } else { unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); @@ -2806,10 +2816,12 @@ static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget, if (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::HiPE) return 0; - if (CS && !CS->paramHasAttr(1, Attribute::StructRet)) - return 0; - if (CS && CS->paramHasAttr(1, Attribute::InReg)) - return 0; + + if (CS) + if (CS->arg_empty() || !CS->paramHasAttr(1, Attribute::StructRet) || + CS->paramHasAttr(1, Attribute::InReg) || Subtarget->isTargetMCU()) + return 0; + return 4; } @@ -2924,7 +2936,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); @@ -3020,8 +3032,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()]; unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore, - ArgVT.getStoreSize(), Alignment); + MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset), + MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); if (Flags.isByVal()) { X86AddressMode SrcAM; SrcAM.Base.Reg = ArgReg; @@ -3252,6 +3264,30 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { updateValueMap(I, Reg); return true; } + case Instruction::BitCast: { + // Select SSE2/AVX bitcasts between 128/256 bit vector types. + if (!Subtarget->hasSSE2()) + return false; + + EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType()); + EVT DstVT = TLI.getValueType(DL, I->getType()); + + if (!SrcVT.isSimple() || !DstVT.isSimple()) + return false; + + if (!SrcVT.is128BitVector() && + !(Subtarget->hasAVX() && SrcVT.is256BitVector())) + return false; + + unsigned Reg = getRegForValue(I->getOperand(0)); + if (Reg == 0) + return false; + + // No instruction is needed for conversion. Reuse the register used by + // the fist operand. 
+ updateValueMap(I, Reg); + return true; + } } return false; @@ -3384,8 +3420,8 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { TII.get(Opc), ResultReg); addDirectMem(MIB, AddrReg); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( - MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, - DL.getPointerSize(), Align); + MachinePointerInfo::getConstantPool(*FuncInfo.MF), + MachineMemOperand::MOLoad, DL.getPointerSize(), Align); MIB->addMemOperand(*FuncInfo.MF, MMO); return ResultReg; } diff --git a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp index 5eb4fae..1dd69e8 100644 --- a/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp +++ b/contrib/llvm/lib/Target/X86/X86FixupLEAs.cpp @@ -9,6 +9,7 @@ // // This file defines the pass that finds instructions that can be // re-written as LEA instructions in order to reduce pipeline delays. +// When optimizing for size it replaces suitable LEAs with INC or DEC. // //===----------------------------------------------------------------------===// @@ -61,6 +62,11 @@ class FixupLEAPass : public MachineFunctionPass { void processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI); + /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg + /// and convert them to INC or DEC respectively. + bool fixupIncDec(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) const; + /// \brief Determine if an instruction references a machine register /// and, if so, whether it reads or writes the register. RegUsageState usesRegister(MachineOperand &p, MachineBasicBlock::iterator I); @@ -89,6 +95,8 @@ public: private: MachineFunction *MF; const X86InstrInfo *TII; // Machine instruction info. + bool OptIncDec; + bool OptLEA; }; char FixupLEAPass::ID = 0; } @@ -150,7 +158,10 @@ FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { MF = &Func; const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); - if (!ST.LEAusesAG() && !ST.slowLEA()) + OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize(); + OptLEA = ST.LEAusesAG() || ST.slowLEA(); + + if (!OptLEA && !OptIncDec) return false; TII = ST.getInstrInfo(); @@ -187,7 +198,7 @@ FixupLEAPass::usesRegister(MachineOperand &p, MachineBasicBlock::iterator I) { static inline bool getPreviousInstr(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { if (I == MFI->begin()) { - if (MFI->isPredecessor(MFI)) { + if (MFI->isPredecessor(&*MFI)) { I = --MFI->end(); return true; } else @@ -222,6 +233,60 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I, return nullptr; } +static inline bool isLEA(const int opcode) { + return opcode == X86::LEA16r || opcode == X86::LEA32r || + opcode == X86::LEA64r || opcode == X86::LEA64_32r; +} + +/// isLEASimpleIncOrDec - Does this LEA have one these forms: +/// lea %reg, 1(%reg) +/// lea %reg, -1(%reg) +static inline bool isLEASimpleIncOrDec(MachineInstr *LEA) { + unsigned SrcReg = LEA->getOperand(1 + X86::AddrBaseReg).getReg(); + unsigned DstReg = LEA->getOperand(0).getReg(); + unsigned AddrDispOp = 1 + X86::AddrDisp; + return SrcReg == DstReg && + LEA->getOperand(1 + X86::AddrIndexReg).getReg() == 0 && + LEA->getOperand(1 + X86::AddrSegmentReg).getReg() == 0 && + LEA->getOperand(AddrDispOp).isImm() && + (LEA->getOperand(AddrDispOp).getImm() == 1 || + LEA->getOperand(AddrDispOp).getImm() == -1); +} + +bool 
FixupLEAPass::fixupIncDec(MachineBasicBlock::iterator &I, + MachineFunction::iterator MFI) const { + MachineInstr *MI = I; + int Opcode = MI->getOpcode(); + if (!isLEA(Opcode)) + return false; + + if (isLEASimpleIncOrDec(MI) && TII->isSafeToClobberEFLAGS(*MFI, I)) { + int NewOpcode; + bool isINC = MI->getOperand(4).getImm() == 1; + switch (Opcode) { + case X86::LEA16r: + NewOpcode = isINC ? X86::INC16r : X86::DEC16r; + break; + case X86::LEA32r: + case X86::LEA64_32r: + NewOpcode = isINC ? X86::INC32r : X86::DEC32r; + break; + case X86::LEA64r: + NewOpcode = isINC ? X86::INC64r : X86::DEC64r; + break; + } + + MachineInstr *NewMI = + BuildMI(*MFI, I, MI->getDebugLoc(), TII->get(NewOpcode)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)); + MFI->erase(I); + I = static_cast<MachineBasicBlock::iterator>(NewMI); + return true; + } + return false; +} + void FixupLEAPass::processInstruction(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { // Process a load, store, or LEA instruction. @@ -265,8 +330,7 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, MachineFunction::iterator MFI) { MachineInstr *MI = I; const int opcode = MI->getOpcode(); - if (opcode != X86::LEA16r && opcode != X86::LEA32r && opcode != X86::LEA64r && - opcode != X86::LEA64_32r) + if (!isLEA(opcode)) return; if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() || !TII->isSafeToClobberEFLAGS(*MFI, I)) @@ -280,7 +344,8 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I, return; int addrr_opcode, addri_opcode; switch (opcode) { - default: llvm_unreachable("Unexpected LEA instruction"); + default: + llvm_unreachable("Unexpected LEA instruction"); case X86::LEA16r: addrr_opcode = X86::ADD16rr; addri_opcode = X86::ADD16ri; @@ -330,10 +395,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI) { for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) { - if (MF.getSubtarget<X86Subtarget>().isSLM()) - processInstructionForSLM(I, MFI); - else - processInstruction(I, MFI); + if (OptIncDec) + if (fixupIncDec(I, MFI)) + continue; + + if (OptLEA) { + if (MF.getSubtarget<X86Subtarget>().isSLM()) + processInstructionForSLM(I, MFI); + else + processInstruction(I, MFI); + } } return false; } diff --git a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp index 40b9c8a..97bb8ab 100644 --- a/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp +++ b/contrib/llvm/lib/Target/X86/X86FloatingPoint.cpp @@ -120,12 +120,10 @@ namespace { // Return a bitmask of FP registers in block's live-in list. 
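A note on the fixupIncDec hunk above: the rewrite only fires for the degenerate form `lea ±1(%reg), %reg` with no index and no segment register, and only where EFLAGS may be clobbered, because INC/DEC update the flags while LEA does not. The following standalone sketch models that predicate outside the LLVM MachineInstr API; the struct fields are stand-ins for the X86 address operands, not the real operand indices.

    #include <cassert>

    // Stand-ins for the X86 memory operand tuple (Base, Index, Disp, Segment)
    // plus the LEA destination register; register number 0 means "no register".
    struct SimpleLEA {
      unsigned Dst;
      unsigned Base;
      unsigned Index;
      unsigned Segment;
      long long Disp;
    };

    enum class Rewrite { KeepLEA, ToINC, ToDEC };

    // Mirrors the intent of isLEASimpleIncOrDec plus the EFLAGS check in
    // fixupIncDec: same source and destination register, no index, no segment,
    // displacement exactly +1 or -1, and flags free to clobber.
    static Rewrite classifyLEA(const SimpleLEA &L, bool CanClobberEFLAGS) {
      if (!CanClobberEFLAGS || L.Dst != L.Base || L.Index != 0 || L.Segment != 0)
        return Rewrite::KeepLEA;
      if (L.Disp == 1)
        return Rewrite::ToINC;
      if (L.Disp == -1)
        return Rewrite::ToDEC;
      return Rewrite::KeepLEA;
    }

    int main() {
      // leal 1(%eax), %eax  ->  incl %eax   (flags known to be dead)
      assert(classifyLEA({1, 1, 0, 0, 1}, true) == Rewrite::ToINC);
      // leal -1(%eax), %ecx stays an LEA: destination differs from the base.
      assert(classifyLEA({2, 1, 0, 0, -1}, true) == Rewrite::KeepLEA);
      return 0;
    }

The 16/32/64-bit opcode choice in the real pass then just selects the matching INC*/DEC* for the LEA's width.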
static unsigned calcLiveInMask(MachineBasicBlock *MBB) { unsigned Mask = 0; - for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(), - E = MBB->livein_end(); I != E; ++I) { - unsigned Reg = *I; - if (Reg < X86::FP0 || Reg > X86::FP6) + for (const auto &LI : MBB->liveins()) { + if (LI.PhysReg < X86::FP0 || LI.PhysReg > X86::FP6) continue; - Mask |= 1 << (Reg - X86::FP0); + Mask |= 1 << (LI.PhysReg - X86::FP0); } return Mask; } @@ -301,8 +299,9 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { bool FPIsUsed = false; static_assert(X86::FP6 == X86::FP0+6, "Register enums aren't sorted right!"); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned i = 0; i <= 6; ++i) - if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) { + if (!MRI.reg_nodbg_empty(X86::FP0 + i)) { FPIsUsed = true; break; } @@ -321,7 +320,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // Process the function in depth first order so that we process at least one // of the predecessors for every reachable block in the function. SmallPtrSet<MachineBasicBlock*, 8> Processed; - MachineBasicBlock *Entry = MF.begin(); + MachineBasicBlock *Entry = &MF.front(); bool Changed = false; for (MachineBasicBlock *BB : depth_first_ext(Entry, Processed)) @@ -329,9 +328,9 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // Process any unreachable blocks in arbitrary order now. if (MF.size() != Processed.size()) - for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) - if (Processed.insert(BB).second) - Changed |= processBasicBlock(MF, *BB); + for (MachineBasicBlock &BB : MF) + if (Processed.insert(&BB).second) + Changed |= processBasicBlock(MF, BB); LiveBundles.clear(); @@ -348,13 +347,12 @@ void FPS::bundleCFG(MachineFunction &MF) { LiveBundles.resize(Bundles->getNumBundles()); // Gather the actual live-in masks for all MBBs. - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { - MachineBasicBlock *MBB = I; - const unsigned Mask = calcLiveInMask(MBB); + for (MachineBasicBlock &MBB : MF) { + const unsigned Mask = calcLiveInMask(&MBB); if (!Mask) continue; // Update MBB ingoing bundle mask. 
- LiveBundles[Bundles->getBundle(MBB->getNumber(), false)].Mask |= Mask; + LiveBundles[Bundles->getBundle(MBB.getNumber(), false)].Mask |= Mask; } } @@ -546,17 +544,9 @@ namespace { }; } -#ifndef NDEBUG -static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) { - for (unsigned i = 0; i != NumEntries-1; ++i) - if (!(Table[i] < Table[i+1])) return false; - return true; -} -#endif - -static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) { - const TableEntry *I = std::lower_bound(Table, Table+N, Opcode); - if (I != Table+N && I->from == Opcode) +static int Lookup(ArrayRef<TableEntry> Table, unsigned Opcode) { + const TableEntry *I = std::lower_bound(Table.begin(), Table.end(), Opcode); + if (I != Table.end() && I->from == Opcode) return I->to; return -1; } @@ -567,7 +557,7 @@ static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) { #define ASSERT_SORTED(TABLE) \ { static bool TABLE##Checked = false; \ if (!TABLE##Checked) { \ - assert(TableIsSorted(TABLE, array_lengthof(TABLE)) && \ + assert(std::is_sorted(std::begin(TABLE), std::end(TABLE)) && \ "All lookup tables must be sorted for efficient access!"); \ TABLE##Checked = true; \ } \ @@ -746,7 +736,7 @@ static const TableEntry OpcodeTable[] = { static unsigned getConcreteOpcode(unsigned Opcode) { ASSERT_SORTED(OpcodeTable); - int Opc = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode); + int Opc = Lookup(OpcodeTable, Opcode); assert(Opc != -1 && "FP Stack instruction not in OpcodeTable!"); return Opc; } @@ -797,7 +787,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) { RegMap[Stack[--StackTop]] = ~0; // Update state // Check to see if there is a popping version of this instruction... - int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode()); + int Opcode = Lookup(PopTable, I->getOpcode()); if (Opcode != -1) { I->setDesc(TII->get(Opcode)); if (Opcode == X86::UCOM_FPPr) @@ -1193,7 +1183,7 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { // We decide which form to use based on what is on the top of the stack, and // which operand is killed by this instruction. - const TableEntry *InstTable; + ArrayRef<TableEntry> InstTable; bool isForward = TOS == Op0; bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0); if (updateST0) { @@ -1208,8 +1198,7 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) { InstTable = ReverseSTiTable; } - int Opcode = Lookup(InstTable, array_lengthof(ForwardST0Table), - MI->getOpcode()); + int Opcode = Lookup(InstTable, MI->getOpcode()); assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!"); // NotTOS - The register which is not on the top of stack... @@ -1520,31 +1509,6 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { return; } - case X86::WIN_FTOL_32: - case X86::WIN_FTOL_64: { - // Push the operand into ST0. - MachineOperand &Op = MI->getOperand(0); - assert(Op.isUse() && Op.isReg() && - Op.getReg() >= X86::FP0 && Op.getReg() <= X86::FP6); - unsigned FPReg = getFPReg(Op); - if (Op.isKill()) - moveToTop(FPReg, Inst); - else - duplicateToTop(FPReg, ScratchFPReg, Inst); - - // Emit the call. This will pop the operand. 
- BuildMI(*MBB, Inst, MI->getDebugLoc(), TII->get(X86::CALLpcrel32)) - .addExternalSymbol("_ftol2") - .addReg(X86::ST0, RegState::ImplicitKill) - .addReg(X86::ECX, RegState::ImplicitDefine) - .addReg(X86::EAX, RegState::Define | RegState::Implicit) - .addReg(X86::EDX, RegState::Define | RegState::Implicit) - .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); - --StackTop; - - break; - } - case X86::RETQ: case X86::RETL: case X86::RETIL: diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index 3a21b57..242d0333 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -18,25 +18,23 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/Debug.h" #include <cstdlib> using namespace llvm; -// FIXME: completely move here. -extern cl::opt<bool> ForceStackAlign; - X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, unsigned StackAlignOverride) : TargetFrameLowering(StackGrowsDown, StackAlignOverride, @@ -80,6 +78,27 @@ X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); } +/// usesTheStack - This function checks if any of the users of EFLAGS +/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has +/// to use the stack, and if we don't adjust the stack we clobber the first +/// frame index. +/// See X86InstrInfo::copyPhysReg. +static bool usesTheStack(const MachineFunction &MF) { + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + // Conservativley assume that inline assembly might use the stack. + if (MF.hasInlineAsm()) + return true; + + return any_of(MRI.reg_instructions(X86::EFLAGS), + [](const MachineInstr &RI) { return RI.isCopy(); }); +} + +static bool doesStackUseImplyFP(const MachineFunction &MF) { + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + return IsWin64Prologue && usesTheStack(MF); +} + /// hasFP - Return true if the specified function should have a dedicated frame /// pointer register. This is true if the function has variable sized allocas /// or if frame pointer elimination is disabled. @@ -92,8 +111,9 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || MFI->hasOpaqueSPAdjustment() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || - MMI.callsUnwindInit() || MMI.callsEHReturn() || - MFI->hasStackMap() || MFI->hasPatchPoint()); + MMI.callsUnwindInit() || MMI.hasEHFunclets() || MMI.callsEHReturn() || + MFI->hasStackMap() || MFI->hasPatchPoint() || + doesStackUseImplyFP(MF)); } static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { @@ -148,21 +168,14 @@ static unsigned getLEArOpcode(unsigned IsLP64) { /// to this register without worry about clobbering it. 
static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - const TargetRegisterInfo *TRI, + const X86RegisterInfo *TRI, bool Is64Bit) { const MachineFunction *MF = MBB.getParent(); const Function *F = MF->getFunction(); if (!F || MF->getMMI().callsEHReturn()) return 0; - static const uint16_t CallerSavedRegs32Bit[] = { - X86::EAX, X86::EDX, X86::ECX, 0 - }; - - static const uint16_t CallerSavedRegs64Bit[] = { - X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI, - X86::R8, X86::R9, X86::R10, X86::R11, 0 - }; + const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF); unsigned Opc = MBBI->getOpcode(); switch (Opc) { @@ -191,10 +204,9 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, Uses.insert(*AI); } - const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; - for (; *CS; ++CS) - if (!Uses.count(*CS)) - return *CS; + for (auto CS : AvailableRegs) + if (!Uses.count(CS) && CS != X86::RIP) + return CS; } } @@ -214,8 +226,12 @@ static bool isEAXLiveIn(MachineFunction &MF) { return false; } -/// Check whether or not the terminators of \p MBB needs to read EFLAGS. -static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) { +/// Check if the flags need to be preserved before the terminators. +/// This would be the case, if the eflags is live-in of the region +/// composed by the terminators or live-out of that region, without +/// being defined by a terminator. +static bool +flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { for (const MachineInstr &MI : MBB.terminators()) { bool BreakNext = false; for (const MachineOperand &MO : MI.operands()) { @@ -225,15 +241,27 @@ static bool terminatorsNeedFlagsAsInput(const MachineBasicBlock &MBB) { if (Reg != X86::EFLAGS) continue; - // This terminator needs an eflag that is not defined - // by a previous terminator. + // This terminator needs an eflags that is not defined + // by a previous another terminator: + // EFLAGS is live-in of the region composed by the terminators. if (!MO.isDef()) return true; + // This terminator defines the eflags, i.e., we don't need to preserve it. + // However, we still need to check this specific terminator does not + // read a live-in value. BreakNext = true; } + // We found a definition of the eflags, no need to preserve them. if (BreakNext) - break; + return false; } + + // None of the terminators use or define the eflags. + // Check if they are live-out, that would imply we need to preserve them. + for (const MachineBasicBlock *Succ : MBB.successors()) + if (Succ->isLiveIn(X86::EFLAGS)) + return true; + return false; } @@ -289,6 +317,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); if (isSub) MI->setFlag(MachineInstr::FrameSetup); + else + MI->setFlag(MachineInstr::FrameDestroy); Offset -= ThisVal; continue; } @@ -298,6 +328,8 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue); if (isSub) MI.setMIFlag(MachineInstr::FrameSetup); + else + MI.setMIFlag(MachineInstr::FrameDestroy); Offset -= ThisVal; } @@ -312,7 +344,11 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( // is tricky. bool UseLEA; if (!InEpilogue) { - UseLEA = STI.useLeaForSP(); + // Check if inserting the prologue at the beginning + // of MBB would require to use LEA operations. 
+ // We need to use LEA operations if EFLAGS is live in, because + // it means an instruction will read it before it gets defined. + UseLEA = STI.useLeaForSP() || MBB.isLiveIn(X86::EFLAGS); } else { // If we can use LEA for SP but we shouldn't, check that none // of the terminators uses the eflags. Otherwise we will insert @@ -321,10 +357,10 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( // and is an optimization anyway. UseLEA = canUseLEAForSPInEpilogue(*MBB.getParent()); if (UseLEA && !STI.useLeaForSP()) - UseLEA = terminatorsNeedFlagsAsInput(MBB); + UseLEA = flagsNeedToBePreservedBeforeTheTerminators(MBB); // If that assert breaks, that means we do not do the right thing // in canUseAsEpilogue. - assert((UseLEA || !terminatorsNeedFlagsAsInput(MBB)) && + assert((UseLEA || !flagsNeedToBePreservedBeforeTheTerminators(MBB)) && "We shouldn't have allowed this insertion point"); } @@ -347,30 +383,6 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment( return MI; } -/// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator. -static -void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, - unsigned StackPtr, uint64_t *NumBytes = nullptr) { - if (MBBI == MBB.begin()) return; - - MachineBasicBlock::iterator PI = std::prev(MBBI); - unsigned Opc = PI->getOpcode(); - if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || - Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || - Opc == X86::LEA32r || Opc == X86::LEA64_32r) && - PI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes += PI->getOperand(2).getImm(); - MBB.erase(PI); - } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || - Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && - PI->getOperand(0).getReg() == StackPtr) { - if (NumBytes) - *NumBytes -= PI->getOperand(2).getImm(); - MBB.erase(PI); - } -} - int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, bool doMergeWithPrevious) const { @@ -436,27 +448,265 @@ X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, } } -/// usesTheStack - This function checks if any of the users of EFLAGS -/// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has -/// to use the stack, and if we don't adjust the stack we clobber the first -/// frame index. -/// See X86InstrInfo::copyPhysReg. 
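The flagsNeedToBePreservedBeforeTheTerminators logic introduced above reduces to a small dataflow question over the terminator sequence. A standalone restatement follows, with readsFlags/definesFlags booleans standing in for the per-operand EFLAGS scan of each terminator; this is a sketch of the decision, not the LLVM code itself.

    #include <cassert>
    #include <vector>

    struct TermInfo {
      bool readsFlags;
      bool definesFlags;
    };

    // EFLAGS must be preserved before the terminators if either
    //  (a) some terminator reads EFLAGS before any terminator redefines it
    //      (EFLAGS is live-in to the terminator region), or
    //  (b) no terminator touches EFLAGS but a successor block expects it
    //      live-in (live-out of the region without a local definition).
    static bool flagsMustBePreserved(const std::vector<TermInfo> &Terminators,
                                     bool SuccessorNeedsFlags) {
      for (const TermInfo &T : Terminators) {
        if (T.readsFlags)
          return true;   // this terminator consumes a value produced earlier
        if (T.definesFlags)
          return false;  // flags are redefined before any later read matters
      }
      return SuccessorNeedsFlags;
    }

    int main() {
      // A JE whose CMP sits before the terminators: the flags must survive.
      assert(flagsMustBePreserved({{true, false}}, false));
      // An unconditional JMP with flags live into the successor: still preserved.
      assert(flagsMustBePreserved({{false, false}}, true));
      return 0;
    }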
-static bool usesTheStack(const MachineFunction &MF) { - const MachineRegisterInfo &MRI = MF.getRegInfo(); +MachineInstr *X86FrameLowering::emitStackProbe(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + if (STI.isTargetWindowsCoreCLR()) { + if (InProlog) { + return emitStackProbeInlineStub(MF, MBB, MBBI, DL, true); + } else { + return emitStackProbeInline(MF, MBB, MBBI, DL, false); + } + } else { + return emitStackProbeCall(MF, MBB, MBBI, DL, InProlog); + } +} - for (MachineRegisterInfo::reg_instr_iterator - ri = MRI.reg_instr_begin(X86::EFLAGS), re = MRI.reg_instr_end(); - ri != re; ++ri) - if (ri->isCopy()) - return true; +void X86FrameLowering::inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const { + const StringRef ChkStkStubSymbol = "__chkstk_stub"; + MachineInstr *ChkStkStub = nullptr; - return false; + for (MachineInstr &MI : PrologMBB) { + if (MI.isCall() && MI.getOperand(0).isSymbol() && + ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) { + ChkStkStub = &MI; + break; + } + } + + if (ChkStkStub != nullptr) { + MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator()); + assert(std::prev(MBBI).operator==(ChkStkStub) && + "MBBI expected after __chkstk_stub."); + DebugLoc DL = PrologMBB.findDebugLoc(MBBI); + emitStackProbeInline(MF, PrologMBB, MBBI, DL, true); + ChkStkStub->eraseFromParent(); + } } -void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - DebugLoc DL) const { +MachineInstr *X86FrameLowering::emitStackProbeInline( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + assert(STI.is64Bit() && "different expansion needed for 32 bit"); + assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR"); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + const BasicBlock *LLVM_BB = MBB.getBasicBlock(); + + // RAX contains the number of bytes of desired stack adjustment. + // The handling here assumes this value has already been updated so as to + // maintain stack alignment. + // + // We need to exit with RSP modified by this amount and execute suitable + // page touches to notify the OS that we're growing the stack responsibly. + // All stack probing must be done without modifying RSP. + // + // MBB: + // SizeReg = RAX; + // ZeroReg = 0 + // CopyReg = RSP + // Flags, TestReg = CopyReg - SizeReg + // FinalReg = !Flags.Ovf ? TestReg : ZeroReg + // LimitReg = gs magic thread env access + // if FinalReg >= LimitReg goto ContinueMBB + // RoundBB: + // RoundReg = page address of FinalReg + // LoopMBB: + // LoopReg = PHI(LimitReg,ProbeReg) + // ProbeReg = LoopReg - PageSize + // [ProbeReg] = 0 + // if (ProbeReg > RoundReg) goto LoopMBB + // ContinueMBB: + // RSP = RSP - RAX + // [rest of original MBB] + + // Set up the new basic blocks + MachineBasicBlock *RoundMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *ContinueMBB = MF.CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator MBBIter = std::next(MBB.getIterator()); + MF.insert(MBBIter, RoundMBB); + MF.insert(MBBIter, LoopMBB); + MF.insert(MBBIter, ContinueMBB); + + // Split MBB and move the tail portion down to ContinueMBB. 
+ MachineBasicBlock::iterator BeforeMBBI = std::prev(MBBI); + ContinueMBB->splice(ContinueMBB->begin(), &MBB, MBBI, MBB.end()); + ContinueMBB->transferSuccessorsAndUpdatePHIs(&MBB); + + // Some useful constants + const int64_t ThreadEnvironmentStackLimit = 0x10; + const int64_t PageSize = 0x1000; + const int64_t PageMask = ~(PageSize - 1); + + // Registers we need. For the normal case we use virtual + // registers. For the prolog expansion we use RAX, RCX and RDX. + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetRegisterClass *RegClass = &X86::GR64RegClass; + const unsigned SizeReg = InProlog ? (unsigned)X86::RAX + : MRI.createVirtualRegister(RegClass), + ZeroReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass), + CopyReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + TestReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + FinalReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + RoundedReg = InProlog ? (unsigned)X86::RDX + : MRI.createVirtualRegister(RegClass), + LimitReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass), + JoinReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass), + ProbeReg = InProlog ? (unsigned)X86::RCX + : MRI.createVirtualRegister(RegClass); + + // SP-relative offsets where we can save RCX and RDX. + int64_t RCXShadowSlot = 0; + int64_t RDXShadowSlot = 0; + + // If inlining in the prolog, save RCX and RDX. + // Future optimization: don't save or restore if not live in. + if (InProlog) { + // Compute the offsets. We need to account for things already + // pushed onto the stack at this point: return address, frame + // pointer (if used), and callee saves. + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + const int64_t CalleeSaveSize = X86FI->getCalleeSavedFrameSize(); + const bool HasFP = hasFP(MF); + RCXShadowSlot = 8 + CalleeSaveSize + (HasFP ? 8 : 0); + RDXShadowSlot = RCXShadowSlot + 8; + // Emit the saves. + addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, + RCXShadowSlot) + .addReg(X86::RCX); + addRegOffset(BuildMI(&MBB, DL, TII.get(X86::MOV64mr)), X86::RSP, false, + RDXShadowSlot) + .addReg(X86::RDX); + } else { + // Not in the prolog. Copy RAX to a virtual reg. + BuildMI(&MBB, DL, TII.get(X86::MOV64rr), SizeReg).addReg(X86::RAX); + } + + // Add code to MBB to check for overflow and set the new target stack pointer + // to zero if so. + BuildMI(&MBB, DL, TII.get(X86::XOR64rr), ZeroReg) + .addReg(ZeroReg, RegState::Undef) + .addReg(ZeroReg, RegState::Undef); + BuildMI(&MBB, DL, TII.get(X86::MOV64rr), CopyReg).addReg(X86::RSP); + BuildMI(&MBB, DL, TII.get(X86::SUB64rr), TestReg) + .addReg(CopyReg) + .addReg(SizeReg); + BuildMI(&MBB, DL, TII.get(X86::CMOVB64rr), FinalReg) + .addReg(TestReg) + .addReg(ZeroReg); + + // FinalReg now holds final stack pointer value, or zero if + // allocation would overflow. Compare against the current stack + // limit from the thread environment block. Note this limit is the + // lowest touched page on the stack, not the point at which the OS + // will cause an overflow exception, so this is just an optimization + // to avoid unnecessarily touching pages that are below the current + // SP but already commited to the stack by the OS. 
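The sequence being built here (SUB, CMOVB, CMP against gs:[0x10], JAE) is easier to follow as plain arithmetic. Below is a sketch of the same computation on host integers; the 0x10 TEB slot offset is the constant used above, and the concrete addresses are purely illustrative.

    #include <cassert>
    #include <cstdint>

    // Decide whether the CoreCLR inline probe loop can be skipped entirely:
    // compute the prospective stack pointer, clamp it to zero if the
    // subtraction wrapped (allocation larger than the address itself), and
    // compare it against the lowest already-committed page recorded in the
    // thread environment block.
    static bool canSkipProbing(uint64_t CurrentRSP, uint64_t AllocSize,
                               uint64_t CommittedStackLimit) {
      uint64_t FinalRSP = (AllocSize > CurrentRSP) ? 0 : CurrentRSP - AllocSize;
      return FinalRSP >= CommittedStackLimit; // the JAE to ContinueMBB
    }

    int main() {
      // Small allocation staying above the committed limit: no probing needed.
      assert(canSkipProbing(0x7fff0000, 0x100, 0x7ffe0000));
      // Large allocation dropping below the limit: fall through to the probe loop.
      assert(!canSkipProbing(0x7fff0000, 0x20000, 0x7ffe0000));
      return 0;
    }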
+ BuildMI(&MBB, DL, TII.get(X86::MOV64rm), LimitReg) + .addReg(0) + .addImm(1) + .addReg(0) + .addImm(ThreadEnvironmentStackLimit) + .addReg(X86::GS); + BuildMI(&MBB, DL, TII.get(X86::CMP64rr)).addReg(FinalReg).addReg(LimitReg); + // Jump if the desired stack pointer is at or above the stack limit. + BuildMI(&MBB, DL, TII.get(X86::JAE_1)).addMBB(ContinueMBB); + + // Add code to roundMBB to round the final stack pointer to a page boundary. + BuildMI(RoundMBB, DL, TII.get(X86::AND64ri32), RoundedReg) + .addReg(FinalReg) + .addImm(PageMask); + BuildMI(RoundMBB, DL, TII.get(X86::JMP_1)).addMBB(LoopMBB); + + // LimitReg now holds the current stack limit, RoundedReg page-rounded + // final RSP value. Add code to loopMBB to decrement LimitReg page-by-page + // and probe until we reach RoundedReg. + if (!InProlog) { + BuildMI(LoopMBB, DL, TII.get(X86::PHI), JoinReg) + .addReg(LimitReg) + .addMBB(RoundMBB) + .addReg(ProbeReg) + .addMBB(LoopMBB); + } + + addRegOffset(BuildMI(LoopMBB, DL, TII.get(X86::LEA64r), ProbeReg), JoinReg, + false, -PageSize); + + // Probe by storing a byte onto the stack. + BuildMI(LoopMBB, DL, TII.get(X86::MOV8mi)) + .addReg(ProbeReg) + .addImm(1) + .addReg(0) + .addImm(0) + .addReg(0) + .addImm(0); + BuildMI(LoopMBB, DL, TII.get(X86::CMP64rr)) + .addReg(RoundedReg) + .addReg(ProbeReg); + BuildMI(LoopMBB, DL, TII.get(X86::JNE_1)).addMBB(LoopMBB); + + MachineBasicBlock::iterator ContinueMBBI = ContinueMBB->getFirstNonPHI(); + + // If in prolog, restore RDX and RCX. + if (InProlog) { + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), + X86::RCX), + X86::RSP, false, RCXShadowSlot); + addRegOffset(BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::MOV64rm), + X86::RDX), + X86::RSP, false, RDXShadowSlot); + } + + // Now that the probing is done, add code to continueMBB to update + // the stack pointer for real. + BuildMI(*ContinueMBB, ContinueMBBI, DL, TII.get(X86::SUB64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(SizeReg); + + // Add the control flow edges we need. + MBB.addSuccessor(ContinueMBB); + MBB.addSuccessor(RoundMBB); + RoundMBB->addSuccessor(LoopMBB); + LoopMBB->addSuccessor(ContinueMBB); + LoopMBB->addSuccessor(LoopMBB); + + // Mark all the instructions added to the prolog as frame setup. + if (InProlog) { + for (++BeforeMBBI; BeforeMBBI != MBB.end(); ++BeforeMBBI) { + BeforeMBBI->setFlag(MachineInstr::FrameSetup); + } + for (MachineInstr &MI : *RoundMBB) { + MI.setFlag(MachineInstr::FrameSetup); + } + for (MachineInstr &MI : *LoopMBB) { + MI.setFlag(MachineInstr::FrameSetup); + } + for (MachineBasicBlock::iterator CMBBI = ContinueMBB->begin(); + CMBBI != ContinueMBBI; ++CMBBI) { + CMBBI->setFlag(MachineInstr::FrameSetup); + } + } + + // Possible TODO: physreg liveness for InProlog case. + + return ContinueMBBI; +} + +MachineInstr *X86FrameLowering::emitStackProbeCall( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large; unsigned CallOp; @@ -478,6 +728,7 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, Symbol = "_chkstk"; MachineInstrBuilder CI; + MachineBasicBlock::iterator ExpansionMBBI = std::prev(MBBI); // All current stack probes take AX and SP as input, clobber flags, and // preserve all registers. x86_64 probes leave RSP unmodified. 
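Between the limit check and the final RSP update, the expansion rounds the target stack pointer down to a page boundary and touches one byte per page, walking down from the current committed limit. A standalone model of that loop is sketched below; PageSize and PageMask match the constants above, the byte store is represented by counting probes, and it assumes (as on the path that reaches LoopMBB) that the limit is page-aligned and strictly above the rounded target.

    #include <cassert>
    #include <cstdint>

    static const uint64_t PageSize = 0x1000;
    static const uint64_t PageMask = ~(PageSize - 1);

    // Mirrors RoundMBB + LoopMBB: round the target down to a page boundary,
    // then step down one page at a time from the committed limit, probing
    // each page, until the rounded target is reached. Returns the number of
    // probe stores the loop would perform.
    static unsigned countProbes(uint64_t CommittedStackLimit, uint64_t FinalRSP) {
      uint64_t Rounded = FinalRSP & PageMask; // RoundedReg = FinalReg & PageMask
      unsigned Probes = 0;
      uint64_t Probe = CommittedStackLimit;
      do {
        Probe -= PageSize;   // ProbeReg = JoinReg - PageSize
        ++Probes;            // MOV8mi: touch one byte on this page
      } while (Probe != Rounded); // JNE back to LoopMBB
      return Probes;
    }

    int main() {
      // Growing three pages below the current limit issues three probes.
      assert(countProbes(0x7ffe0000, 0x7ffdd000) == 3);
      return 0;
    }

Only after the last probe does ContinueMBB perform the real `SUB SizeReg, %rsp`, so RSP itself is never moved below a page that has not been touched.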
@@ -507,6 +758,26 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF, .addReg(X86::RSP) .addReg(X86::RAX); } + + if (InProlog) { + // Apply the frame setup flag to all inserted instrs. + for (++ExpansionMBBI; ExpansionMBBI != MBBI; ++ExpansionMBBI) + ExpansionMBBI->setFlag(MachineInstr::FrameSetup); + } + + return MBBI; +} + +MachineInstr *X86FrameLowering::emitStackProbeInlineStub( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, bool InProlog) const { + + assert(InProlog && "ChkStkStub called outside prolog!"); + + BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32)) + .addExternalSymbol("__chkstk_stub"); + + return MBBI; } static unsigned calculateSetFPREG(uint64_t SPAdjust) { @@ -526,7 +797,7 @@ uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) con const MachineFrameInfo *MFI = MF.getFrameInfo(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. unsigned StackAlign = getStackAlignment(); - if (ForceStackAlign) { + if (MF.getFunction()->hasFnAttribute("stackrealign")) { if (MFI->hasCalls()) MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; else if (MaxAlign < SlotSize) @@ -537,15 +808,14 @@ uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) con void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL, + DebugLoc DL, unsigned Reg, uint64_t MaxAlign) const { uint64_t Val = -MaxAlign; - MachineInstr *MI = - BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), - StackPtr) - .addReg(StackPtr) - .addImm(Val) - .setMIFlag(MachineInstr::FrameSetup); + unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val); + MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) + .addReg(Reg) + .addImm(Val) + .setMIFlag(MachineInstr::FrameSetup); // The EFLAGS implicit def is dead. MI->getOperand(3).setIsDead(); @@ -646,6 +916,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. + bool IsFunclet = MBB.isEHFuncletEntry(); + EHPersonality Personality = EHPersonality::Unknown; + if (Fn->hasPersonalityFn()) + Personality = classifyEHPersonality(Fn->getPersonalityFn()); + bool FnHasClrFunclet = + MMI.hasEHFunclets() && Personality == EHPersonality::CoreCLR; + bool IsClrFunclet = IsFunclet && FnHasClrFunclet; bool HasFP = hasFP(MF); bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv()); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); @@ -655,9 +932,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, unsigned FramePtr = TRI->getFrameRegister(MF); const unsigned MachineFramePtr = STI.isTarget64BitILP32() - ? getX86SubSuperRegister(FramePtr, MVT::i64, false) - : FramePtr; + ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; unsigned BasePtr = TRI->getBaseRegister(); + + // Debug location must be unknown since the first debug location is used + // to determine the end of the prologue. DebugLoc DL; // Add RETADDR move area to callee saved frame size. @@ -723,6 +1002,24 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, uint64_t NumBytes = 0; int stackGrowth = -SlotSize; + // Find the funclet establisher parameter + unsigned Establisher = X86::NoRegister; + if (IsClrFunclet) + Establisher = Uses64BitFramePtr ? 
X86::RCX : X86::ECX; + else if (IsFunclet) + Establisher = Uses64BitFramePtr ? X86::RDX : X86::EDX; + + if (IsWin64Prologue && IsFunclet && !IsClrFunclet) { + // Immediately spill establisher into the home slot. + // The runtime cares about this. + // MOV64mr %rdx, 16(%rsp) + unsigned MOVmr = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MOVmr)), StackPtr, true, 16) + .addReg(Establisher) + .setMIFlag(MachineInstr::FrameSetup); + MBB.addLiveIn(Establisher); + } + if (HasFP) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; @@ -739,7 +1036,11 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Get the offset of the stack slot for the EBP register, which is // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. // Update the frame offset adjustment. - MFI->setOffsetAdjustment(-NumBytes); + if (!IsFunclet) + MFI->setOffsetAdjustment(-NumBytes); + else + assert(MFI->getOffsetAdjustment() == -(int)NumBytes && + "should calculate same local variable offset for funclets"); // Save EBP/RBP into the appropriate stack slot. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) @@ -765,35 +1066,46 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); } - if (!IsWin64Prologue) { + if (!IsWin64Prologue && !IsFunclet) { // Update EBP with the new base value. BuildMI(MBB, MBBI, DL, TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); - } - if (NeedsDwarfCFI) { - // Mark effective beginning of when frame pointer becomes valid. - // Define the current CFA to use the EBP/RBP register. - unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); - BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); + if (NeedsDwarfCFI) { + // Mark effective beginning of when frame pointer becomes valid. + // Define the current CFA to use the EBP/RBP register. + unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); + BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister( + nullptr, DwarfFramePtr)); + } } - // Mark the FramePtr as live-in in every block. - for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) - I->addLiveIn(MachineFramePtr); + // Mark the FramePtr as live-in in every block. Don't do this again for + // funclet prologues. + if (!IsFunclet) { + for (MachineBasicBlock &EveryMBB : MF) + EveryMBB.addLiveIn(MachineFramePtr); + } } else { + assert(!IsFunclet && "funclets without FPs not yet implemented"); NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } + // For EH funclets, only allocate enough space for outgoing calls. Save the + // NumBytes value that we would've used for the parent frame. + unsigned ParentFrameNumBytes = NumBytes; + if (IsFunclet) + NumBytes = getWinEHFuncletFrameSize(MF); + // Skip the callee-saved push instructions. bool PushedRegs = false; int StackOffset = 2 * stackGrowth; while (MBBI != MBB.end() && + MBBI->getFlag(MachineInstr::FrameSetup) && (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; @@ -818,9 +1130,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Realign stack after we pushed callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Don't do this for Win64, it needs to realign the stack after the prologue. 
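BuildStackAlignAND, referenced in this hunk, realigns by AND-ing the chosen stack register with -MaxAlign, which rounds the pointer down to the requested power-of-two boundary. The helper below shows that arithmetic on a host integer; the values are illustrative only.

    #include <cassert>
    #include <cstdint>

    // Rounding a stack pointer down to a power-of-two alignment: AND with the
    // two's-complement negation of the alignment clears the low bits, exactly
    // what the emitted "AND $-MaxAlign, %rsp" does.
    static uint64_t alignStackDown(uint64_t SP, uint64_t MaxAlign) {
      return SP & ~(MaxAlign - 1); // identical to SP & (0 - MaxAlign)
    }

    int main() {
      assert(alignStackDown(0x7fff1238, 32) == 0x7fff1220);
      assert(alignStackDown(0x7fff1220, 32) == 0x7fff1220); // already aligned
      return 0;
    }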
- if (!IsWin64Prologue && TRI->needsStackRealignment(MF)) { + if (!IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); - BuildStackAlignAND(MBB, MBBI, DL, MaxAlign); + BuildStackAlignAND(MBB, MBBI, DL, StackPtr, MaxAlign); } // If there is an SUB32ri of ESP immediately before this instruction, merge @@ -839,7 +1151,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. uint64_t AlignedNumBytes = NumBytes; - if (IsWin64Prologue && TRI->needsStackRealignment(MF)) + if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) AlignedNumBytes = RoundUpToAlignment(AlignedNumBytes, MaxAlign); if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { // Check whether EAX is livein for this function. @@ -876,26 +1188,18 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. // We'll also use 4 already allocated bytes for EAX. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) - .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) - .setMIFlag(MachineInstr::FrameSetup); + .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) + .setMIFlag(MachineInstr::FrameSetup); } - // Save a pointer to the MI where we set AX. - MachineBasicBlock::iterator SetRAX = MBBI; - --SetRAX; - // Call __chkstk, __chkstk_ms, or __alloca. - emitStackProbeCall(MF, MBB, MBBI, DL); - - // Apply the frame setup flag to all inserted instrs. - for (; SetRAX != MBBI; ++SetRAX) - SetRAX->setFlag(MachineInstr::FrameSetup); + emitStackProbe(MF, MBB, MBBI, DL, true); if (isEAXAlive) { // Restore EAX - MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), - X86::EAX), - StackPtr, false, NumBytes - 4); + MachineInstr *MI = + addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX), + StackPtr, false, NumBytes - 4); MI->setFlag(MachineInstr::FrameSetup); MBB.insert(MBBI, MI); } @@ -909,19 +1213,72 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameSetup); int SEHFrameOffset = 0; + unsigned SPOrEstablisher; + if (IsFunclet) { + if (IsClrFunclet) { + // The establisher parameter passed to a CLR funclet is actually a pointer + // to the (mostly empty) frame of its nearest enclosing funclet; we have + // to find the root function establisher frame by loading the PSPSym from + // the intermediate frame. + unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); + MachinePointerInfo NoInfo; + MBB.addLiveIn(Establisher); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher), + Establisher, false, PSPSlotOffset) + .addMemOperand(MF.getMachineMemOperand( + NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize)); + ; + // Save the root establisher back into the current funclet's (mostly + // empty) frame, in case a sub-funclet or the GC needs it. + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, + false, PSPSlotOffset) + .addReg(Establisher) + .addMemOperand( + MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore | + MachineMemOperand::MOVolatile, + SlotSize, SlotSize)); + } + SPOrEstablisher = Establisher; + } else { + SPOrEstablisher = StackPtr; + } + if (IsWin64Prologue && HasFP) { - SEHFrameOffset = calculateSetFPREG(NumBytes); + // Set RBP to a small fixed offset from RSP. 
In the funclet case, we base + // this calculation on the incoming establisher, which holds the value of + // RSP from the parent frame at the end of the prologue. + SEHFrameOffset = calculateSetFPREG(ParentFrameNumBytes); if (SEHFrameOffset) addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), FramePtr), - StackPtr, false, SEHFrameOffset); + SPOrEstablisher, false, SEHFrameOffset); else - BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr).addReg(StackPtr); + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rr), FramePtr) + .addReg(SPOrEstablisher); - if (NeedsWinCFI) + // If this is not a funclet, emit the CFI describing our frame pointer. + if (NeedsWinCFI && !IsFunclet) { BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) .addImm(SEHFrameOffset) .setMIFlag(MachineInstr::FrameSetup); + if (isAsynchronousEHPersonality(Personality)) + MF.getWinEHFuncInfo()->SEHSetFrameOffset = SEHFrameOffset; + } + } else if (IsFunclet && STI.is32Bit()) { + // Reset EBP / ESI to something good for funclets. + MBBI = restoreWin32EHStackPointers(MBB, MBBI, DL); + // If we're a catch funclet, we can be returned to via catchret. Save ESP + // into the registration node so that the runtime will restore it for us. + if (!MBB.isCleanupFuncletEntry()) { + assert(Personality == EHPersonality::MSVC_CXX); + unsigned FrameReg; + int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex; + int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg); + // ESP is the first field, so no extra displacement is needed. + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32mr)), FrameReg, + false, EHRegOffset) + .addReg(X86::ESP); + } } while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) { @@ -932,7 +1289,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, int FI; if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) { if (X86::FR64RegClass.contains(Reg)) { - int Offset = getFrameIndexOffset(MF, FI); + unsigned IgnoredFrameReg; + int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg); Offset += SEHFrameOffset; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) @@ -948,14 +1306,33 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) .setMIFlag(MachineInstr::FrameSetup); + if (FnHasClrFunclet && !IsFunclet) { + // Save the so-called Initial-SP (i.e. the value of the stack pointer + // immediately after the prolog) into the PSPSlot so that funclets + // and the GC can recover it. + unsigned PSPSlotOffset = getPSPSlotOffsetFromSP(MF); + auto PSPInfo = MachinePointerInfo::getFixedStack( + MF, MF.getWinEHFuncInfo()->PSPSymFrameIdx); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr, false, + PSPSlotOffset) + .addReg(StackPtr) + .addMemOperand(MF.getMachineMemOperand( + PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile, + SlotSize, SlotSize)); + } + // Realign stack after we spilled callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). // Win64 requires aligning the stack after the prologue. if (IsWin64Prologue && TRI->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); - BuildStackAlignAND(MBB, MBBI, DL, MaxAlign); + BuildStackAlignAND(MBB, MBBI, DL, SPOrEstablisher, MaxAlign); } + // We already dealt with stack realignment and funclets above. + if (IsFunclet && STI.is32Bit()) + return; + // If we need a base pointer, set it up here. 
It's whatever the value // of the stack pointer is at this point. Any variable size objects // will be allocated after this, so we can still use the base pointer @@ -964,7 +1341,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // Update the base pointer with the current stack pointer. unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) - .addReg(StackPtr) + .addReg(SPOrEstablisher) .setMIFlag(MachineInstr::FrameSetup); if (X86FI->getRestoreBasePointer()) { // Stash value of base pointer. Saving RSP instead of EBP shortens @@ -972,18 +1349,21 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true, X86FI->getRestoreBasePointerOffset()) - .addReg(StackPtr) + .addReg(SPOrEstablisher) .setMIFlag(MachineInstr::FrameSetup); } - if (X86FI->getHasSEHFramePtrSave()) { + if (X86FI->getHasSEHFramePtrSave() && !IsFunclet) { // Stash the value of the frame pointer relative to the base pointer for // Win32 EH. This supports Win32 EH, which does the inverse of the above: // it recovers the frame pointer from the base pointer rather than the // other way around. unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr; - addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), BasePtr, true, - getFrameIndexOffset(MF, X86FI->getSEHFramePtrSaveIndex())) + unsigned UsedReg; + int Offset = + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); + assert(UsedReg == BasePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), UsedReg, true, Offset) .addReg(FramePtr) .setMIFlag(MachineInstr::FrameSetup); } @@ -1015,6 +1395,69 @@ bool X86FrameLowering::canUseLEAForSPInEpilogue( return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() || hasFP(MF); } +static bool isFuncletReturnInstr(MachineInstr *MI) { + switch (MI->getOpcode()) { + case X86::CATCHRET: + case X86::CLEANUPRET: + return true; + default: + return false; + } + llvm_unreachable("impossible"); +} + +// CLR funclets use a special "Previous Stack Pointer Symbol" slot on the +// stack. It holds a pointer to the bottom of the root function frame. The +// establisher frame pointer passed to a nested funclet may point to the +// (mostly empty) frame of its parent funclet, but it will need to find +// the frame of the root function to access locals. To facilitate this, +// every funclet copies the pointer to the bottom of the root function +// frame into a PSPSym slot in its own (mostly empty) stack frame. Using the +// same offset for the PSPSym in the root function frame that's used in the +// funclets' frames allows each funclet to dynamically accept any ancestor +// frame as its establisher argument (the runtime doesn't guarantee the +// immediate parent for some reason lost to history), and also allows the GC, +// which uses the PSPSym for some bookkeeping, to find it in any funclet's +// frame with only a single offset reported for the entire method. 
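The two helpers that follow, getPSPSlotOffsetFromSP and getWinEHFuncletFrameSize, turn this PSPSym scheme into a single allocation size per funclet. The arithmetic is easier to see in isolation; the sketch below assumes a 16-byte stack alignment and an 8-byte slot, as on x86-64, and the example offsets are illustrative rather than taken from a real frame.

    #include <cassert>

    // Round Value up to the next multiple of Align (Align is a power of two).
    static unsigned roundUpToAlignment(unsigned Value, unsigned Align) {
      return (Value + Align - 1) & ~(Align - 1);
    }

    // Mirrors getWinEHFuncletFrameSize: a funclet allocates enough stack for
    // its own needs (the PSPSym slot for CoreCLR, otherwise just outgoing call
    // arguments), padded so that the pushed CSRs plus the allocation stay
    // 16-byte aligned, with the CSR pushes themselves not counted twice.
    static unsigned funcletFrameSize(unsigned PushedCSRBytes, unsigned UsedSize,
                                     unsigned StackAlign = 16) {
      unsigned FrameSizeMinusRBP =
          roundUpToAlignment(PushedCSRBytes + UsedSize, StackAlign);
      return FrameSizeMinusRBP - PushedCSRBytes;
    }

    int main() {
      // CoreCLR funclet: hypothetical PSPSym slot at SP+32 plus one 8-byte slot,
      // no CSRs pushed besides RBP (which is excluded from the count).
      assert(funcletFrameSize(/*PushedCSRBytes=*/0, /*UsedSize=*/32 + 8) == 48);
      // C++ EH funclet with 8 bytes of CSRs and 16 bytes of outgoing arguments.
      assert(funcletFrameSize(8, 16) == 24);
      return 0;
    }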
+unsigned +X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const { + const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo(); + // getFrameIndexReferenceFromSP has an out ref parameter for the stack + // pointer register; pass a dummy that we ignore + unsigned SPReg; + int Offset = getFrameIndexReferenceFromSP(MF, Info.PSPSymFrameIdx, SPReg); + assert(Offset >= 0); + return static_cast<unsigned>(Offset); +} + +unsigned +X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { + // This is the size of the pushed CSRs. + unsigned CSSize = + MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + // This is the amount of stack a funclet needs to allocate. + unsigned UsedSize; + EHPersonality Personality = + classifyEHPersonality(MF.getFunction()->getPersonalityFn()); + if (Personality == EHPersonality::CoreCLR) { + // CLR funclets need to hold enough space to include the PSPSym, at the + // same offset from the stack pointer (immediately after the prolog) as it + // resides at in the main function. + UsedSize = getPSPSlotOffsetFromSP(MF) + SlotSize; + } else { + // Other funclets just need enough stack for outgoing call arguments. + UsedSize = MF.getFrameInfo()->getMaxCallFrameSize(); + } + // RBP is not included in the callee saved register block. After pushing RBP, + // everything is 16 byte aligned. Everything we allocate before an outgoing + // call must also be 16 byte aligned. + unsigned FrameSizeMinusRBP = + RoundUpToAlignment(CSSize + UsedSize, getStackAlignment()); + // Subtract out the size of the callee saved registers. This is how much stack + // each funclet will allocate. + return FrameSizeMinusRBP - CSSize; +} + void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -1027,12 +1470,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, const bool Is64BitILP32 = STI.isTarget64BitILP32(); unsigned FramePtr = TRI->getFrameRegister(MF); unsigned MachineFramePtr = - Is64BitILP32 ? getX86SubSuperRegister(FramePtr, MVT::i64, false) - : FramePtr; + Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinCFI = IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry(); + bool IsFunclet = isFuncletReturnInstr(MBBI); + MachineBasicBlock *TargetMBB = nullptr; // Get the number of bytes to allocate from the FrameInfo. uint64_t StackSize = MFI->getStackSize(); @@ -1040,7 +1484,27 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t NumBytes = 0; - if (hasFP(MF)) { + if (MBBI->getOpcode() == X86::CATCHRET) { + // SEH shouldn't use catchret. + assert(!isAsynchronousEHPersonality( + classifyEHPersonality(MF.getFunction()->getPersonalityFn())) && + "SEH should not use CATCHRET"); + + NumBytes = getWinEHFuncletFrameSize(MF); + assert(hasFP(MF) && "EH funclets without FP not yet implemented"); + TargetMBB = MBBI->getOperand(0).getMBB(); + + // Pop EBP. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (MBBI->getOpcode() == X86::CLEANUPRET) { + NumBytes = getWinEHFuncletFrameSize(MF); + assert(hasFP(MF) && "EH funclets without FP not yet implemented"); + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? 
X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (hasFP(MF)) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; NumBytes = FrameSize - CSSize; @@ -1052,7 +1516,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Pop EBP. BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr); + TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); } else { NumBytes = StackSize - CSSize; } @@ -1063,26 +1528,50 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator PI = std::prev(MBBI); unsigned Opc = PI->getOpcode(); - if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE && - !PI->isTerminator()) + if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) && + (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) && + Opc != X86::DBG_VALUE && !PI->isTerminator()) break; --MBBI; } MachineBasicBlock::iterator FirstCSPop = MBBI; + if (TargetMBB) { + // Fill EAX/RAX with the address of the target block. + unsigned ReturnReg = STI.is64Bit() ? X86::RAX : X86::EAX; + if (STI.is64Bit()) { + // LEA64r TargetMBB(%rip), %rax + BuildMI(MBB, FirstCSPop, DL, TII.get(X86::LEA64r), ReturnReg) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(TargetMBB) + .addReg(0); + } else { + // MOV32ri $TargetMBB, %eax + BuildMI(MBB, FirstCSPop, DL, TII.get(X86::MOV32ri), ReturnReg) + .addMBB(TargetMBB); + } + // Record that we've taken the address of TargetMBB and no longer just + // reference it in a terminator. + TargetMBB->setHasAddressTaken(); + } + if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); // If there is an ADD32ri or SUB32ri of ESP immediately before this // instruction, merge the two instructions. if (NumBytes || MFI->hasVarSizedObjects()) - mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes); + NumBytes += mergeSPUpdates(MBB, MBBI, true); // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was - // realigned. - if (TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { + // realigned. Don't do this if this was a funclet epilogue, since the funclets + // will not do realignment or dynamic stack allocation. + if ((TRI->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) && + !IsFunclet) { if (TRI->needsStackRealignment(MF)) MBBI = FirstCSPop; unsigned SEHFrameOffset = calculateSetFPREG(SEHStackAllocAmt); @@ -1134,9 +1623,24 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } } -int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, - int FI) const { +// NOTE: this only has a subset of the full frame index logic. In +// particular, the FI < 0 and AfterFPPop logic is handled in +// X86RegisterInfo::eliminateFrameIndex, but not here. Possibly +// (probably?) it should be moved into here. +int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // We can't calculate offset from frame pointer if the stack is realigned, + // so enforce usage of stack/base pointer. The base pointer is used when we + // have dynamic allocas in addition to dynamic realignment. 
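The register choice spelled out in this comment is a three-way priority. A standalone restatement follows; the enum is illustrative, not an LLVM type.

    #include <cassert>

    enum class FrameRefReg { BasePtr, StackPtr, FramePtr };

    // Which register frame-index references are based on:
    //  - the base pointer when dynamic allocas force one to exist,
    //  - the stack pointer when the frame is dynamically realigned (the frame
    //    pointer's distance to locals is then unknown at compile time),
    //  - otherwise the frame register (which the target already resolves to
    //    RSP/ESP when frame-pointer elimination applies).
    static FrameRefReg chooseFrameReference(bool HasBasePointer,
                                            bool NeedsStackRealignment) {
      if (HasBasePointer)
        return FrameRefReg::BasePtr;
      if (NeedsStackRealignment)
        return FrameRefReg::StackPtr;
      return FrameRefReg::FramePtr;
    }

    int main() {
      assert(chooseFrameReference(false, false) == FrameRefReg::FramePtr);
      assert(chooseFrameReference(false, true) == FrameRefReg::StackPtr);
      assert(chooseFrameReference(true, true) == FrameRefReg::BasePtr);
      return 0;
    }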
+ if (TRI->hasBasePointer(MF)) + FrameReg = TRI->getBaseRegister(); + else if (TRI->needsStackRealignment(MF)) + FrameReg = TRI->getStackRegister(); + else + FrameReg = TRI->getFrameRegister(MF); + // Offset will hold the offset from the stack pointer at function entry to the // object. // We need to factor in additional offsets applied during the prologue to the @@ -1207,48 +1711,62 @@ int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, return Offset + FPDelta; } -int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, - unsigned &FrameReg) const { - // We can't calculate offset from frame pointer if the stack is realigned, - // so enforce usage of stack/base pointer. The base pointer is used when we - // have dynamic allocas in addition to dynamic realignment. - if (TRI->hasBasePointer(MF)) - FrameReg = TRI->getBaseRegister(); - else if (TRI->needsStackRealignment(MF)) - FrameReg = TRI->getStackRegister(); - else - FrameReg = TRI->getFrameRegister(MF); - return getFrameIndexOffset(MF, FI); -} - -// Simplified from getFrameIndexOffset keeping only StackPointer cases -int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const { +// Simplified from getFrameIndexReference keeping only StackPointer cases +int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, + int FI, + unsigned &FrameReg) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); // Does not include any dynamic realign. const uint64_t StackSize = MFI->getStackSize(); { #ifndef NDEBUG - // Note: LLVM arranges the stack as: - // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP) - // > "Stack Slots" (<--SP) - // We can always address StackSlots from RSP. We can usually (unless - // needsStackRealignment) address CSRs from RSP, but sometimes need to - // address them from RBP. FixedObjects can be placed anywhere in the stack - // frame depending on their specific requirements (i.e. we can actually - // refer to arguments to the function which are stored in the *callers* - // frame). As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs - // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject. - - assert(!TRI->hasBasePointer(MF) && "we don't handle this case"); - - // We don't handle tail calls, and shouldn't be seeing them - // either. + // LLVM arranges the stack as follows: + // ... + // ARG2 + // ARG1 + // RETADDR + // PUSH RBP <-- RBP points here + // PUSH CSRs + // ~~~~~~~ <-- possible stack realignment (non-win64) + // ... + // STACK OBJECTS + // ... <-- RSP after prologue points here + // ~~~~~~~ <-- possible stack realignment (win64) + // + // if (hasVarSizedObjects()): + // ... <-- "base pointer" (ESI/RBX) points here + // DYNAMIC ALLOCAS + // ... <-- RSP points here + // + // Case 1: In the simple case of no stack realignment and no dynamic + // allocas, both "fixed" stack objects (arguments and CSRs) are addressable + // with fixed offsets from RSP. + // + // Case 2: In the case of stack realignment with no dynamic allocas, fixed + // stack objects are addressed with RBP and regular stack objects with RSP. + // + // Case 3: In the case of dynamic allocas and stack realignment, RSP is used + // to address stack arguments for outgoing calls and nothing else. The "base + // pointer" points to local variables, and RBP points to fixed objects. 
+ // + // In cases 2 and 3, we can only answer for non-fixed stack objects, and the + // answer we give is relative to the SP after the prologue, and not the + // SP in the middle of the function. + + assert((!MFI->isFixedObjectIndex(FI) || !TRI->needsStackRealignment(MF) || + STI.isTargetWin64()) && + "offset from fixed object to SP is not static"); + + // We don't handle tail calls, and shouldn't be seeing them either. int TailCallReturnAddrDelta = MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta(); assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!"); #endif } + // Fill in FrameReg output argument. + FrameReg = TRI->getStackRegister(); + // This is how the math works out: // // %rsp grows (i.e. gets lower) left to right. Each box below is @@ -1280,15 +1798,6 @@ int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int F return Offset + StackSize; } -// Simplified from getFrameIndexReference keeping only StackPointer cases -int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, - int FI, - unsigned &FrameReg) const { - assert(!TRI->hasBasePointer(MF) && "we don't handle this case"); - - FrameReg = TRI->getStackRegister(); - return getFrameIndexOffsetFromSP(MF, FI); -} bool X86FrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, @@ -1358,6 +1867,11 @@ bool X86FrameLowering::spillCalleeSavedRegisters( const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(MI); + // Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI + // for us, and there are no XMM CSRs on Win32. + if (MBB.isEHFuncletEntry() && STI.is32Bit() && STI.isOSWindows()) + return true; + // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { @@ -1399,6 +1913,22 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, if (CSI.empty()) return false; + if (isFuncletReturnInstr(MI) && STI.isOSWindows()) { + // Don't restore CSRs in 32-bit EH funclets. Matches + // spillCalleeSavedRegisters. + if (STI.is32Bit()) + return true; + // Don't restore CSRs before an SEH catchret. SEH except blocks do not form + // funclets. emitEpilogue transforms these to normal jumps. + if (MI->getOpcode() == X86::CATCHRET) { + const Function *Func = MBB.getParent()->getFunction(); + bool IsSEH = isAsynchronousEHPersonality( + classifyEHPersonality(Func->getPersonalityFn())); + if (IsSEH) + return true; + } + } + DebugLoc DL = MBB.findDebugLoc(MI); // Reload XMMs from stack frame. @@ -1420,7 +1950,8 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, !X86::GR32RegClass.contains(Reg)) continue; - BuildMI(MBB, MI, DL, TII.get(Opc), Reg); + BuildMI(MBB, MI, DL, TII.get(Opc), Reg) + .setMIFlag(MachineInstr::FrameDestroy); } return true; } @@ -1450,8 +1981,16 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, } // Spill the BasePtr if it's used. - if (TRI->hasBasePointer(MF)) + if (TRI->hasBasePointer(MF)) { SavedRegs.set(TRI->getBaseRegister()); + + // Allocate a spill slot for EBP if we have a base pointer and EH funclets. 
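The SP-relative math spelled out above ("return Offset + StackSize") comes down to a single addition; a small stand-alone sketch with made-up numbers (the spill-slot handling continues right below):

#include <cstdint>
#include <cstdio>

// Sketch of the computation above: the object's MachineFrameInfo offset
// (often negative) plus the static stack size gives its distance from RSP
// right after the prologue.
static int64_t offsetFromPostPrologueSP(int64_t objectOffset,
                                        uint64_t stackSize) {
  return objectOffset + static_cast<int64_t>(stackSize);
}

int main() {
  // Illustrative only: a 64-byte frame with an object at offset -16 puts
  // that object 48 bytes above the adjusted RSP.
  std::printf("%lld\n", (long long)offsetFromPostPrologueSP(-16, 64));
  return 0;
}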
+ if (MF.getMMI().hasEHFunclets()) { + int FI = MFI->CreateSpillStackObject(SlotSize, SlotSize); + X86FI->setHasSEHFramePtrSave(true); + X86FI->setSEHFramePtrSaveIndex(FI); + } + } } static bool @@ -1545,11 +2084,9 @@ void X86FrameLowering::adjustForSegmentedStacks( // The MOV R10, RAX needs to be in a different block, since the RET we emit in // allocMBB needs to be last (terminating) instruction. - for (MachineBasicBlock::livein_iterator i = PrologueMBB.livein_begin(), - e = PrologueMBB.livein_end(); - i != e; i++) { - allocMBB->addLiveIn(*i); - checkMBB->addLiveIn(*i); + for (const auto &LI : PrologueMBB.liveins()) { + allocMBB->addLiveIn(LI); + checkMBB->addLiveIn(LI); } if (IsNested) @@ -1682,8 +2219,6 @@ void X86FrameLowering::adjustForSegmentedStacks( .addImm(StackSize); BuildMI(allocMBB, DL, TII.get(MOVri), Reg11) .addImm(X86FI->getArgumentStackSize()); - MF.getRegInfo().setPhysRegUsed(Reg10); - MF.getRegInfo().setPhysRegUsed(Reg11); } else { BuildMI(allocMBB, DL, TII.get(X86::PUSHi32)) .addImm(X86FI->getArgumentStackSize()); @@ -1821,11 +2356,9 @@ void X86FrameLowering::adjustForHiPEPrologue( MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); - for (MachineBasicBlock::livein_iterator I = PrologueMBB.livein_begin(), - E = PrologueMBB.livein_end(); - I != E; I++) { - stackCheckMBB->addLiveIn(*I); - incStackMBB->addLiveIn(*I); + for (const auto &LI : PrologueMBB.liveins()) { + stackCheckMBB->addLiveIn(LI); + incStackMBB->addLiveIn(LI); } MF.push_front(incStackMBB); @@ -1870,16 +2403,84 @@ void X86FrameLowering::adjustForHiPEPrologue( .addReg(ScratchReg), PReg, false, SPLimitOffset); BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); - stackCheckMBB->addSuccessor(&PrologueMBB, 99); - stackCheckMBB->addSuccessor(incStackMBB, 1); - incStackMBB->addSuccessor(&PrologueMBB, 99); - incStackMBB->addSuccessor(incStackMBB, 1); + stackCheckMBB->addSuccessor(&PrologueMBB, {99, 100}); + stackCheckMBB->addSuccessor(incStackMBB, {1, 100}); + incStackMBB->addSuccessor(&PrologueMBB, {99, 100}); + incStackMBB->addSuccessor(incStackMBB, {1, 100}); } #ifdef XDEBUG MF.verify(); #endif } +bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, int Offset) const { + + if (Offset <= 0) + return false; + + if (Offset % SlotSize) + return false; + + int NumPops = Offset / SlotSize; + // This is only worth it if we have at most 2 pops. + if (NumPops != 1 && NumPops != 2) + return false; + + // Handle only the trivial case where the adjustment directly follows + // a call. This is the most common one, anyway. + if (MBBI == MBB.begin()) + return false; + MachineBasicBlock::iterator Prev = std::prev(MBBI); + if (!Prev->isCall() || !Prev->getOperand(1).isRegMask()) + return false; + + unsigned Regs[2]; + unsigned FoundRegs = 0; + + auto RegMask = Prev->getOperand(1); + + auto &RegClass = + Is64Bit ? X86::GR64_NOREX_NOSPRegClass : X86::GR32_NOREX_NOSPRegClass; + // Try to find up to NumPops free registers. + for (auto Candidate : RegClass) { + + // Poor man's liveness: + // Since we're immediately after a call, any register that is clobbered + // by the call and not defined by it can be considered dead. 
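A condensed sketch of the two checks adjustStackWithPops applies, with hypothetical inputs standing in for the MachineInstr queries (the register scan itself continues below): the adjustment must be a positive multiple of the slot size covering one or two slots, and a candidate register counts as dead if the preceding call clobbers it without defining it.

#include <cstdio>
#include <set>
#include <string>

// Returns how many POPs would replace an SP adjustment of `offset` bytes,
// or 0 if the trick does not apply (per the checks above).
static int popsForAdjustment(int offset, int slotSize) {
  if (offset <= 0 || offset % slotSize != 0)
    return 0;
  int numPops = offset / slotSize;
  return (numPops == 1 || numPops == 2) ? numPops : 0;
}

// "Poor man's liveness": right after a call, a register the call clobbers
// but does not define can be treated as dead.
static bool isDeadAfterCall(const std::string &reg,
                            const std::set<std::string> &clobbered,
                            const std::set<std::string> &defined) {
  return clobbered.count(reg) != 0 && defined.count(reg) == 0;
}

int main() {
  std::printf("%d\n", popsForAdjustment(16, 8)); // 2 pops on x86-64
  std::printf("%d\n", popsForAdjustment(12, 8)); // 0: not slot-aligned
  std::set<std::string> clob = {"ecx", "edx"}, defs = {"eax"};
  std::printf("%d\n", (int)isDeadAfterCall("ecx", clob, defs)); // 1
  return 0;
}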
+ if (!RegMask.clobbersPhysReg(Candidate)) + continue; + + bool IsDef = false; + for (const MachineOperand &MO : Prev->implicit_operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == Candidate) { + IsDef = true; + break; + } + } + + if (IsDef) + continue; + + Regs[FoundRegs++] = Candidate; + if (FoundRegs == (unsigned)NumPops) + break; + } + + if (FoundRegs == 0) + return false; + + // If we found only one free register, but need two, reuse the same one twice. + while (FoundRegs < (unsigned)NumPops) + Regs[FoundRegs++] = Regs[0]; + + for (int i = 0; i < NumPops; ++i) + BuildMI(MBB, MBBI, DL, + TII.get(STI.is64Bit() ? X86::POP64r : X86::POP32r), Regs[i]); + + return true; +} + void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { @@ -1895,8 +2496,6 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub ESP, <amt>' and the // adjcallstackdown instruction into 'add ESP, <amt>' - if (Amount == 0) - return; // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next @@ -1904,15 +2503,68 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, unsigned StackAlign = getStackAlignment(); Amount = RoundUpToAlignment(Amount, StackAlign); + MachineModuleInfo &MMI = MF.getMMI(); + const Function *Fn = MF.getFunction(); + bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool DwarfCFI = !WindowsCFI && + (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); + + // If we have any exception handlers in this function, and we adjust + // the SP before calls, we may need to indicate this to the unwinder + // using GNU_ARGS_SIZE. Note that this may be necessary even when + // Amount == 0, because the preceding function may have set a non-0 + // GNU_ARGS_SIZE. + // TODO: We don't need to reset this between subsequent functions, + // if it didn't change. + bool HasDwarfEHHandlers = !WindowsCFI && + !MF.getMMI().getLandingPads().empty(); + + if (HasDwarfEHHandlers && !isDestroy && + MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences()) + BuildCFI(MBB, I, DL, + MCCFIInstruction::createGnuArgsSize(nullptr, Amount)); + + if (Amount == 0) + return; + // Factor out the amount that gets handled inside the sequence // (Pushes of argument for frame setup, callee pops for frame destroy) Amount -= InternalAmt; + // TODO: This is needed only if we require precise CFA. + // If this is a callee-pop calling convention, emit a CFA adjust for + // the amount the callee popped. + if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF)) + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt)); + if (Amount) { // Add Amount to SP to destroy a frame, and subtract to setup. int Offset = isDestroy ? Amount : -Amount; - BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false); + + if (!(Fn->optForMinSize() && + adjustStackWithPops(MBB, I, DL, Offset))) + BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false); + } + + if (DwarfCFI && !hasFP(MF)) { + // If we don't have FP, but need to generate unwind information, + // we need to set the correct CFA offset after the stack adjustment. + // How much we adjust the CFA offset depends on whether we're emitting + // CFI only for EH purposes or for debugging. 
EH only requires the CFA + // offset to be correct at each call site, while for debugging we want + // it to be more precise. + int CFAOffset = Amount; + // TODO: When not using precise CFA, we also need to adjust for the + // InternalAmt here. + + if (CFAOffset) { + CFAOffset = isDestroy ? -CFAOffset : CFAOffset; + BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset)); + } } + return; } @@ -1933,12 +2585,136 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { assert(MBB.getParent() && "Block is not attached to a function!"); + // Win64 has strict requirements in terms of epilogue and we are + // not taking a chance at messing with them. + // I.e., unless this block is already an exit block, we can't use + // it as an epilogue. + if (STI.isTargetWin64() && !MBB.succ_empty() && !MBB.isReturnBlock()) + return false; + if (canUseLEAForSPInEpilogue(*MBB.getParent())) return true; // If we cannot use LEA to adjust SP, we may need to use ADD, which - // clobbers the EFLAGS. Check that none of the terminators reads the - // EFLAGS, and if one uses it, conservatively assume this is not + // clobbers the EFLAGS. Check that we do not need to preserve it, + // otherwise, conservatively assume this is not // safe to insert the epilogue here. - return !terminatorsNeedFlagsAsInput(MBB); + return !flagsNeedToBePreservedBeforeTheTerminators(MBB); +} + +bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { + // If we may need to emit frameless compact unwind information, give + // up as this is currently broken: PR25614. + return MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF); +} + +MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool RestoreSP) const { + assert(STI.isTargetWindowsMSVC() && "funclets only supported in MSVC env"); + assert(STI.isTargetWin32() && "EBP/ESI restoration only required on win32"); + assert(STI.is32Bit() && !Uses64BitFramePtr && + "restoring EBP/ESI on non-32-bit target"); + + MachineFunction &MF = *MBB.getParent(); + unsigned FramePtr = TRI->getFrameRegister(MF); + unsigned BasePtr = TRI->getBaseRegister(); + WinEHFuncInfo &FuncInfo = *MF.getWinEHFuncInfo(); + X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // FIXME: Don't set FrameSetup flag in catchret case. 
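Looking back at the call-frame CFA bookkeeping above, the sign convention it applies is compact enough to show as a stand-alone sketch (hypothetical helper, not the LLVM API; the EH register-node handling resumes below):

#include <cassert>

// Sketch of the adjustment emitted around call-frame pseudos when there is
// no frame pointer: tearing a frame down moves the CFA offset back by the
// adjusted amount, setting one up moves it forward; zero emits nothing.
static int cfaAdjustFor(int amount, bool isDestroy) {
  if (amount == 0)
    return 0;
  return isDestroy ? -amount : amount;
}

int main() {
  assert(cfaAdjustFor(32, /*isDestroy=*/false) == 32);
  assert(cfaAdjustFor(32, /*isDestroy=*/true) == -32);
  assert(cfaAdjustFor(0, true) == 0);
  return 0;
}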
+ + int FI = FuncInfo.EHRegNodeFrameIndex; + int EHRegSize = MFI->getObjectSize(FI); + + if (RestoreSP) { + // MOV32rm -EHRegSize(%ebp), %esp + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), X86::ESP), + X86::EBP, true, -EHRegSize) + .setMIFlag(MachineInstr::FrameSetup); + } + + unsigned UsedReg; + int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg); + int EndOffset = -EHRegOffset - EHRegSize; + FuncInfo.EHRegNodeEndOffset = EndOffset; + + if (UsedReg == FramePtr) { + // ADD $offset, %ebp + unsigned ADDri = getADDriOpcode(false, EndOffset); + BuildMI(MBB, MBBI, DL, TII.get(ADDri), FramePtr) + .addReg(FramePtr) + .addImm(EndOffset) + .setMIFlag(MachineInstr::FrameSetup) + ->getOperand(3) + .setIsDead(); + assert(EndOffset >= 0 && + "end of registration object above normal EBP position!"); + } else if (UsedReg == BasePtr) { + // LEA offset(%ebp), %esi + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::LEA32r), BasePtr), + FramePtr, false, EndOffset) + .setMIFlag(MachineInstr::FrameSetup); + // MOV32rm SavedEBPOffset(%esi), %ebp + assert(X86FI->getHasSEHFramePtrSave()); + int Offset = + getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg); + assert(UsedReg == BasePtr); + addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32rm), FramePtr), + UsedReg, true, Offset) + .setMIFlag(MachineInstr::FrameSetup); + } else { + llvm_unreachable("32-bit frames with WinEH must use FramePtr or BasePtr"); + } + return MBBI; +} + +unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const { + // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue. + unsigned Offset = 16; + // RBP is immediately pushed. + Offset += SlotSize; + // All callee-saved registers are then pushed. + Offset += MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize(); + // Every funclet allocates enough stack space for the largest outgoing call. + Offset += getWinEHFuncletFrameSize(MF); + return Offset; +} + +void X86FrameLowering::processFunctionBeforeFrameFinalized( + MachineFunction &MF, RegScavenger *RS) const { + // If this function isn't doing Win64-style C++ EH, we don't need to do + // anything. + const Function *Fn = MF.getFunction(); + if (!STI.is64Bit() || !MF.getMMI().hasEHFunclets() || + classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX) + return; + + // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset + // relative to RSP after the prologue. Find the offset of the last fixed + // object, so that we can allocate a slot immediately following it. If there + // were no fixed objects, use offset -SlotSize, which is immediately after the + // return address. Fixed objects have negative frame indices. + MachineFrameInfo *MFI = MF.getFrameInfo(); + int64_t MinFixedObjOffset = -SlotSize; + for (int I = MFI->getObjectIndexBegin(); I < 0; ++I) + MinFixedObjOffset = std::min(MinFixedObjOffset, MFI->getObjectOffset(I)); + + int64_t UnwindHelpOffset = MinFixedObjOffset - SlotSize; + int UnwindHelpFI = + MFI->CreateFixedObject(SlotSize, UnwindHelpOffset, /*Immutable=*/false); + MF.getWinEHFuncInfo()->UnwindHelpFrameIdx = UnwindHelpFI; + + // Store -2 into UnwindHelp on function entry. We have to scan forwards past + // other frame setup instructions. 
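The slot-placement arithmetic just described, as a self-contained sketch with illustrative offsets (the scan past the FrameSetup instructions follows below): the UnwindHelp slot lands one slot below the lowest fixed object, starting from -SlotSize when there are no fixed objects.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Fixed objects sit at negative offsets; place UnwindHelp one slot below
// the lowest of them, or below -SlotSize if there are none.
static int64_t unwindHelpOffset(const std::vector<int64_t> &fixedObjOffsets,
                                int64_t slotSize) {
  int64_t minFixed = -slotSize;
  for (int64_t off : fixedObjOffsets)
    minFixed = std::min(minFixed, off);
  return minFixed - slotSize;
}

int main() {
  std::printf("%lld\n", (long long)unwindHelpOffset({}, 8));        // -16
  std::printf("%lld\n", (long long)unwindHelpOffset({-8, -24}, 8)); // -32
  return 0;
}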
+ MachineBasicBlock &MBB = MF.front(); + auto MBBI = MBB.begin(); + while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) + ++MBBI; + + DebugLoc DL = MBB.findDebugLoc(MBBI); + addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)), + UnwindHelpFI) + .addImm(-2); } diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h index 495cfcd..3ab41b4 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -47,11 +47,17 @@ public: unsigned StackPtr; - /// Emit a call to the target's stack probe function. This is required for all + /// Emit target stack probe code. This is required for all /// large stack allocations on Windows. The caller is required to materialize - /// the number of bytes to probe in RAX/EAX. - void emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL) const; + /// the number of bytes to probe in RAX/EAX. Returns instruction just + /// after the expansion. + MachineInstr *emitStackProbe(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + bool InProlog) const; + + /// Replace a StackProbe inline-stub with the actual probe code inline. + void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologMBB) const override; void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -91,11 +97,9 @@ public: bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; bool needsFrameIndexResolution(const MachineFunction &MF) const override; - int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; - int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const; int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; @@ -103,6 +107,11 @@ public: MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override; + + void processFunctionBeforeFrameFinalized(MachineFunction &MF, + RegScavenger *RS) const override; + /// Check the instruction before/after the passed instruction. If /// it is an ADD/SUB/LEA instruction it is deleted argument and the /// stack adjustment is returned as a positive value for ADD/LEA and @@ -125,7 +134,9 @@ public: /// \p MBB will be correctly handled by the target. bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; -private: + /// Returns true if the target will correctly handle shrink wrapping. + bool enableShrinkWrapping(const MachineFunction &MF) const override; + /// convertArgMovsToPushes - This method tries to convert a call sequence /// that uses sub and mov instructions to put the argument onto the stack /// into a series of pushes. @@ -135,22 +146,56 @@ private: MachineBasicBlock::iterator I, uint64_t Amount) const; - uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; - /// Wraps up getting a CFI index and building a MachineInstr for it. void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, MCCFIInstruction CFIInst) const; + /// Sets up EBP and optionally ESI based on the incoming EBP value. Only + /// needed for 32-bit. Used in funclet prologues and at catchret destinations. 
+ MachineBasicBlock::iterator + restoreWin32EHStackPointers(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + bool RestoreSP = false) const; + +private: + uint64_t calculateMaxStackAlign(const MachineFunction &MF) const; + + /// Emit target stack probe as a call to a helper function + MachineInstr *emitStackProbeCall(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Emit target stack probe as an inline sequence. + MachineInstr *emitStackProbeInline(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + + /// Emit a stub to later inline the target stack probe. + MachineInstr *emitStackProbeInlineStub(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, bool InProlog) const; + /// Aligns the stack pointer by ANDing it with -MaxAlign. void BuildStackAlignAND(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, - uint64_t MaxAlign) const; + unsigned Reg, uint64_t MaxAlign) const; + + /// Make small positive stack adjustments using POPs. + bool adjustStackWithPops(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc DL, + int Offset) const; /// Adjusts the stack pointer using LEA, SUB, or ADD. MachineInstrBuilder BuildStackAdjustment(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL, int64_t Offset, bool InEpilogue) const; + + unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const; + + unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index d5351d2..4414e47 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -46,9 +46,8 @@ STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); //===----------------------------------------------------------------------===// namespace { - /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses - /// SDValue's instead of register numbers for the leaves of the matched - /// tree. + /// This corresponds to X86AddressMode, but uses SDValue's instead of register + /// numbers for the leaves of the matched tree. struct X86ISelAddressMode { enum { RegBase, @@ -87,8 +86,7 @@ namespace { IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; } - /// isRIPRelative - Return true if this addressing mode is already RIP - /// relative. + /// Return true if this addressing mode is already RIP-relative. bool isRIPRelative() const { if (BaseType != RegBase) return false; if (RegisterSDNode *RegNode = @@ -147,21 +145,25 @@ namespace { namespace { //===--------------------------------------------------------------------===// - /// ISel - X86 specific code to select X86 machine instructions for + /// ISel - X86-specific code to select X86 machine instructions for /// SelectionDAG operations. /// class X86DAGToDAGISel final : public SelectionDAGISel { - /// Subtarget - Keep a pointer to the X86Subtarget around so that we can + /// Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - /// OptForSize - If true, selector should try to optimize for code size - /// instead of performance. 
+ /// If true, selector should try to optimize for code size instead of + /// performance. bool OptForSize; + /// If true, selector should try to optimize for minimum code size. + bool OptForMinSize; + public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), OptForSize(false) {} + : SelectionDAGISel(tm, OptLevel), OptForSize(false), + OptForMinSize(false) {} const char *getPassName() const override { return "X86 DAG->DAG Instruction Selection"; @@ -184,8 +186,7 @@ namespace { return isInt<8>(cast<ConstantSDNode>(N)->getSExtValue()); } - // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit - // sign extended field. + // True if the 64-bit immediate fits in a 32-bit sign-extended field. inline bool i64immSExt32(SDNode *N) const { uint64_t v = cast<ConstantSDNode>(N)->getZExtValue(); return (int64_t)v == (int32_t)v; @@ -196,50 +197,50 @@ namespace { private: SDNode *Select(SDNode *N) override; - SDNode *SelectGather(SDNode *N, unsigned Opc); - SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT); - - bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); - bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); - bool MatchWrapper(SDValue N, X86ISelAddressMode &AM); - bool MatchAddress(SDValue N, X86ISelAddressMode &AM); - bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, + SDNode *selectGather(SDNode *N, unsigned Opc); + SDNode *selectAtomicLoadArith(SDNode *Node, MVT NVT); + + bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); + bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); + bool matchWrapper(SDValue N, X86ISelAddressMode &AM); + bool matchAddress(SDValue N, X86ISelAddressMode &AM); + bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth); + bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth); - bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM); - bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, + bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); + bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectMOV64Imm32(SDValue N, SDValue &Imm); - bool SelectLEAAddr(SDValue N, SDValue &Base, + bool selectMOV64Imm32(SDValue N, SDValue &Imm); + bool selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectLEA64_32Addr(SDValue N, SDValue &Base, + bool selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectTLSADDRAddr(SDValue N, SDValue &Base, + bool selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - bool SelectScalarSSELoad(SDNode *Root, SDValue N, + bool selectScalarSSELoad(SDNode *Root, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, SDValue &NodeWithChain); - bool TryFoldLoad(SDNode *P, SDValue N, + bool tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for - /// inline asm expressions. 
+ /// Implement addressing mode selection for inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector<SDValue> &OutOps) override; - void EmitSpecialCodeForMain(); + void emitSpecialCodeForMain(); inline void getAddressOperands(X86ISelAddressMode &AM, SDLoc DL, SDValue &Base, SDValue &Scale, @@ -252,7 +253,7 @@ namespace { : AM.Base_Reg; Scale = getI8Imm(AM.Scale, DL); Index = AM.IndexReg; - // These are 32-bit even in 64-bit mode since RIP relative offset + // These are 32-bit even in 64-bit mode since RIP-relative offset // is 32-bit. if (AM.GV) Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), @@ -283,32 +284,105 @@ namespace { Segment = CurDAG->getRegister(0, MVT::i32); } - /// getI8Imm - Return a target constant with the specified value, of type - /// i8. + // Utility function to determine whether we should avoid selecting + // immediate forms of instructions for better code size or not. + // At a high level, we'd like to avoid such instructions when + // we have similar constants used within the same basic block + // that can be kept in a register. + // + bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { + uint32_t UseCount = 0; + + // Do not want to hoist if we're not optimizing for size. + // TODO: We'd like to remove this restriction. + // See the comment in X86InstrInfo.td for more info. + if (!OptForSize) + return false; + + // Walk all the users of the immediate. + for (SDNode::use_iterator UI = N->use_begin(), + UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) { + + SDNode *User = *UI; + + // This user is already selected. Count it as a legitimate use and + // move on. + if (User->isMachineOpcode()) { + UseCount++; + continue; + } + + // We want to count stores of immediates as real uses. + if (User->getOpcode() == ISD::STORE && + User->getOperand(1).getNode() == N) { + UseCount++; + continue; + } + + // We don't currently match users that have > 2 operands (except + // for stores, which are handled above) + // Those instruction won't match in ISEL, for now, and would + // be counted incorrectly. + // This may change in the future as we add additional instruction + // types. + if (User->getNumOperands() != 2) + continue; + + // Immediates that are used for offsets as part of stack + // manipulation should be left alone. These are typically + // used to indicate SP offsets for argument passing and + // will get pulled into stores/pushes (implicitly). + if (User->getOpcode() == X86ISD::ADD || + User->getOpcode() == ISD::ADD || + User->getOpcode() == X86ISD::SUB || + User->getOpcode() == ISD::SUB) { + + // Find the other operand of the add/sub. + SDValue OtherOp = User->getOperand(0); + if (OtherOp.getNode() == N) + OtherOp = User->getOperand(1); + + // Don't count if the other operand is SP. + RegisterSDNode *RegNode; + if (OtherOp->getOpcode() == ISD::CopyFromReg && + (RegNode = dyn_cast_or_null<RegisterSDNode>( + OtherOp->getOperand(1).getNode()))) + if ((RegNode->getReg() == X86::ESP) || + (RegNode->getReg() == X86::RSP)) + continue; + } + + // ... otherwise, count this and move on. + UseCount++; + } + + // If we have more than 1 use, then recommend for hoisting. + return (UseCount > 1); + } + + /// Return a target constant with the specified value of type i8. inline SDValue getI8Imm(unsigned Imm, SDLoc DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i8); } - /// getI32Imm - Return a target constant with the specified value, of type - /// i32. 
+ /// Return a target constant with the specified value, of type i32. inline SDValue getI32Imm(unsigned Imm, SDLoc DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } - /// getGlobalBaseReg - Return an SDNode that returns the value of - /// the global base register. Output instructions required to - /// initialize the global base register, if necessary. - /// + /// Return an SDNode that returns the value of the global base register. + /// Output instructions required to initialize the global base register, + /// if necessary. SDNode *getGlobalBaseReg(); - /// getTargetMachine - Return a reference to the TargetMachine, casted - /// to the target-specific type. + /// Return a reference to the TargetMachine, casted to the target-specific + /// type. const X86TargetMachine &getTargetMachine() const { return static_cast<const X86TargetMachine &>(TM); } - /// getInstrInfo - Return a reference to the TargetInstrInfo, casted - /// to the target-specific type. + /// Return a reference to the TargetInstrInfo, casted to the target-specific + /// type. const X86InstrInfo *getInstrInfo() const { return Subtarget->getInstrInfo(); } @@ -386,9 +460,9 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { return true; } -/// MoveBelowCallOrigChain - Replace the original chain operand of the call with +/// Replace the original chain operand of the call with /// load's chain operand and move load below the call's chain operand. -static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, +static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, SDValue Call, SDValue OrigChain) { SmallVector<SDValue, 8> Ops; SDValue Chain = OrigChain.getOperand(0); @@ -418,7 +492,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, CurDAG->UpdateNodeOperands(Call.getNode(), Ops); } -/// isCalleeLoad - Return true if call address is a load and it can be +/// Return true if call address is a load and it can be /// moved below CALLSEQ_START and the chains leading up to the call. /// Return the CALLSEQ_START by reference as a second output. /// In the case of a tail call, there isn't a callseq node between the call @@ -461,12 +535,14 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { } void X86DAGToDAGISel::PreprocessISelDAG() { - // OptForSize is used in pattern predicates that isel is matching. - OptForSize = MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + // OptFor[Min]Size are used in pattern predicates that isel is matching. + OptForSize = MF->getFunction()->optForSize(); + OptForMinSize = MF->getFunction()->optForMinSize(); + assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize"); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { - SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. + SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. 
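Returning to the immediate-hoisting heuristic in shouldAvoidImmediateInstFormsForSize above, the decision reduces to a small predicate; a sketch under the stated assumptions (the DAG preprocessing loop continues below):

#include <cassert>

// Sketch of the decision above: only when optimizing for size, and only
// when the immediate has more than one use that is not stack-pointer
// arithmetic, do we prefer keeping it in a register.
static bool shouldHoistImmediate(bool optForSize, unsigned nonSPUses) {
  if (!optForSize)
    return false;
  return nonSPUses > 1;
}

int main() {
  assert(!shouldHoistImmediate(false, 3)); // not optimizing for size
  assert(!shouldHoistImmediate(true, 1));  // single use: keep the imm form
  assert(shouldHoistImmediate(true, 2));   // shared constant: hoist it
  return 0;
}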
if (OptLevel != CodeGenOpt::None && // Only does this when target favors doesn't favor register indirect @@ -500,7 +576,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { SDValue Load = N->getOperand(1); if (!isCalleeLoad(Load, Chain, HasCallSeq)) continue; - MoveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); + moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); ++NumLoadMoved; continue; } @@ -577,9 +653,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() { } -/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in -/// the main function. -void X86DAGToDAGISel::EmitSpecialCodeForMain() { +/// Emit any code that needs to be executed only in the main function. +void X86DAGToDAGISel::emitSpecialCodeForMain() { if (Subtarget->isTargetCygMing()) { TargetLowering::ArgListTy Args; auto &DL = CurDAG->getDataLayout(); @@ -599,7 +674,7 @@ void X86DAGToDAGISel::EmitFunctionEntryCode() { // If this is main, emit special code for main. if (const Function *Fn = MF->getFunction()) if (Fn->hasExternalLinkage() && Fn->getName() == "main") - EmitSpecialCodeForMain(); + emitSpecialCodeForMain(); } static bool isDispSafeForFrameIndex(int64_t Val) { @@ -612,7 +687,7 @@ static bool isDispSafeForFrameIndex(int64_t Val) { return isInt<31>(Val); } -bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, +bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM) { // Cannot combine ExternalSymbol displacements with integer offsets. if (Offset != 0 && (AM.ES || AM.MCSym)) @@ -634,7 +709,7 @@ bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, } -bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ +bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ SDValue Address = N->getOperand(1); // load gs:0 -> GS segment register. @@ -658,11 +733,10 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ return true; } -/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes -/// into an addressing mode. These wrap things that will resolve down into a -/// symbol reference. If no match is possible, this returns true, otherwise it -/// returns false. -bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { +/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing +/// mode. These wrap things that will resolve down into a symbol reference. +/// If no match is possible, this returns true, otherwise it returns false. +bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { // If the addressing mode already has a symbol as the displacement, we can // never match another symbol. 
if (AM.hasSymbolicDisplacement()) @@ -685,7 +759,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { X86ISelAddressMode Backup = AM; AM.GV = G->getGlobal(); AM.SymbolFlags = G->getTargetFlags(); - if (FoldOffsetIntoAddress(G->getOffset(), AM)) { + if (foldOffsetIntoAddress(G->getOffset(), AM)) { AM = Backup; return true; } @@ -694,7 +768,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { AM.CP = CP->getConstVal(); AM.Align = CP->getAlignment(); AM.SymbolFlags = CP->getTargetFlags(); - if (FoldOffsetIntoAddress(CP->getOffset(), AM)) { + if (foldOffsetIntoAddress(CP->getOffset(), AM)) { AM = Backup; return true; } @@ -710,7 +784,7 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { X86ISelAddressMode Backup = AM; AM.BlockAddr = BA->getBlockAddress(); AM.SymbolFlags = BA->getTargetFlags(); - if (FoldOffsetIntoAddress(BA->getOffset(), AM)) { + if (foldOffsetIntoAddress(BA->getOffset(), AM)) { AM = Backup; return true; } @@ -758,11 +832,10 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) { return true; } -/// MatchAddress - Add the specified node to the specified addressing mode, -/// returning true if it cannot be done. This just pattern matches for the -/// addressing mode. -bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { - if (MatchAddressRecursively(N, AM, 0)) +/// Add the specified node to the specified addressing mode, returning true if +/// it cannot be done. This just pattern matches for the addressing mode. +bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { + if (matchAddressRecursively(N, AM, 0)) return true; // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has @@ -790,15 +863,49 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { return false; } +bool X86DAGToDAGISel::matchAdd(SDValue N, X86ISelAddressMode &AM, + unsigned Depth) { + // Add an artificial use to this node so that we can keep track of + // it if it gets CSE'd with a different node. + HandleSDNode Handle(N); + + X86ISelAddressMode Backup = AM; + if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) && + !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) + return false; + AM = Backup; + + // Try again after commuting the operands. + if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) && + !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) + return false; + AM = Backup; + + // If we couldn't fold both operands into the address at the same time, + // see if we can just put each operand into a register and fold at least + // the add. + if (AM.BaseType == X86ISelAddressMode::RegBase && + !AM.Base_Reg.getNode() && + !AM.IndexReg.getNode()) { + N = Handle.getValue(); + AM.Base_Reg = N.getOperand(0); + AM.IndexReg = N.getOperand(1); + AM.Scale = 1; + return false; + } + N = Handle.getValue(); + return true; +} + // Insert a node into the DAG at least before the Pos node's position. This // will reposition the node as needed, and will assign it a node ID that is <= // the Pos node's ID. Note that this does *not* preserve the uniqueness of node // IDs! The selection DAG must no longer depend on their uniqueness when this // is used. 
-static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { +static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { if (N.getNode()->getNodeId() == -1 || N.getNode()->getNodeId() > Pos.getNode()->getNodeId()) { - DAG.RepositionNode(Pos.getNode(), N.getNode()); + DAG.RepositionNode(Pos.getNode()->getIterator(), N.getNode()); N.getNode()->setNodeId(Pos.getNode()->getNodeId()); } } @@ -807,7 +914,7 @@ static void InsertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { // safe. This allows us to convert the shift and and into an h-register // extract and a scaled index. Returns false if the simplification is // performed. -static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, +static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { @@ -835,12 +942,12 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. - InsertDAGNode(DAG, N, Eight); - InsertDAGNode(DAG, N, Srl); - InsertDAGNode(DAG, N, NewMask); - InsertDAGNode(DAG, N, And); - InsertDAGNode(DAG, N, ShlCount); - InsertDAGNode(DAG, N, Shl); + insertDAGNode(DAG, N, Eight); + insertDAGNode(DAG, N, Srl); + insertDAGNode(DAG, N, NewMask); + insertDAGNode(DAG, N, And); + insertDAGNode(DAG, N, ShlCount); + insertDAGNode(DAG, N, Shl); DAG.ReplaceAllUsesWith(N, Shl); AM.IndexReg = And; AM.Scale = (1 << ScaleLog); @@ -850,7 +957,7 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this // allows us to fold the shift into this addressing mode. Returns false if the // transform succeeded. -static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, +static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { @@ -880,9 +987,9 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. - InsertDAGNode(DAG, N, NewMask); - InsertDAGNode(DAG, N, NewAnd); - InsertDAGNode(DAG, N, NewShift); + insertDAGNode(DAG, N, NewMask); + insertDAGNode(DAG, N, NewAnd); + insertDAGNode(DAG, N, NewShift); DAG.ReplaceAllUsesWith(N, NewShift); AM.Scale = 1 << ShiftAmt; @@ -917,7 +1024,7 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, // Note that this function assumes the mask is provided as a mask *after* the // value is shifted. The input chain may or may not match that, but computing // such a mask is trivial. -static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, +static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { @@ -973,7 +1080,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, assert(X.getValueType() != VT); // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); - InsertDAGNode(DAG, N, NewX); + insertDAGNode(DAG, N, NewX); X = NewX; } SDLoc DL(N); @@ -987,10 +1094,10 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, // these nodes. 
We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. - InsertDAGNode(DAG, N, NewSRLAmt); - InsertDAGNode(DAG, N, NewSRL); - InsertDAGNode(DAG, N, NewSHLAmt); - InsertDAGNode(DAG, N, NewSHL); + insertDAGNode(DAG, N, NewSRLAmt); + insertDAGNode(DAG, N, NewSRL); + insertDAGNode(DAG, N, NewSHLAmt); + insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); AM.Scale = 1 << AMShiftAmt; @@ -998,7 +1105,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, return false; } -bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, +bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { SDLoc dl(N); DEBUG({ @@ -1007,7 +1114,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, }); // Limit recursion. if (Depth > 5) - return MatchAddressBase(N, AM); + return matchAddressBase(N, AM); // If this is already a %rip relative address, we can only merge immediates // into it. Instead of handling this in every case, we handle it here. @@ -1020,7 +1127,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, return true; if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) - if (!FoldOffsetIntoAddress(Cst->getSExtValue(), AM)) + if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) return false; return true; } @@ -1038,19 +1145,19 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, } case ISD::Constant: { uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue(); - if (!FoldOffsetIntoAddress(Val, AM)) + if (!foldOffsetIntoAddress(Val, AM)) return false; break; } case X86ISD::Wrapper: case X86ISD::WrapperRIP: - if (!MatchWrapper(N, AM)) + if (!matchWrapper(N, AM)) return false; break; case ISD::LOAD: - if (!MatchLoadInAddress(cast<LoadSDNode>(N), AM)) + if (!matchLoadInAddress(cast<LoadSDNode>(N), AM)) return false; break; @@ -1087,7 +1194,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getNode()->getOperand(1)); uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; - if (!FoldOffsetIntoAddress(Disp, AM)) + if (!foldOffsetIntoAddress(Disp, AM)) return false; } @@ -1119,7 +1226,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Try to fold the mask and shift into the scale, and return false if we // succeed. - if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) return false; break; } @@ -1153,7 +1260,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, ConstantSDNode *AddVal = cast<ConstantSDNode>(MulVal.getNode()->getOperand(1)); uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); - if (FoldOffsetIntoAddress(Disp, AM)) + if (foldOffsetIntoAddress(Disp, AM)) Reg = N.getNode()->getOperand(0); } else { Reg = N.getNode()->getOperand(0); @@ -1179,7 +1286,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // Test if the LHS of the sub can be folded. 
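The rewrite named by foldMaskedShiftToScaledMask above rests on a simple bit identity that is easy to check in isolation (the SUB folding continues below): shifting then masking equals masking with the down-shifted mask and then shifting, provided the mask has no bits below the shift amount.

#include <cassert>
#include <cstdint>

// (X << C1) & C2  ==  (X & (C2 >> C1)) << C1  whenever the low C1 bits of
// C2 are zero, which is what lets the shift be folded into the scale.
static bool identityHolds(uint64_t x, unsigned c1, uint64_t c2) {
  return ((x << c1) & c2) == ((x & (c2 >> c1)) << c1);
}

int main() {
  // Mask 0xF0 has its low 4 bits clear, so a shift by 4 can move past it.
  assert(identityHolds(0x1234, 4, 0xF0));
  // Mask 0x18 has its low 3 bits clear relative to a shift by 3.
  assert(identityHolds(0xABCD, 3, 0x18));
  return 0;
}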
X86ISelAddressMode Backup = AM; - if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { + if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) { AM = Backup; break; } @@ -1227,56 +1334,26 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, AM.Scale = 1; // Insert the new nodes into the topological ordering. - InsertDAGNode(*CurDAG, N, Zero); - InsertDAGNode(*CurDAG, N, Neg); + insertDAGNode(*CurDAG, N, Zero); + insertDAGNode(*CurDAG, N, Neg); return false; } - case ISD::ADD: { - // Add an artificial use to this node so that we can keep track of - // it if it gets CSE'd with a different node. - HandleSDNode Handle(N); - - X86ISelAddressMode Backup = AM; - if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && - !MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) - return false; - AM = Backup; - - // Try again after commuting the operands. - if (!MatchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)&& - !MatchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)) + case ISD::ADD: + if (!matchAdd(N, AM, Depth)) return false; - AM = Backup; - - // If we couldn't fold both operands into the address at the same time, - // see if we can just put each operand into a register and fold at least - // the add. - if (AM.BaseType == X86ISelAddressMode::RegBase && - !AM.Base_Reg.getNode() && - !AM.IndexReg.getNode()) { - N = Handle.getValue(); - AM.Base_Reg = N.getOperand(0); - AM.IndexReg = N.getOperand(1); - AM.Scale = 1; - return false; - } - N = Handle.getValue(); break; - } case ISD::OR: - // Handle "X | C" as "X + C" iff X is known to have C bits clear. - if (CurDAG->isBaseWithConstantOffset(N)) { - X86ISelAddressMode Backup = AM; - ConstantSDNode *CN = cast<ConstantSDNode>(N.getOperand(1)); - - // Start with the LHS as an addr mode. - if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) && - !FoldOffsetIntoAddress(CN->getSExtValue(), AM)) - return false; - AM = Backup; - } + // We want to look through a transform in InstCombine and DAGCombiner that + // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. + // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) + // An 'lea' can then be used to match the shift (multiply) and add: + // and $1, %esi + // lea (%rsi, %rdi, 8), %rax + if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && + !matchAdd(N, AM, Depth)) + return false; break; case ISD::AND: { @@ -1299,27 +1376,27 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, uint64_t Mask = N.getConstantOperandVal(1); // Try to fold the mask and shift into an extract and scale. - if (!FoldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) + if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to fold the mask and shift directly into the scale. - if (!FoldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) + if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to swap the mask and shift to place shifts which can be done as // a scale on the outside of the mask. - if (!FoldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) + if (!foldMaskedShiftToScaledMask(*CurDAG, N, Mask, Shift, X, AM)) return false; break; } } - return MatchAddressBase(N, AM); + return matchAddressBase(N, AM); } -/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the +/// Helper for MatchAddress. 
Add the specified node to the /// specified addressing mode without any further recursion. -bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { +bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { // Is the base register already occupied? if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { // If so, check to see if the scale index register is set. @@ -1339,7 +1416,7 @@ bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { return false; } -bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, +bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { @@ -1362,7 +1439,7 @@ bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, // If Base is 0, the whole address is in index and the Scale is 1 if (isa<ConstantSDNode>(Base)) { - assert(dyn_cast<ConstantSDNode>(Base)->isNullValue() && + assert(cast<ConstantSDNode>(Base)->isNullValue() && "Unexpected base in gather/scatter"); Scale = getI8Imm(1, DL); Base = CurDAG->getRegister(0, MVT::i32); @@ -1375,14 +1452,14 @@ bool X86DAGToDAGISel::SelectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, return true; } -/// SelectAddr - returns true if it is able pattern match an addressing mode. +/// Returns true if it is able to pattern match an addressing mode. /// It returns the operands which make up the maximal addressing mode it can /// match by reference. /// /// Parent is the parent node of the addr operand that is being matched. It /// is always a load, store, atomic node, or null. It is only null when /// checking memory operands for inline asm nodes. -bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, +bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; @@ -1404,7 +1481,7 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); } - if (MatchAddress(N, AM)) + if (matchAddress(N, AM)) return false; MVT VT = N.getSimpleValueType(); @@ -1420,14 +1497,14 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, return true; } -/// SelectScalarSSELoad - Match a scalar SSE load. In particular, we want to -/// match a load whose top elements are either undef or zeros. The load flavor -/// is derived from the type of N, which is either v4f32 or v2f64. +/// Match a scalar SSE load. In particular, we want to match a load whose top +/// elements are either undef or zeros. The load flavor is derived from the +/// type of N, which is either v4f32 or v2f64. /// /// We also return: /// PatternChainNode: this is the matched node that has a chain input and /// output. 
-bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, +bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment, @@ -1439,7 +1516,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, IsProfitableToFold(N.getOperand(0), N.getNode(), Root) && IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); - if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) return false; return true; } @@ -1457,7 +1534,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, IsLegalToFold(N.getOperand(0), N.getNode(), Root, OptLevel)) { // Okay, this is a zero extending load. Fold it. LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0)); - if (!SelectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) + if (!selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment)) return false; PatternNodeWithChain = SDValue(LD, 0); return true; @@ -1466,7 +1543,7 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root, } -bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) { +bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { uint64_t ImmVal = CN->getZExtValue(); if ((uint32_t)ImmVal != (uint64_t)ImmVal) @@ -1495,10 +1572,10 @@ bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) { return TM.getCodeModel() == CodeModel::Small; } -bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, +bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { - if (!SelectLEAAddr(N, Base, Scale, Index, Disp, Segment)) + if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) return false; SDLoc DL(N); @@ -1533,9 +1610,9 @@ bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base, return true; } -/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing +/// Calls SelectAddr and determines if the maximal addressing /// mode it matches can be cost effectively emitted as an LEA instruction. -bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, +bool X86DAGToDAGISel::selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { @@ -1546,7 +1623,7 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, SDValue Copy = AM.Segment; SDValue T = CurDAG->getRegister(0, MVT::i32); AM.Segment = T; - if (MatchAddress(N, AM)) + if (matchAddress(N, AM)) return false; assert (T == AM.Segment); AM.Segment = Copy; @@ -1572,13 +1649,12 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, Complexity++; // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA - // to a LEA. This is determined with some expermentation but is by no means + // to a LEA. This is determined with some experimentation but is by no means // optimal (especially for code size consideration). LEA is nice because of // its three-address nature. Tweak the cost function again when we can run // convertToThreeAddress() at register allocation time. if (AM.hasSymbolicDisplacement()) { - // For X86-64, we should always use lea to materialize RIP relative - // addresses. + // For X86-64, always use LEA to materialize RIP-relative addresses. 
if (Subtarget->is64Bit()) Complexity = 4; else @@ -1596,8 +1672,8 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, return true; } -/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes. -bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base, +/// This is only run on TargetGlobalTLSAddress nodes. +bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); @@ -1621,7 +1697,7 @@ bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base, } -bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, +bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { @@ -1630,14 +1706,13 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, !IsLegalToFold(N, P, P, OptLevel)) return false; - return SelectAddr(N.getNode(), + return selectAddr(N.getNode(), N.getOperand(1), Base, Scale, Index, Disp, Segment); } -/// getGlobalBaseReg - Return an SDNode that returns the value of -/// the global base register. Output instructions required to -/// initialize the global base register, if necessary. -/// +/// Return an SDNode that returns the value of the global base register. +/// Output instructions required to initialize the global base register, +/// if necessary. SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); auto &DL = MF->getDataLayout(); @@ -1828,7 +1903,7 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG, return Val; } -SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { +SDNode *X86DAGToDAGISel::selectAtomicLoadArith(SDNode *Node, MVT NVT) { if (Node->hasAnyUseOfValue(0)) return nullptr; @@ -1841,7 +1916,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { SDValue Ptr = Node->getOperand(1); SDValue Val = Node->getOperand(2); SDValue Base, Scale, Index, Disp, Segment; - if (!SelectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment)) + if (!selectAddr(Node, Ptr, Base, Scale, Index, Disp, Segment)) return nullptr; // Which index into the table. @@ -1933,9 +2008,9 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) { return CurDAG->getMergeValues(RetVals, dl).getNode(); } -/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has -/// any uses which require the SF or OF bits to be accurate. -static bool HasNoSignedComparisonUses(SDNode *N) { +/// Test whether the given X86ISD::CMP node has any uses which require the SF +/// or OF bits to be accurate. +static bool hasNoSignedComparisonUses(SDNode *N) { // Examine each user of the node. for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { @@ -1995,9 +2070,8 @@ static bool HasNoSignedComparisonUses(SDNode *N) { return true; } -/// isLoadIncOrDecStore - Check whether or not the chain ending in StoreNode -/// is suitable for doing the {load; increment or decrement; store} to modify -/// transformation. +/// Check whether or not the chain ending in StoreNode is suitable for doing +/// the {load; increment or decrement; store} to modify transformation. 
static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, SDValue StoredVal, SelectionDAG *CurDAG, LoadSDNode* &LoadNode, SDValue &InputChain) { @@ -2081,8 +2155,8 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, return true; } -/// getFusedLdStOpcode - Get the appropriate X86 opcode for an in memory -/// increment or decrement. Opc should be X86ISD::DEC or X86ISD::INC. +/// Get the appropriate X86 opcode for an in-memory increment or decrement. +/// Opc should be X86ISD::DEC or X86ISD::INC. static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { if (Opc == X86ISD::DEC) { if (LdVT == MVT::i64) return X86::DEC64m; @@ -2099,9 +2173,8 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { llvm_unreachable("unrecognized size for LdVT"); } -/// SelectGather - Customized ISel for GATHER operations. -/// -SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) { +/// Customized ISel for GATHER operations. +SDNode *X86DAGToDAGISel::selectGather(SDNode *Node, unsigned Opc) { // Operands of Gather: VSrc, Base, VIdx, VMask, Scale SDValue Chain = Node->getOperand(0); SDValue VSrc = Node->getOperand(2); @@ -2148,6 +2221,27 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { switch (Opcode) { default: break; + case ISD::BRIND: { + if (Subtarget->isTargetNaCl()) + // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We + // leave the instruction alone. + break; + if (Subtarget->isTarget64BitILP32()) { + // Converts a 32-bit register to a 64-bit, zero-extended version of + // it. This is needed because x86-64 can do many things, but jmp %r32 + // ain't one of them. + const SDValue &Target = Node->getOperand(1); + assert(Target.getSimpleValueType() == llvm::MVT::i32); + SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64)); + SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, + Node->getOperand(0), ZextTarget); + ReplaceUses(SDValue(Node, 0), Brind); + SelectCode(ZextTarget.getNode()); + SelectCode(Brind.getNode()); + return nullptr; + } + break; + } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); switch (IntNo) { @@ -2190,7 +2284,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break; case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break; } - SDNode *RetVal = SelectGather(Node, Opc); + SDNode *RetVal = selectGather(Node, Opc); if (RetVal) // We already called ReplaceUses inside SelectGather. return nullptr; @@ -2217,7 +2311,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { case ISD::ATOMIC_LOAD_AND: case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_ADD: { - SDNode *RetVal = SelectAtomicLoadArith(Node, NVT); + SDNode *RetVal = selectAtomicLoadArith(Node, NVT); if (RetVal) return RetVal; break; @@ -2404,10 +2498,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); // Multiply is commmutative. 
if (!foldedLoad) { - foldedLoad = TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (foldedLoad) std::swap(N0, N1); } @@ -2549,7 +2643,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; - bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); + bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); bool signBitIsZero = CurDAG->SignBitIsZero(N0); SDValue InFlag; @@ -2557,7 +2651,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // Special case for div8, just use a move with zero extension to AX to // clear the upper 8 bits (AH). SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain; - if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; Move = SDValue(CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32, @@ -2692,7 +2786,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { SDValue N1 = Node->getOperand(1); if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && - HasNoSignedComparisonUses(Node)) + hasNoSignedComparisonUses(Node)) N0 = N0.getOperand(0); // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to @@ -2709,7 +2803,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // For example, convert "testl %eax, $8" to "testb %al, $8" if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 && (!(C->getZExtValue() & 0x80) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Reg = N0.getNode()->getOperand(0); @@ -2743,7 +2837,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { // For example, "testl %eax, $2048" to "testb %ah, $8". if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 && (!(C->getZExtValue() & 0x8000) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { // Shift the immediate right by 8 bits. SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8, dl, MVT::i8); @@ -2781,7 +2875,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 && N0.getValueType() != MVT::i16 && (!(C->getZExtValue() & 0x8000) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i16); SDValue Reg = N0.getNode()->getOperand(0); @@ -2804,7 +2898,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 && N0.getValueType() == MVT::i64 && (!(C->getZExtValue() & 0x80000000) || - HasNoSignedComparisonUses(Node))) { + hasNoSignedComparisonUses(Node))) { SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i32); SDValue Reg = N0.getNode()->getOperand(0); @@ -2854,7 +2948,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) { break; SDValue Base, Scale, Index, Disp, Segment; - if (!SelectAddr(LoadNode, LoadNode->getBasePtr(), + if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, Segment)) break; @@ -2903,7 +2997,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, case InlineAsm::Constraint_v: // not offsetable ?? 
case InlineAsm::Constraint_m: // memory case InlineAsm::Constraint_X: - if (!SelectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) + if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; } @@ -2916,9 +3010,8 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, return false; } -/// createX86ISelDag - This pass converts a legalized DAG into a -/// X86-specific DAG, ready for instruction scheduling. -/// +/// This pass converts a legalized DAG into a X86-specific DAG, +/// ready for instruction scheduling. FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel) { return new X86DAGToDAGISel(TM, OptLevel); diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index 0f29b51..0927c2f 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -67,19 +68,14 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization( "rather than promotion."), cl::Hidden); -// Forward declarations. -static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, - SDValue V2); - X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - TD = TM.getDataLayout(); + MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); // Set up the TargetLowering object. - static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); @@ -118,13 +114,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); - - // The _ftol2 runtime function has an unusual calling conv, which - // is modeled by a special pseudo-instruction. - setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr); - setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr); - setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr); - setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr); } if (Subtarget->isTargetDarwin()) { @@ -175,14 +164,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); if (Subtarget->is64Bit()) { - setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) + // f32/f64 are legal, f80 is custom. + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); + else + setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); } else if (!Subtarget->useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit - // FILD for other targets. + // FILD or VCVTUSI2SS/SD for other targets. 
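[Editor's note] The UINT_TO_FP actions being adjusted here are exercised by ordinary unsigned-to-floating-point conversions. A small hedged example (hypothetical function name), with the two lowering strategies mentioned in the comments above summarized inline:

    // With AVX-512, the i32 case can use VCVTUSI2SD directly; without it, a
    // common lowering zero-extends the value to 64 bits and performs a signed
    // conversion, since every u32 value is non-negative when viewed as an i64.
    double u32_to_f64(unsigned x) {
      return static_cast<double>(x);
    }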
setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); } @@ -206,23 +199,29 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); } - // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 - // are Legal, f80 is custom lowered. - setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); - setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); - // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); - if (X86ScalarSSEf32) { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); - // f32 and f64 cases are Legal, f80 case is not - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + if (!Subtarget->useSoftFloat()) { + // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 + // are Legal, f80 is custom lowered. + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); + setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); + + if (X86ScalarSSEf32) { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + // f32 and f64 cases are Legal, f80 case is not + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } else { + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + } } else { - setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); - setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); + setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); + setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); + setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); } // Handle FP_TO_UINT by promoting the destination to a larger signed @@ -232,8 +231,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); if (Subtarget->is64Bit()) { - setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); - setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { + // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); + } else { + setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); + setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); + } } else if (!Subtarget->useSoftFloat()) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) @@ -242,14 +247,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // the optimal thing for SSE vs. the default expansion in the legalizer. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); else + // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. // With SSE3 we can use fisttpll to convert to a signed i64; without // SSE, we're stuck with a fistpll. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); - } - if (isTargetFTOL()) { - // Use the _ftol2 runtime function, which has a pseudo-instruction - // to handle its weird calling convention. setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } @@ -274,8 +276,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // (low) operations are left as Legal, as there are single-result // instructions for this in x86. 
Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. - for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { - MVT VT = IntVTs[i]; + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); @@ -295,6 +296,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC , MVT::f32, Expand); setOperationAction(ISD::BR_CC , MVT::f64, Expand); setOperationAction(ISD::BR_CC , MVT::f80, Expand); + setOperationAction(ISD::BR_CC , MVT::f128, Expand); setOperationAction(ISD::BR_CC , MVT::i8, Expand); setOperationAction(ISD::BR_CC , MVT::i16, Expand); setOperationAction(ISD::BR_CC , MVT::i32, Expand); @@ -302,6 +304,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); + setOperationAction(ISD::SELECT_CC , MVT::f128, Expand); setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); @@ -312,7 +315,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); - setOperationAction(ISD::FREM , MVT::f32 , Expand); + + if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) { + // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` + // is. We should promote the value to 64-bits to solve this. + // This is what the CRT headers do - `fmodf` is an inline header + // function casting to f64 and calling `fmod`. 
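[Editor's note] The FREM promotion set up next mirrors what the comment above says the MSVC CRT headers do for fmodf on 32-bit targets. A sketch of that header-style shim; the name and exact form are illustrative, not the real CRT source:

    #include <math.h>

    // The f32 remainder is computed by widening to f64 and calling fmod,
    // which is why promoting FREM f32 matches the CRT behaviour.
    static inline float fmodf_shim(float x, float y) {
      return static_cast<float>(fmod(static_cast<double>(x),
                                     static_cast<double>(y)));
    }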
+ setOperationAction(ISD::FREM , MVT::f32 , Promote); + } else { + setOperationAction(ISD::FREM , MVT::f32 , Expand); + } + setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); @@ -404,15 +417,21 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT , MVT::f32 , Custom); setOperationAction(ISD::SELECT , MVT::f64 , Custom); setOperationAction(ISD::SELECT , MVT::f80 , Custom); + setOperationAction(ISD::SELECT , MVT::f128 , Custom); setOperationAction(ISD::SETCC , MVT::i8 , Custom); setOperationAction(ISD::SETCC , MVT::i16 , Custom); setOperationAction(ISD::SETCC , MVT::i32 , Custom); setOperationAction(ISD::SETCC , MVT::f32 , Custom); setOperationAction(ISD::SETCC , MVT::f64 , Custom); setOperationAction(ISD::SETCC , MVT::f80 , Custom); + setOperationAction(ISD::SETCC , MVT::f128 , Custom); + setOperationAction(ISD::SETCCE , MVT::i8 , Custom); + setOperationAction(ISD::SETCCE , MVT::i16 , Custom); + setOperationAction(ISD::SETCCE , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::SELECT , MVT::i64 , Custom); setOperationAction(ISD::SETCC , MVT::i64 , Custom); + setOperationAction(ISD::SETCCE , MVT::i64 , Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support @@ -456,8 +475,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics - for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { - MVT VT = IntVTs[i]; + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); @@ -473,13 +491,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } - if (Subtarget->is64Bit()) { - setExceptionPointerRegister(X86::RAX); - setExceptionSelectorRegister(X86::RDX); - } else { - setExceptionPointerRegister(X86::EAX); - setExceptionSelectorRegister(X86::EDX); - } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); @@ -492,8 +503,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); - if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) { - // TargetInfo::X86_64ABIBuiltinVaList + if (Subtarget->is64Bit()) { setOperationAction(ISD::VAARG , MVT::Other, Custom); setOperationAction(ISD::VACOPY , MVT::Other, Custom); } else { @@ -505,7 +515,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); - setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(*TD), Custom); + setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. 
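[Editor's note] Stepping back to the f128 actions added earlier in this hunk (BR_CC and SELECT_CC expanded, SELECT and SETCC made Custom): at the source level these are exercised by quad-precision code such as the sketch below, which relies on the GCC/Clang __float128 extension; the function name is hypothetical:

    // f128 compare plus select on x86-64. The comparison is typically
    // expanded to a library call, and choosing between the two f128 values
    // is the SELECT node whose handling is made Custom above.
    __float128 pick_max(__float128 a, __float128 b) {
      return a > b ? a : b;
    }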
setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); @@ -613,8 +623,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); - // Long double always uses X87. + // Long double always uses X87, except f128 in MMX. if (!Subtarget->useSoftFloat()) { + if (Subtarget->is64Bit() && Subtarget->hasMMX()) { + addRegisterClass(MVT::f128, &X86::FR128RegClass); + ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); + setOperationAction(ISD::FABS , MVT::f128, Custom); + setOperationAction(ISD::FNEG , MVT::f128, Custom); + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); + } + addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); @@ -846,15 +864,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); + // ISD::CTTZ v2i64 - scalarization is faster. + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster. + // Custom lower build_vector, vector_shuffle, and extract_vector_elt. - for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - // Do not attempt to custom lower non-power-of-2 vectors - if (!isPowerOf2_32(VT.getVectorNumElements())) - continue; - // Do not attempt to custom lower non-128-bit vectors - if (!VT.is128BitVector()) - continue; + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); @@ -892,13 +912,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
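[Editor's note] The promotion used in the loop that follows (setOperationAction(..., Promote) plus AddPromotedToType to MVT::v2i64) works because SSE2 bitwise instructions ignore element boundaries; a byte-vector AND is just a 128-bit AND. A minimal intrinsics-level illustration with a made-up function name:

    #include <emmintrin.h>

    // A v16i8 AND can be performed as a v2i64 AND after a bitcast: the same
    // 128-bit PAND covers every element width.
    __m128i and_bytes(__m128i a, __m128i b) {
      return _mm_and_si128(a, b);
    }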
- for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - - // Do not attempt to promote non-128-bit vectors - if (!VT.is128BitVector()) - continue; - + for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v2i64); setOperationAction(ISD::OR, VT, Promote); @@ -1036,6 +1050,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRA, MVT::v4i32, Custom); } + if (Subtarget->hasXOP()) { + setOperationAction(ISD::ROTL, MVT::v16i8, Custom); + setOperationAction(ISD::ROTL, MVT::v8i16, Custom); + setOperationAction(ISD::ROTL, MVT::v4i32, Custom); + setOperationAction(ISD::ROTL, MVT::v2i64, Custom); + setOperationAction(ISD::ROTL, MVT::v32i8, Custom); + setOperationAction(ISD::ROTL, MVT::v16i16, Custom); + setOperationAction(ISD::ROTL, MVT::v8i32, Custom); + setOperationAction(ISD::ROTL, MVT::v4i64, Custom); + } + if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) { addRegisterClass(MVT::v32i8, &X86::VR256RegClass); addRegisterClass(MVT::v16i16, &X86::VR256RegClass); @@ -1126,7 +1151,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); - if (Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512()) { + setOperationAction(ISD::CTTZ, MVT::v32i8, Custom); + setOperationAction(ISD::CTTZ, MVT::v16i16, Custom); + setOperationAction(ISD::CTTZ, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + + if (Subtarget->hasAnyFMA()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); @@ -1202,6 +1236,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v8i32, Custom); setOperationAction(ISD::MUL, MVT::v16i16, Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); + + setOperationAction(ISD::SMAX, MVT::v32i8, Custom); + setOperationAction(ISD::SMAX, MVT::v16i16, Custom); + setOperationAction(ISD::SMAX, MVT::v8i32, Custom); + setOperationAction(ISD::UMAX, MVT::v32i8, Custom); + setOperationAction(ISD::UMAX, MVT::v16i16, Custom); + setOperationAction(ISD::UMAX, MVT::v8i32, Custom); + setOperationAction(ISD::SMIN, MVT::v32i8, Custom); + setOperationAction(ISD::SMIN, MVT::v16i16, Custom); + setOperationAction(ISD::SMIN, MVT::v8i32, Custom); + setOperationAction(ISD::UMIN, MVT::v32i8, Custom); + setOperationAction(ISD::UMIN, MVT::v16i16, Custom); + setOperationAction(ISD::UMIN, MVT::v8i32, Custom); } // In the customized shift lowering, the legal cases in AVX2 will be @@ -1243,15 +1290,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget->hasInt256()) setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); - // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 
- for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - - // Do not attempt to promote non-256-bit vectors - if (!VT.is256BitVector()) - continue; - + for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v4i64); setOperationAction(ISD::OR, VT, Promote); @@ -1293,6 +1333,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); setOperationAction(ISD::AND, MVT::i1, Legal); @@ -1311,6 +1352,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v16f32, Legal); setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::FNEG, MVT::v16f32, Custom); + setOperationAction(ISD::FABS, MVT::v16f32, Custom); setOperationAction(ISD::FADD, MVT::v8f64, Legal); setOperationAction(ISD::FSUB, MVT::v8f64, Legal); @@ -1318,19 +1360,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FDIV, MVT::v8f64, Legal); setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::FNEG, MVT::v8f64, Custom); + setOperationAction(ISD::FABS, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); - if (Subtarget->is64Bit()) { - setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); - } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); @@ -1348,12 +1381,62 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); + setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); + setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); + if (Subtarget->hasVLX()){ + setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); + + setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + } else { + setOperationAction(ISD::MLOAD, MVT::v8i32, Custom); + setOperationAction(ISD::MLOAD, MVT::v8f32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8i32, Custom); + setOperationAction(ISD::MSTORE, MVT::v8f32, Custom); + } setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, 
MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom); if (Subtarget->hasDQI()) { - setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); + + setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); + } + } + if (Subtarget->hasVLX()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); } setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); @@ -1386,7 +1469,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v8i1, Custom); @@ -1395,6 +1478,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); @@ -1439,9 +1523,49 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::XOR, MVT::v16i32, Legal); if (Subtarget->hasCDI()) { - setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); - } + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Expand); + + setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); + setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Expand); + 
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Expand); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); + + if (Subtarget->hasVLX()) { + setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); + setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); + setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); + + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); + } else { + setOperationAction(ISD::CTLZ, MVT::v4i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); + setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); + setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Expand); + } + } // Subtarget->hasCDI() + if (Subtarget->hasDQI()) { setOperationAction(ISD::MUL, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i64, Legal); @@ -1455,7 +1579,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); } - if (EltSize >= 32 && VT.getSizeInBits() <= 512) { + if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) { setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } @@ -1481,15 +1605,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); + setOperationAction(ISD::MGATHER, VT, Legal); + setOperationAction(ISD::MSCATTER, VT, Custom); } } - for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { - MVT VT = (MVT::SimpleValueType)i; - - // Do not attempt to promote non-512-bit vectors. 
- if (!VT.is512BitVector()) - continue; - + for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); } @@ -1515,22 +1635,35 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::SELECT, MVT::v32i1, Custom); setOperationAction(ISD::SELECT, MVT::v64i1, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); + setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); setOperationAction(ISD::SMAX, MVT::v64i8, Legal); setOperationAction(ISD::SMAX, MVT::v32i16, Legal); @@ -1541,19 +1674,31 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMIN, MVT::v64i8, Legal); setOperationAction(ISD::UMIN, MVT::v32i16, Legal); - for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { - const MVT VT = (MVT::SimpleValueType)i; + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); + if (Subtarget->hasVLX()) + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); - const unsigned EltSize = VT.getVectorElementType().getSizeInBits(); + if (Subtarget->hasCDI()) { + setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); + setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Expand); + setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Expand); + } - // Do not attempt to promote non-512-bit vectors. 
- if (!VT.is512BitVector()) - continue; + for (auto VT : { MVT::v64i8, MVT::v32i16 }) { + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); - if (EltSize < 32) { - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); - } + setOperationAction(ISD::AND, VT, Promote); + AddPromotedToType (ISD::AND, VT, MVT::v8i64); + setOperationAction(ISD::OR, VT, Promote); + AddPromotedToType (ISD::OR, VT, MVT::v8i64); + setOperationAction(ISD::XOR, VT, Promote); + AddPromotedToType (ISD::XOR, VT, MVT::v8i64); } } @@ -1571,6 +1716,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::v2i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); setOperationAction(ISD::AND, MVT::v8i32, Legal); setOperationAction(ISD::OR, MVT::v8i32, Legal); @@ -1595,8 +1742,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - if (!Subtarget->is64Bit()) + if (!Subtarget->is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); + } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. @@ -1604,9 +1753,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. - for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { + for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { + if (VT == MVT::i64 && !Subtarget->is64Bit()) + continue; // Add/Sub/Mul with overflow operations are custom lowered. - MVT VT = IntVTs[i]; setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); @@ -1615,7 +1765,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMULO, VT, Custom); } - if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. 
setLibcallName(RTLIB::SHL_I128, nullptr); @@ -1658,12 +1807,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); + setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); + setTargetDAGCombine(ISD::FMINNUM); + setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::MSTORE); + setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); @@ -1671,24 +1824,24 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); + setTargetDAGCombine(ISD::MSCATTER); + setTargetDAGCombine(ISD::MGATHER); computeRegisterProperties(Subtarget->getRegisterInfo()); - // On Darwin, -Os means optimize for size without hurting performance, - // do not reduce the limit. MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores - MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; + MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores - MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores - MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + MaxStoresPerMemmoveOptSize = 4; setPrefLoopAlignment(4); // 2^4 bytes. - // Predictable cmov don't hurt on atom because it's in-order. + // A predictable cmov does not hurt on an in-order CPU. + // FIXME: Use a CPU attribute to trigger this, not a CPU model. PredictableSelectIsExpensive = !Subtarget->isAtom(); EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. @@ -1716,40 +1869,43 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, if (!VT.isVector()) return Subtarget->hasAVX512() ? 
MVT::i1: MVT::i8; - const unsigned NumElts = VT.getVectorNumElements(); - const EVT EltVT = VT.getVectorElementType(); - if (VT.is512BitVector()) { - if (Subtarget->hasAVX512()) - if (EltVT == MVT::i32 || EltVT == MVT::i64 || - EltVT == MVT::f32 || EltVT == MVT::f64) - switch(NumElts) { - case 8: return MVT::v8i1; - case 16: return MVT::v16i1; - } - if (Subtarget->hasBWI()) - if (EltVT == MVT::i8 || EltVT == MVT::i16) - switch(NumElts) { - case 32: return MVT::v32i1; - case 64: return MVT::v64i1; - } - } + if (VT.isSimple()) { + MVT VVT = VT.getSimpleVT(); + const unsigned NumElts = VVT.getVectorNumElements(); + const MVT EltVT = VVT.getVectorElementType(); + if (VVT.is512BitVector()) { + if (Subtarget->hasAVX512()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + } + if (Subtarget->hasBWI()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 32: return MVT::v32i1; + case 64: return MVT::v64i1; + } + } - if (VT.is256BitVector() || VT.is128BitVector()) { - if (Subtarget->hasVLX()) - if (EltVT == MVT::i32 || EltVT == MVT::i64 || - EltVT == MVT::f32 || EltVT == MVT::f64) - switch(NumElts) { - case 2: return MVT::v2i1; - case 4: return MVT::v4i1; - case 8: return MVT::v8i1; - } - if (Subtarget->hasBWI() && Subtarget->hasVLX()) - if (EltVT == MVT::i8 || EltVT == MVT::i16) - switch(NumElts) { - case 8: return MVT::v8i1; - case 16: return MVT::v16i1; - case 32: return MVT::v32i1; - } + if (VVT.is256BitVector() || VVT.is128BitVector()) { + if (Subtarget->hasVLX()) + if (EltVT == MVT::i32 || EltVT == MVT::i64 || + EltVT == MVT::f32 || EltVT == MVT::f64) + switch(NumElts) { + case 2: return MVT::v2i1; + case 4: return MVT::v4i1; + case 8: return MVT::v8i1; + } + if (Subtarget->hasBWI() && Subtarget->hasVLX()) + if (EltVT == MVT::i8 || EltVT == MVT::i16) + switch(NumElts) { + case 8: return MVT::v8i1; + case 16: return MVT::v16i1; + case 32: return MVT::v32i1; + } + } } return VT.changeVectorElementTypeToInteger(); @@ -1769,9 +1925,9 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast<StructType>(Ty)) { - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; - getMaxByValAlign(STy->getElementType(i), EltAlign); + getMaxByValAlign(EltTy, EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == 16) @@ -1821,10 +1977,11 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, if ((!IsMemset || ZeroMemset) && !F->hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && - (Subtarget->isUnalignedMemAccessFast() || + (!Subtarget->isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { if (Size >= 32) { + // FIXME: Check if unaligned 32-byte accesses are slow. if (Subtarget->hasInt256()) return MVT::v8i32; if (Subtarget->hasFp256()) @@ -1842,6 +1999,9 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, return MVT::f64; } } + // This is a compromise. If we reach here, unaligned accesses may be slow on + // this target. However, creating smaller, aligned accesses could be even + // slower and would certainly be a lot more code. 
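[Editor's note] A condensed, standalone sketch of the decision getOptimalMemOpType is making above; the enum and function below are illustrative stand-ins, not LLVM API, and the alignment checks are omitted. Wide vector chunks are chosen only when unaligned 16-byte accesses are not known to be slow; otherwise the code falls through to the scalar types returned just below:

    #include <cstdint>

    enum class ChunkTy { I32, I64, V4F32, V8I32 };   // stand-ins for MVTs

    ChunkTy pickMemcpyChunk(uint64_t Size, bool Is64Bit,
                            bool UnalignedMem16Slow, bool HasInt256) {
      if (Size >= 16 && !UnalignedMem16Slow) {
        if (Size >= 32 && HasInt256)
          return ChunkTy::V8I32;                     // 32-byte vector moves
        return ChunkTy::V4F32;                       // 16-byte vector moves
      }
      // Compromise path: unaligned vector accesses may be slow here, so use
      // wider scalar stores rather than many smaller aligned ones.
      return (Is64Bit && Size >= 8) ? ChunkTy::I64 : ChunkTy::I32;
    }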
if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; @@ -1860,8 +2020,22 @@ X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { - if (Fast) - *Fast = Subtarget->isUnalignedMemAccessFast(); + if (Fast) { + switch (VT.getSizeInBits()) { + default: + // 8-byte and under are always assumed to be fast. + *Fast = true; + break; + case 128: + *Fast = !Subtarget->isUnalignedMem16Slow(); + break; + case 256: + *Fast = !Subtarget->isUnalignedMem32Slow(); + break; + // TODO: What about AVX-512 (512-bit) accesses? + } + } + // Misaligned accesses of any size are always allowed. return true; } @@ -1964,6 +2138,32 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, return true; } +Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getSafeStackPointerLocation(IRB); + + // Android provides a fixed TLS slot for the SafeStack pointer. See the + // definition of TLS_SLOT_SAFESTACK in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + unsigned AddressSpace, Offset; + if (Subtarget->is64Bit()) { + // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: + Offset = 0x48; + if (getTargetMachine().getCodeModel() == CodeModel::Kernel) + AddressSpace = 256; + else + AddressSpace = 257; + } else { + // %gs:0x24 on i386 + Offset = 0x24; + AddressSpace = 256; + } + + return ConstantExpr::getIntToPtr( + ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); +} + bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); @@ -1977,11 +2177,9 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, #include "X86GenCallingConv.inc" -bool -X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, - MachineFunction &MF, bool isVarArg, - const SmallVectorImpl<ISD::OutputArg> &Outs, - LLVMContext &Context) const { +bool X86TargetLowering::CanLowerReturn( + CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); @@ -2001,6 +2199,9 @@ X86TargetLowering::LowerReturn(SDValue Chain, MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + if (CallConv == CallingConv::X86_INTR && !Outs.empty()) + report_fatal_error("X86 interrupts may not return any value"); + SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); @@ -2025,7 +2226,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) { - if (ValVT.isVector() && ValVT.getScalarType() == MVT::i1) + if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); @@ -2114,7 +2315,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, if (Flag.getNode()) RetOps.push_back(Flag); - return 
DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps); + X86ISD::NodeType opcode = X86ISD::RET_FLAG; + if (CallConv == CallingConv::X86_INTR) + opcode = X86ISD::IRET; + return DAG.getNode(opcode, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { @@ -2193,7 +2397,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, EVT CopyVT = VA.getLocVT(); // If this is x86-64, and we disabled SSE, we can't return FP values - if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && + if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } @@ -2244,28 +2448,28 @@ enum StructReturnType { StackStructReturn }; static StructReturnType -callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { +callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; - if (Flags.isInReg()) + if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } /// Determines whether a function uses struct return semantics. static StructReturnType -argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { +argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; - if (Flags.isInReg()) + if (Flags.isInReg() || IsMCU) return RegStructReturn; return StackStructReturn; } @@ -2285,17 +2489,34 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, MachinePointerInfo(), MachinePointerInfo()); } -/// Return true if the calling convention is one that -/// supports tail call optimization. -static bool IsTailCallConvention(CallingConv::ID CC) { +/// Return true if the calling convention is one that we can guarantee TCO for. +static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || - CC == CallingConv::HiPE); + CC == CallingConv::HiPE || CC == CallingConv::HHVM); } -/// \brief Return true if the calling convention is a C calling convention. -static bool IsCCallConvention(CallingConv::ID CC) { - return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 || - CC == CallingConv::X86_64_SysV); +/// Return true if we might ever do TCO for calls with this calling convention. +static bool mayTailCallThisCC(CallingConv::ID CC) { + switch (CC) { + // C calling conventions: + case CallingConv::C: + case CallingConv::X86_64_Win64: + case CallingConv::X86_64_SysV: + // Callee pop conventions: + case CallingConv::X86_ThisCall: + case CallingConv::X86_StdCall: + case CallingConv::X86_VectorCall: + case CallingConv::X86_FastCall: + return true; + default: + return canGuaranteeTCO(CC); + } +} + +/// Return true if the function is being made into a tailcall target by +/// changing its ABI. 
+static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { + return GuaranteedTailCallOpt && canGuaranteeTCO(CC); } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { @@ -2306,19 +2527,12 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { CallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); - if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) + if (!mayTailCallThisCC(CalleeCC)) return false; return true; } -/// Return true if the function is being made into -/// a tailcall target by changing its ABI. -static bool FuncIsMadeTailCallSafe(CallingConv::ID CC, - bool GuaranteedTailCallOpt) { - return GuaranteedTailCallOpt && IsTailCallConvention(CC); -} - SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, @@ -2329,7 +2543,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; - bool AlwaysUseMutable = FuncIsMadeTailCallSafe( + bool AlwaysUseMutable = shouldGuaranteeTCO( CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; @@ -2344,6 +2558,19 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, else ValVT = VA.getValVT(); + // Calculate SP offset of interrupt parameter, re-arrange the slot normally + // taken by a return address. + int Offset = 0; + if (CallConv == CallingConv::X86_INTR) { + const X86Subtarget& Subtarget = + static_cast<const X86Subtarget&>(DAG.getSubtarget()); + // X86 interrupts may take one or two arguments. + // On the stack there will be no return address as in regular call. + // Offset of last argument need to be set to -4/-8 bytes. + // Where offset of the first argument out of two, should be set to 0 bytes. + Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); + } + // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they @@ -2352,14 +2579,24 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, VA.getLocMemOffset(), isImmutable); + // Adjust SP offset of interrupt parameter. + if (CallConv == CallingConv::X86_INTR) { + MFI->setObjectOffset(FI, Offset); + } + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); - SDValue Val = DAG.getLoad(ValVT, dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + SDValue Val = DAG.getLoad( + ValVT, dl, Chain, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, false, 0); return ExtendedInMem ? 
DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; } @@ -2413,15 +2650,10 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } -SDValue -X86TargetLowering::LowerFormalArguments(SDValue Chain, - CallingConv::ID CallConv, - bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, - SDLoc dl, - SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) - const { +SDValue X86TargetLowering::LowerFormalArguments( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); @@ -2436,9 +2668,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); - assert(!(isVarArg && IsTailCallConvention(CallConv)) && + assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); + if (CallConv == CallingConv::X86_INTR) { + bool isLegal = Ins.size() == 1 || + (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || + (!Is64Bit && Ins[1].VT == MVT::i32))); + if (!isLegal) + report_fatal_error("X86 interrupts may take one or two arguments"); + } + // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); @@ -2471,6 +2711,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = &X86::FR64RegClass; + else if (RegVT == MVT::f128) + RC = &X86::FR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) @@ -2547,8 +2789,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. - if (FuncIsMadeTailCallSafe(CallConv, - MF.getTarget().Options.GuaranteedTailCallOpt)) + if (shouldGuaranteeTCO(CallConv, + MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for @@ -2561,13 +2803,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, MFI->CreateFixedObject(1, StackSize, true)); } - MachineModuleInfo &MMI = MF.getMMI(); - const Function *WinEHParent = nullptr; - if (MMI.hasWinEHFuncInfo(Fn)) - WinEHParent = MMI.getWinEHParent(Fn); - bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn; - bool IsWinEHParent = WinEHParent && WinEHParent == Fn; - // Figure out if XMM registers are in use. 
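[Editor's note] For the X86_INTR argument check earlier in this hunk (exactly one argument, or two where the second is i64 on 64-bit and i32 on 32-bit targets): in compilers that expose this convention through an interrupt function attribute, handlers look roughly like the sketch below. The struct name, parameter types, and attribute spelling follow the usual GCC/Clang x86 interrupt-attribute shape and are assumptions of this note, not part of the patch:

    #include <cstdint>

    struct interrupt_frame;                        // layout defined by the CPU/ABI

    __attribute__((interrupt))
    void isr_no_error(interrupt_frame *frame) {    // one argument: the frame
      (void)frame;
    }

    __attribute__((interrupt))
    void isr_with_error(interrupt_frame *frame,    // two arguments: frame plus
                        uintptr_t error_code) {    // the CPU-pushed error code
      (void)frame; (void)error_code;
    }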
assert(!(Subtarget->useSoftFloat() && Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && @@ -2631,10 +2866,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, DAG.getIntPtrConstant(Offset, dl)); SDValue Store = - DAG.getStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo::getFixedStack( - FuncInfo->getRegSaveFrameIndex(), Offset), - false, false, 0); + DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + FuncInfo->getRegSaveFrameIndex(), Offset), + false, false, 0); MemOps.push_back(Store); Offset += 8; } @@ -2656,27 +2892,6 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); - } else if (IsWin64 && IsWinEHOutlined) { - // Get to the caller-allocated home save location. Add 8 to account - // for the return address. - int HomeOffset = TFI.getOffsetOfLocalArea() + 8; - FuncInfo->setRegSaveFrameIndex(MFI->CreateFixedObject( - /*Size=*/1, /*SPOffset=*/HomeOffset + 8, /*Immutable=*/false)); - - MMI.getWinEHFuncInfo(Fn) - .CatchHandlerParentFrameObjIdx[const_cast<Function *>(Fn)] = - FuncInfo->getRegSaveFrameIndex(); - - // Store the second integer parameter (rdx) into rsp+16 relative to the - // stack pointer at the entry of the function. - SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy(DAG.getDataLayout())); - unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass); - SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64); - Chain = DAG.getStore( - Val.getValue(1), dl, Val, RSFIN, - MachinePointerInfo::getFixedStack(FuncInfo->getRegSaveFrameIndex()), - /*isVolatile=*/true, /*isNonTemporal=*/false, /*Alignment=*/0); } if (isVarArg && MFI->hasMustTailInVarArgFunc()) { @@ -2723,12 +2938,15 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. + } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { + // X86 interrupts must pop the error code if present + FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. - if (!Is64Bit && !IsTailCallConvention(CallConv) && + if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && - argsAreStructReturn(Ins) == StackStructReturn) + argsAreStructReturn(Ins, Subtarget->isTargetMCU()) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } @@ -2743,21 +2961,20 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, FuncInfo->setArgumentStackSize(StackSize); - if (IsWinEHParent) { - if (Is64Bit) { - int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); - SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64); - MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI; - SDValue Neg2 = DAG.getConstant(-2, dl, MVT::i64); - Chain = DAG.getStore(Chain, dl, Neg2, StackSlot, - MachinePointerInfo::getFixedStack(UnwindHelpFI), - /*isVolatile=*/true, - /*isNonTemporal=*/false, /*Alignment=*/0); - } else { - // Functions using Win32 EH are considered to have opaque SP adjustments - // to force local variables to be addressed from the frame or base - // pointers. 
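The Is64Bit ? 8 : 4 value above exists because, for exception vectors that carry an error code, the CPU leaves that code on the stack and IRET does not consume it, so the handler must discard one pointer-sized slot on return. A sketch of the 32-bit stack at handler entry (an illustration, not code from the patch):

// What the CPU has pushed when a 32-bit exception with an error code is
// delivered (no privilege change; a stack switch would add SS:ESP on top).
struct ExceptionStackFrame32 {
  unsigned int error_code; // <- ESP at entry; IRET will not pop this
  unsigned int eip;        // IRET pops EIP, CS and EFLAGS
  unsigned int cs;
  unsigned int eflags;
};
// setBytesToPopOnReturn(4) above is what removes error_code before IRET;
// on x86-64 the same slot is 8 bytes wide.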
- MFI->setHasOpaqueSPAdjustment(true); + if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { + EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn()); + if (Personality == EHPersonality::CoreCLR) { + assert(Is64Bit); + // TODO: Add a mechanism to frame lowering that will allow us to indicate + // that we'd prefer this slot be allocated towards the bottom of the frame + // (i.e. near the stack pointer after allocating the frame). Every + // funclet needs a copy of this slot in its (mostly empty) frame, and the + // offset from the bottom of this and each funclet's frame must be the + // same, so the size of funclets' (mostly empty) frames is dictated by + // how far this slot is from the bottom (since they allocate just enough + // space to accomodate holding this slot at the correct offset). + int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); + EHInfo->PSPSymFrameIdx = PSPSymFI; } } @@ -2777,9 +2994,10 @@ X86TargetLowering::LowerMemOpCallTo(SDValue Chain, if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); - return DAG.getStore(Chain, dl, Arg, PtrOff, - MachinePointerInfo::getStack(LocMemOffset), - false, false, 0); + return DAG.getStore( + Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), + false, false, 0); } /// Emit a load of return address if tail call @@ -2813,11 +3031,24 @@ static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, - MachinePointerInfo::getFixedStack(NewReturnAddrFI), + MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), NewReturnAddrFI), false, false, 0); return Chain; } +/// Returns a vector_shuffle mask for an movs{s|d}, movd +/// operation of specified width. +static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, + SDValue V2) { + unsigned NumElems = VT.getVectorNumElements(); + SmallVector<int, 8> Mask; + Mask.push_back(NumElems); + for (unsigned i = 1; i != NumElems; ++i) + Mask.push_back(i); + return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); +} + SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { @@ -2835,11 +3066,14 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); - StructReturnType SR = callIsStructReturn(Outs); + StructReturnType SR = callIsStructReturn(Outs, Subtarget->isTargetMCU()); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); + if (CallConv == CallingConv::X86_INTR) + report_fatal_error("X86 interrupts may not be called directly"); + if (Attr.getValueAsString() == "true") isTailCall = false; @@ -2878,7 +3112,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ++NumTailCalls; } - assert(!(isVarArg && IsTailCallConvention(CallConv)) && + assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); // Analyze operands of the call, assigning locations to each operand. 
@@ -2892,13 +3126,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CCInfo.AnalyzeCallOperands(Outs, CC_X86); // Get a count of how many bytes are to be pushed on the stack. - unsigned NumBytes = CCInfo.getNextStackOffset(); + unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); if (IsSibcall) // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; else if (MF.getTarget().Options.GuaranteedTailCallOpt && - IsTailCallConvention(CallConv)) + canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; @@ -2970,7 +3204,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, break; case CCValAssign::AExt: if (Arg.getValueType().isVector() && - Arg.getValueType().getScalarType() == MVT::i1) + Arg.getValueType().getVectorElementType() == MVT::i1) Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. @@ -2987,9 +3221,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Store the argument. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); - Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, - MachinePointerInfo::getFixedStack(FI), - false, false, 0); + Chain = DAG.getStore( + Chain, dl, Arg, SpillSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0); Arg = SpillSlot; break; } @@ -3125,10 +3360,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Flags, DAG, dl)); } else { // Store relative to framepointer. - MemOpChains2.push_back( - DAG.getStore(ArgChain, dl, Arg, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, 0)); + MemOpChains2.push_back(DAG.getStore( + ArgChain, dl, Arg, FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), + false, false, 0)); } } @@ -3207,7 +3442,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (ExtraLoad) Callee = DAG.getLoad( getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, + false, 0); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { unsigned char OpFlags = 0; @@ -3261,9 +3497,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); - // If this is an invoke in a 32-bit function using an MSVC personality, assume - // the function clobbers all registers. If an exception is thrown, the runtime - // will not restore CSRs. + // If this is an invoke in a 32-bit function using a funclet-based + // personality, assume the function clobbers all registers. If an exception + // is thrown, the runtime will not restore CSRs. // FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) { @@ -3272,7 +3508,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, CallerFn->hasPersonalityFn() ? 
classifyEHPersonality(CallerFn->getPersonalityFn()) : EHPersonality::Unknown; - if (isMSVCEHPersonality(Pers)) + if (isFuncletEHPersonality(Pers)) Mask = RegInfo->getNoPreservedMask(); } @@ -3300,7 +3536,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything - else if (!Is64Bit && !IsTailCallConvention(CallConv) && + else if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && SR == StackStructReturn) // If this is a call to a struct-return function, the callee @@ -3358,8 +3594,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // EDI // local1 .. -/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned -/// for a 16 byte align requirement. +/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align +/// requirement. unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { @@ -3380,9 +3616,8 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, return Offset; } -/// MatchingStackOffset - Return true if the given stack call argument is -/// already available in the same position (relatively) of the caller's -/// incoming argument stack. +/// Return true if the given stack call argument is already available in the +/// same position (relatively) of the caller's incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, @@ -3435,25 +3670,19 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); } -/// IsEligibleForTailCallOptimization - Check whether the call is eligible -/// for tail call optimization. Targets which want to do tail call -/// optimization should implement this function. -bool -X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, - CallingConv::ID CalleeCC, - bool isVarArg, - bool isCalleeStructRet, - bool isCallerStructRet, - Type *RetTy, - const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, - const SmallVectorImpl<ISD::InputArg> &Ins, - SelectionDAG &DAG) const { - if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) +/// Check whether the call is eligible for tail call optimization. Targets +/// that want to do tail call optimization should implement this function. +bool X86TargetLowering::IsEligibleForTailCallOptimization( + SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, + bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { + if (!mayTailCallThisCC(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. 
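GetAlignedArgumentStackSize, shown above, pads the outgoing-argument area so that once the return address is pushed the stack pointer stays 16-byte aligned, hence the "16n + 12" shape mentioned in its comment for a 4-byte slot. A hedged sketch of that arithmetic, with the slot size and alignment as assumed parameters:

#include <cassert>

// Round an argument-area size so that size + SlotSize is a multiple of the
// stack alignment, i.e. the result has the form Align*n + (Align - SlotSize).
unsigned alignArgArea(unsigned StackSize, unsigned SlotSize = 4,
                      unsigned StackAlign = 16) {
  unsigned Total = StackSize + SlotSize;                // include the ret-addr slot
  Total = (Total + StackAlign - 1) & ~(StackAlign - 1); // round up
  return Total - SlotSize;
}

int main() {
  assert(alignArgArea(20) == 28); // 28 + 4 == 32, so 16n + 12
  assert(alignArgArea(28) == 28); // already of the required form
}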
- const MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, @@ -3474,7 +3703,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, return false; if (DAG.getTarget().Options.GuaranteedTailCallOpt) { - if (IsTailCallConvention(CalleeCC) && CCMatch) + if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; } @@ -3493,19 +3722,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, if (isCalleeStructRet || isCallerStructRet) return false; - // An stdcall/thiscall caller is expected to clean up its arguments; the - // callee isn't going to do that. - // FIXME: this is more restrictive than needed. We could produce a tailcall - // when the stack adjustment matches. For example, with a thiscall that takes - // only one argument. - if (!CCMatch && (CallerCC == CallingConv::X86_StdCall || - CallerCC == CallingConv::X86_ThisCall)) - return false; - // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. if (isVarArg && !Outs.empty()) { - // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. if (IsCalleeWin64 || IsCallerWin64) @@ -3573,6 +3792,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } } + unsigned StackArgsSize = 0; + // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { @@ -3587,11 +3808,9 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); - if (CCInfo.getNextStackOffset()) { - MachineFunction &MF = DAG.getMachineFunction(); - if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) - return false; + StackArgsSize = CCInfo.getNextStackOffset(); + if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -3642,6 +3861,21 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, } } + bool CalleeWillPop = + X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg, + MF.getTarget().Options.GuaranteedTailCallOpt); + + if (unsigned BytesToPop = + MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { + // If we have bytes to pop, the callee must pop them. + bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; + if (!CalleePopMatches) + return false; + } else if (CalleeWillPop && StackArgsSize > 0) { + // If we don't have bytes to pop, make sure the callee doesn't pop any. 
+ return false; + } + return true; } @@ -3688,11 +3922,13 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case X86ISD::VPERMI: + case X86ISD::VPERMV: + case X86ISD::VPERMV3: return true; } } -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { @@ -3707,7 +3943,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, } } -static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, +static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); @@ -3772,23 +4008,23 @@ bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, return false; } -/// isCalleePop - Determines whether the callee is required to pop its -/// own arguments. Callee pop is necessary to support tail calls. +/// Determines whether the callee is required to pop its own arguments. +/// Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, - bool is64Bit, bool IsVarArg, bool TailCallOpt) { + bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { + // If GuaranteeTCO is true, we force some calls to be callee pop so that we + // can guarantee TCO. + if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) + return true; + switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: + case CallingConv::X86_VectorCall: return !is64Bit; - case CallingConv::Fast: - case CallingConv::GHC: - case CallingConv::HiPE: - if (IsVarArg) - return false; - return TailCallOpt; } } @@ -3807,11 +4043,26 @@ static bool isX86CCUnsigned(unsigned X86CC) { case X86::COND_BE: return true; case X86::COND_AE: return true; } - llvm_unreachable("covered switch fell through?!"); } -/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 -/// specific condition code, returning the condition code and the LHS/RHS of the +static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { + switch (SetCCOpcode) { + default: llvm_unreachable("Invalid integer condition!"); + case ISD::SETEQ: return X86::COND_E; + case ISD::SETGT: return X86::COND_G; + case ISD::SETGE: return X86::COND_GE; + case ISD::SETLT: return X86::COND_L; + case ISD::SETLE: return X86::COND_LE; + case ISD::SETNE: return X86::COND_NE; + case ISD::SETULT: return X86::COND_B; + case ISD::SETUGT: return X86::COND_A; + case ISD::SETULE: return X86::COND_BE; + case ISD::SETUGE: return X86::COND_AE; + } +} + +/// Do a one-to-one translation of a ISD::CondCode to the X86-specific +/// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. 
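TranslateIntegerX86CC, factored out above, is where the signedness of an integer comparison selects the x86 condition code; the same "<" lowers to JL or JB depending on the operand types:

// Signed compares use the sign/overflow flags, unsigned compares the carry
// flag, matching the table in TranslateIntegerX86CC above:
//   ISD::SETLT  -> X86::COND_L (jl/setl/cmovl)
//   ISD::SETULT -> X86::COND_B (jb/setb/cmovb)
bool less_signed(int a, int b)             { return a < b; } // COND_L
bool less_unsigned(unsigned a, unsigned b) { return a < b; } // COND_B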
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { @@ -3833,19 +4084,7 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, } } - switch (SetCCOpcode) { - default: llvm_unreachable("Invalid integer condition!"); - case ISD::SETEQ: return X86::COND_E; - case ISD::SETGT: return X86::COND_G; - case ISD::SETGE: return X86::COND_GE; - case ISD::SETLT: return X86::COND_L; - case ISD::SETLE: return X86::COND_LE; - case ISD::SETNE: return X86::COND_NE; - case ISD::SETULT: return X86::COND_B; - case ISD::SETUGT: return X86::COND_A; - case ISD::SETULE: return X86::COND_BE; - case ISD::SETUGE: return X86::COND_AE; - } + return TranslateIntegerX86CC(SetCCOpcode); } // First determine if it is required or is profitable to flip the operands. @@ -3898,8 +4137,8 @@ static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, } } -/// hasFPCMov - is there a floating point cmov for the specific X86 condition -/// code. Current x86 isa includes the following FP cmov instructions: +/// Is there a floating point cmov for the specific X86 condition code? +/// Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { @@ -3917,7 +4156,7 @@ static bool hasFPCMov(unsigned X86CC) { } } -/// isFPImmLegal - Returns true if the target can instruction select the +/// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { @@ -3970,7 +4209,7 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget->hasLZCNT(); } -/// isUndefInRange - Return true if every element in Mask, beginning +/// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size is undef. static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) @@ -3979,19 +4218,18 @@ static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { return true; } -/// isUndefOrInRange - Return true if Val is undef or if its value falls within -/// the specified range (L, H]. +/// Return true if Val is undef or if its value falls within the +/// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val < 0) || (Val >= Low && Val < Hi); } -/// isUndefOrEqual - Val is either less than zero (undef) or equal to the -/// specified value. +/// Val is either less than zero (undef) or equal to the specified value. static bool isUndefOrEqual(int Val, int CmpVal) { return (Val < 0 || Val == CmpVal); } -/// isSequentialOrUndefInRange - Return true if every element in Mask, beginning +/// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size]. or is undef. 
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, @@ -4002,9 +4240,8 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, return true; } -/// isVEXTRACTIndex - Return true if the specified -/// EXTRACT_SUBVECTOR operand specifies a vector extract that is -/// suitable for instruction that extract 128 or 256 bit vectors +/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector +/// extract that is suitable for instruction that extract 128 or 256 bit vectors static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) @@ -4021,7 +4258,7 @@ static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { return Result; } -/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR +/// Return true if the specified INSERT_SUBVECTOR /// operand specifies a subvector insert that is suitable for input to /// insertion of 128 or 256-bit subvectors static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { @@ -4057,8 +4294,8 @@ bool X86::isVEXTRACT256Index(SDNode *N) { static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); - if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) - llvm_unreachable("Illegal extract subvector for VEXTRACT"); + assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) && + "Illegal extract subvector for VEXTRACT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue(); @@ -4072,8 +4309,8 @@ static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); - if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) - llvm_unreachable("Illegal insert subvector for VINSERT"); + assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) && + "Illegal insert subvector for VINSERT"); uint64_t Index = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue(); @@ -4085,53 +4322,71 @@ static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { return Index / NumElemsPerChunk; } -/// getExtractVEXTRACT128Immediate - Return the appropriate immediate -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 -/// and VINSERTI128 instructions. +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions. unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 128); } -/// getExtractVEXTRACT256Immediate - Return the appropriate immediate -/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4 -/// and VINSERTI64x4 instructions. +/// Return the appropriate immediate to extract the specified +/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions. unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 256); } -/// getInsertVINSERT128Immediate - Return the appropriate immediate -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 -/// and VINSERTI128 instructions. +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions. 
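The VEXTRACT/VINSERT immediate helpers above divide the starting element index by the number of elements per 128- or 256-bit chunk, so the immediate simply names which chunk is addressed. A worked sketch for the 128-bit case:

#include <cassert>

// Immediate selection mirroring getExtractVEXTRACTImmediate(N, 128) above.
unsigned extractImm128(unsigned StartElt, unsigned EltBits) {
  unsigned ElemsPerChunk = 128 / EltBits; // elements in one 128-bit chunk
  return StartElt / ElemsPerChunk;        // which chunk the range starts in
}

int main() {
  assert(extractImm128(4, 32) == 1); // upper half of a v8i32: vextractf128 $1
  assert(extractImm128(2, 64) == 1); // upper half of a v4i64: vextractf128 $1
  assert(extractImm128(0, 32) == 0); // lower half: immediate 0
}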
unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 128); } -/// getInsertVINSERT256Immediate - Return the appropriate immediate -/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4 -/// and VINSERTI64x4 instructions. +/// Return the appropriate immediate to insert at the specified +/// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions. unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } -/// isZero - Returns true if Elt is a constant integer zero -static bool isZero(SDValue V) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); - return C && C->isNullValue(); -} - -/// isZeroNode - Returns true if Elt is a constant zero or a floating point -/// constant +0.0. +/// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { - if (isZero(Elt)) - return true; - if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt)) - return CFP->getValueAPF().isPosZero(); - return false; + return isNullConstant(Elt) || isNullFPConstant(Elt); } -/// getZeroVector - Returns a vector of specified type with all zero elements. -/// -static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, +// Build a vector of constants +// Use an UNDEF node if MaskElt == -1. +// Spilt 64-bit constants in the 32-bit mode. +static SDValue getConstVector(ArrayRef<int> Values, MVT VT, + SelectionDAG &DAG, + SDLoc dl, bool IsMask = false) { + + SmallVector<SDValue, 32> Ops; + bool Split = false; + + MVT ConstVecVT = VT; + unsigned NumElts = VT.getVectorNumElements(); + bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); + if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { + ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); + Split = true; + } + + MVT EltVT = ConstVecVT.getVectorElementType(); + for (unsigned i = 0; i < NumElts; ++i) { + bool IsUndef = Values[i] < 0 && IsMask; + SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : + DAG.getConstant(Values[i], dl, EltVT); + Ops.push_back(OpNode); + if (Split) + Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : + DAG.getConstant(0, dl, EltVT)); + } + SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops); + if (Split) + ConstsNode = DAG.getBitcast(VT, ConstsNode); + return ConstsNode; +} + +/// Returns a vector of specified type with all zero elements. +static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); @@ -4163,7 +4418,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); - } else if (VT.getScalarType() == MVT::i1) { + } else if (VT.getVectorElementType() == MVT::i1) { assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) && "Unexpected vector type"); @@ -4195,19 +4450,18 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) - * ElemsPerChunk); + // we want. 
Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, - makeArrayRef(Vec->op_begin() + NormalizedIdxVal, - ElemsPerChunk)); + makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl); + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } @@ -4245,13 +4499,13 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec, // Insert the relevant vectorWidth bits. unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk - // we want. - unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth) - * ElemsPerChunk); + // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. + IdxVal &= ~(ElemsPerChunk - 1); - SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal, dl); + SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } @@ -4279,7 +4533,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, Vec, ZeroIndex); // The blend instruction, and therefore its mask, depend on the data type. - MVT ScalarType = ResultVT.getScalarType().getSimpleVT(); + MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT(); if (ScalarType.isFloatingPoint()) { // Choose either vblendps (float) or vblendpd (double). unsigned ScalarSize = ScalarType.getSizeInBits(); @@ -4316,6 +4570,81 @@ static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } +/// Insert i1-subvector to i1-vector. +static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) { + + SDLoc dl(Op); + SDValue Vec = Op.getOperand(0); + SDValue SubVec = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + if (IdxVal == 0 && Vec.isUndef()) // the operation is legal + return Op; + + MVT OpVT = Op.getSimpleValueType(); + MVT SubVecVT = SubVec.getSimpleValueType(); + unsigned NumElems = OpVT.getVectorNumElements(); + unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); + + assert(IdxVal + SubVecNumElems <= NumElems && + IdxVal % SubVecVT.getSizeInBits() == 0 && + "Unexpected index value in INSERT_SUBVECTOR"); + + // There are 3 possible cases: + // 1. Subvector should be inserted in the lower part (IdxVal == 0) + // 2. Subvector should be inserted in the upper part + // (IdxVal + SubVecNumElems == NumElems) + // 3. 
Subvector should be inserted in the middle (for example v2i1 + // to v16i1, index 2) + + SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + SDValue Undef = DAG.getUNDEF(OpVT); + SDValue WideSubVec = + DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx); + if (Vec.isUndef()) + return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + + if (ISD::isBuildVectorAllZeros(Vec.getNode())) { + unsigned ShiftLeft = NumElems - SubVecNumElems; + unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec, + DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec; + } + + if (IdxVal == 0) { + // Zero lower bits of the Vec + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + // Merge them together + return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + } + + // Simple case when we put subvector in the upper part + if (IdxVal + SubVecNumElems == NumElems) { + // Zero upper bits of the Vec + WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); + return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); + } + // Subvector should be inserted in the middle - use shuffle + SmallVector<int, 64> Mask; + for (unsigned i = 0; i < NumElems; ++i) + Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ? + i : i + NumElems); + return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask); +} + /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 /// instructions. This is used because creating CONCAT_VECTOR nodes of /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower @@ -4334,18 +4663,22 @@ static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } -/// getOnesVector - Returns a vector of specified type with all bits set. +/// Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. 
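Since an AVX-512 mask vector is just a bit pattern in a k-register, the Insert1BitVector paths above are easiest to read as integer shifts. A model of the "destination is all zeroes" case, treating a v16i1 as the low 16 bits of an integer (an assumption made purely for illustration):

#include <cassert>
#include <cstdint>

// Model of the ISD::isBuildVectorAllZeros path above: shift the widened
// subvector to the top to discard stray high bits, then shift back down so
// its first element lands at bit IdxVal.
uint16_t insertIntoZeroMask(uint16_t Sub, unsigned SubNumElts, unsigned IdxVal,
                            unsigned NumElts = 16) {
  unsigned ShiftLeft = NumElts - SubNumElts;
  unsigned ShiftRight = NumElts - SubNumElts - IdxVal;
  return (uint16_t)(((uint16_t)(Sub << ShiftLeft)) >> ShiftRight);
}

int main() {
  // Insert a v2i1 value 0b11 at index 2 of a zero v16i1: bits 2 and 3 set.
  assert(insertIntoZeroMask(0x3, 2, 2) == 0xC);
}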
-static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, - SDLoc dl) { +static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32); SDValue Vec; - if (VT.is256BitVector()) { - if (HasInt256) { // AVX2 + if (VT.is512BitVector()) { + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, + Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); + } else if (VT.is256BitVector()) { + if (Subtarget->hasInt256()) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // AVX @@ -4360,19 +4693,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, return DAG.getBitcast(VT, Vec); } -/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd -/// operation of specified width. -static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, - SDValue V2) { - unsigned NumElems = VT.getVectorNumElements(); - SmallVector<int, 8> Mask; - Mask.push_back(NumElems); - for (unsigned i = 1; i != NumElems; ++i) - Mask.push_back(i); - return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); -} - -/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. +/// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); @@ -4384,7 +4705,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. +/// Returns a vector_shuffle node for an unpackh operation. static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); @@ -4396,10 +4717,10 @@ static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } -/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified -/// vector of zero or undef vector. This produces a shuffle where the low -/// element of V2 is swizzled into the zero/undef vector, landing at element -/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). +/// Return a vector_shuffle of the specified vector of zero or undef vector. +/// This produces a shuffle where the low element of V2 is swizzled into the +/// zero/undef vector, landing at element Idx. +/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, bool IsZero, const X86Subtarget *Subtarget, @@ -4415,10 +4736,10 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); } -/// getTargetShuffleMask - Calculates the shuffle mask corresponding to the -/// target specific opcode. Returns true if the Mask could be calculated. Sets -/// IsUnary to true if only uses one source. Note that this will set IsUnary for -/// shuffles which use a single input multiple times, and in those cases it will +/// Calculates the shuffle mask corresponding to the target-specific opcode. +/// Returns true if the Mask could be calculated. Sets IsUnary to true if only +/// uses one source. 
Note that this will set IsUnary for shuffles which use a +/// single input multiple times, and in those cases it will /// adjust the mask to only have indices within that single input. /// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero. static bool getTargetShuffleMask(SDNode *N, MVT VT, @@ -4482,7 +4803,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { // If we have a build-vector, then things are easy. - EVT VT = MaskNode.getValueType(); + MVT VT = MaskNode.getSimpleValueType(); assert(VT.isVector() && "Can't produce a non-vector with a build_vector!"); if (!VT.isInteger()) @@ -4572,6 +4893,119 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, case X86ISD::MOVLPS: // Not yet implemented return false; + case X86ISD::VPERMV: { + IsUnary = true; + SDValue MaskNode = N->getOperand(0); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(0); + + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()); + SmallVector<uint64_t, 32> RawMask; + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. + assert(MaskNode.getSimpleValueType().isInteger() && + MaskNode.getSimpleValueType().getVectorNumElements() == + VT.getVectorNumElements()); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else if (isa<ConstantSDNode>(Op)) { + APInt MaskElement = cast<ConstantSDNode>(Op)->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } else + return false; + } + DecodeVPERMVMask(RawMask, Mask); + break; + } + if (MaskNode->getOpcode() == X86ISD::VBROADCAST) { + unsigned NumEltsInMask = MaskNode->getNumOperands(); + MaskNode = MaskNode->getOperand(0); + if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode)) { + APInt MaskEltValue = CN->getAPIntValue(); + for (unsigned i = 0; i < NumEltsInMask; ++i) + RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue()); + DecodeVPERMVMask(RawMask, Mask); + break; + } + // It may be a scalar load + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodeVPERMVMask(C, VT, Mask); + if (Mask.empty()) + return false; + break; + } + return false; + } + case X86ISD::VPERMV3: { + IsUnary = false; + SDValue MaskNode = N->getOperand(1); + while (MaskNode->getOpcode() == ISD::BITCAST) + MaskNode = MaskNode->getOperand(1); + + if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { + // If we have a build-vector, then things are easy. 
+ assert(MaskNode.getSimpleValueType().isInteger() && + MaskNode.getSimpleValueType().getVectorNumElements() == + VT.getVectorNumElements()); + + SmallVector<uint64_t, 32> RawMask; + unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2); + + for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { + SDValue Op = MaskNode->getOperand(i); + if (Op->getOpcode() == ISD::UNDEF) + RawMask.push_back((uint64_t)SM_SentinelUndef); + else { + auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()); + if (!CN) + return false; + APInt MaskElement = CN->getAPIntValue(); + RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); + } + } + DecodeVPERMV3Mask(RawMask, Mask); + break; + } + + auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode); + if (!MaskLoad) + return false; + + SDValue Ptr = MaskLoad->getBasePtr(); + if (Ptr->getOpcode() == X86ISD::Wrapper || + Ptr->getOpcode() == X86ISD::WrapperRIP) + Ptr = Ptr->getOperand(0); + + auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr); + if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) + return false; + + if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) { + DecodeVPERMV3Mask(C, VT, Mask); + if (Mask.empty()) + return false; + break; + } + return false; + } default: llvm_unreachable("unknown target shuffle node"); } @@ -4586,7 +5020,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, return true; } -/// getShuffleScalarElt - Returns the scalar element that will make up the ith +/// Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { @@ -4650,8 +5084,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, return SDValue(); } -/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. -/// +/// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, @@ -4721,8 +5154,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, return DAG.getBitcast(MVT::v16i8, V); } -/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. -/// +/// Custom lower build_vector of v8i16. static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, @@ -4753,7 +5185,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, return V; } -/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. +/// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { @@ -4924,7 +5356,7 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { return SDValue(); if ((Offset % RequiredAlign) & 3) return SDValue(); - int64_t StartOffset = Offset & ~(RequiredAlign-1); + int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, @@ -5157,8 +5589,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. 
- const Function *F = DAG.getMachineFunction().getFunction(); - bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. @@ -5188,9 +5619,10 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment(); - Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + Ld = DAG.getLoad( + CVT, dl, DAG.getEntryNode(), CP, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, + false, false, Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } @@ -5329,7 +5761,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { return NV; } -static SDValue ConvertI1VectorToInterger(SDValue Op, SelectionDAG &DAG) { +static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && Op.getScalarValueSizeInBits() == 1 && "Can not convert non-constant vector"); @@ -5366,7 +5798,7 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { } if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { - SDValue Imm = ConvertI1VectorToInterger(Op, DAG); + SDValue Imm = ConvertI1VectorToInteger(Op, DAG); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, Imm); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); @@ -5600,7 +6032,7 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, /// node. static SDValue LowerToAddSub(const BuildVectorSDNode *BV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - EVT VT = BV->getValueType(0); + MVT VT = BV->getSimpleValueType(0); if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) return SDValue(); @@ -5662,12 +6094,12 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, // Update InVec0 and InVec1. if (InVec0.getOpcode() == ISD::UNDEF) { InVec0 = Op0.getOperand(0); - if (InVec0.getValueType() != VT) + if (InVec0.getSimpleValueType() != VT) return SDValue(); } if (InVec1.getOpcode() == ISD::UNDEF) { InVec1 = Op1.getOperand(0); - if (InVec1.getValueType() != VT) + if (InVec1.getSimpleValueType() != VT) return SDValue(); } @@ -5703,7 +6135,7 @@ static SDValue LowerToAddSub(const BuildVectorSDNode *BV, static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - EVT VT = BV->getValueType(0); + MVT VT = BV->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); unsigned NumUndefsLO = 0; unsigned NumUndefsHI = 0; @@ -5845,7 +6277,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned NumElems = Op.getNumOperands(); // Generate vectors for predicate vectors. 
- if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512()) + if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG); // Vectors containing all zeros can be matched by pxor and xorps later @@ -5866,7 +6298,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return Op; if (!VT.is512BitVector()) - return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl); + return getOnesVector(VT, Subtarget, DAG, dl); } BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); @@ -5881,7 +6313,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { unsigned NumZero = 0; unsigned NumNonZero = 0; - unsigned NonZeros = 0; + uint64_t NonZeros = 0; bool IsAllConstants = true; SmallSet<SDValue, 8> Values; for (unsigned i = 0; i < NumElems; ++i) { @@ -5895,7 +6327,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (X86::isZeroNode(Elt)) NumZero++; else { - NonZeros |= (1 << i); + assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range. + NonZeros |= ((uint64_t)1 << i); NumNonZero++; } } @@ -5919,7 +6352,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { // Handle SSE only. assert(VT == MVT::v2i64 && "Expected an SSE value type!"); - EVT VecVT = MVT::v4i32; + MVT VecVT = MVT::v4i32; // Truncate the value (which may itself be a constant) to i32, and // convert it to a vector with movd (S2V+shuffle to zero extend). @@ -6051,7 +6484,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // One half is zero or undef. unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, - Op.getOperand(Idx)); + Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); @@ -6059,13 +6492,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) - if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this)) + if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, + DAG, Subtarget, *this)) return V; if (EVTBits == 16 && NumElems == 8) - if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, - Subtarget, *this)) + if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, + DAG, Subtarget, *this)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS @@ -6077,7 +6510,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SmallVector<SDValue, 8> V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { - bool isZero = !(NonZeros & (1 << i)); + bool isZero = !(NonZeros & (1ULL << i)); if (isZero) V[i] = getZeroVector(VT, Subtarget, DAG, dl); else @@ -6177,7 +6610,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction +// 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. 
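The NonZeros bookkeeping above is widened from unsigned to uint64_t because a build_vector can now have up to 64 elements (e.g. v64i8 with AVX-512BW), and shifting a 32-bit 1 by 32 or more is undefined behaviour; hence the new assert and the explicit 64-bit shifts. A minimal illustration:

#include <cstdint>

// Why the mask is 64 bits wide: for element indices 32..63 the narrow shift
// would be undefined, the wide one is well defined.
uint64_t elementBit(unsigned i) {
  // return 1u << i;          // undefined behaviour once i >= 32
  return (uint64_t)1 << i;    // valid for i up to 63, e.g. v64i8 lanes
}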
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); @@ -6193,8 +6626,8 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); if (Op.getNumOperands() == 4) { - MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), - ResVT.getVectorNumElements()/2); + MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), + ResVT.getVectorNumElements()/2); SDValue V3 = Op.getOperand(2); SDValue V4 = Op.getOperand(3); return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl), @@ -6213,8 +6646,27 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, assert(isPowerOf2_32(NumOfOperands) && "Unexpected number of operands in CONCAT_VECTORS"); + SDValue Undef = DAG.getUNDEF(ResVT); if (NumOfOperands > 2) { - MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(), + // Specialize the cases when all, or all but one, of the operands are undef. + unsigned NumOfDefinedOps = 0; + unsigned OpIdx = 0; + for (unsigned i = 0; i < NumOfOperands; i++) + if (!Op.getOperand(i).isUndef()) { + NumOfDefinedOps++; + OpIdx = i; + } + if (NumOfDefinedOps == 0) + return Undef; + if (NumOfDefinedOps == 1) { + unsigned SubVecNumElts = + Op.getOperand(OpIdx).getValueType().getVectorNumElements(); + SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, + Op.getOperand(OpIdx), IdxVal); + } + + MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); SmallVector<SDValue, 2> Ops; for (unsigned i = 0; i < NumOfOperands/2; i++) @@ -6227,31 +6679,38 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } + // 2 operands SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); + unsigned NumElems = ResVT.getVectorNumElements(); + assert(V1.getValueType() == V2.getValueType() && + V1.getValueType().getVectorNumElements() == NumElems/2 && + "Unexpected operands in CONCAT_VECTORS"); + + if (ResVT.getSizeInBits() >= 16) + return Op; // The operation is legal with KUNPCK + bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); - + SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl); if (IsZeroV1 && IsZeroV2) - return getZeroVector(ResVT, Subtarget, DAG, dl); + return ZeroVec; SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); - SDValue Undef = DAG.getUNDEF(ResVT); - unsigned NumElems = ResVT.getVectorNumElements(); - SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8); + if (V2.isUndef()) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); + if (IsZeroV2) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx); + + SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl); + if (V1.isUndef()) + V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); - V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx); - V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits); if (IsZeroV1) - return V2; + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); - // Zero the upper bits of V1 - V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits); - V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits); - if (IsZeroV2) - return V1; - return DAG.getNode(ISD::OR, dl, ResVT, V1, V2); + return 
DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal); } static SDValue LowerCONCAT_VECTORS(SDValue Op, @@ -6272,7 +6731,6 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, return LowerAVXCONCAT_VECTORS(Op, DAG); } - //===----------------------------------------------------------------------===// // Vector shuffle lowering // @@ -6422,6 +6880,127 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL, return DAG.getConstant(Imm, DL, MVT::i8); } +/// \brief Compute whether each element of a shuffle is zeroable. +/// +/// A "zeroable" vector shuffle element is one which can be lowered to zero. +/// Either it is an undef element in the shuffle mask, the element of the input +/// referenced is undef, or the element of the input referenced is known to be +/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle +/// as many lanes with this technique as possible to simplify the remaining +/// shuffle. +static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, + SDValue V1, SDValue V2) { + SmallBitVector Zeroable(Mask.size(), false); + + while (V1.getOpcode() == ISD::BITCAST) + V1 = V1->getOperand(0); + while (V2.getOpcode() == ISD::BITCAST) + V2 = V2->getOperand(0); + + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + int M = Mask[i]; + // Handle the easy cases. + if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { + Zeroable[i] = true; + continue; + } + + // If this is an index into a build_vector node (which has the same number + // of elements), dig out the input value and use it. + SDValue V = M < Size ? V1 : V2; + if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) + continue; + + SDValue Input = V.getOperand(M % Size); + // The UNDEF opcode check really should be dead code here, but not quite + // worth asserting on (it isn't invalid, just unexpected). + if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) + Zeroable[i] = true; + } + + return Zeroable; +} + +// X86 has dedicated unpack instructions that can handle specific blend +// operations: UNPCKH and UNPCKL. +static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef<int> Mask, + SDValue V1, SDValue V2, + SelectionDAG &DAG) { + int NumElts = VT.getVectorNumElements(); + int NumEltsInLane = 128 / VT.getScalarSizeInBits(); + SmallVector<int, 8> Unpckl; + SmallVector<int, 8> Unpckh; + + for (int i = 0; i < NumElts; ++i) { + unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; + int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2); + int HiPos = LoPos + NumEltsInLane / 2; + Unpckl.push_back(LoPos); + Unpckh.push_back(HiPos); + } + + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); + + // Commute and try again. + ShuffleVectorSDNode::commuteMask(Unpckl); + if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) + return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); + + ShuffleVectorSDNode::commuteMask(Unpckh); + if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); + + return SDValue(); +} + +/// \brief Try to emit a bitmask instruction for a shuffle. +/// +/// This handles cases where we can model a blend exactly as a bitmask due to +/// one of the inputs being zeroable. 
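lowerVectorShuffleWithUNPCK above derives, per 128-bit lane, the interleaving patterns that UNPCKL/UNPCKH implement and then checks the incoming mask against them. A standalone sketch reproducing the index formula, with the v4i32 result spelled out:

#include <cassert>
#include <vector>

// Unpack-mask construction mirroring the loop above; indices >= NumElts
// refer to the second source vector.
void buildUnpackMasks(int NumElts, int EltBits,
                      std::vector<int> &Lo, std::vector<int> &Hi) {
  int NumEltsInLane = 128 / EltBits;
  for (int i = 0; i < NumElts; ++i) {
    int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
    Lo.push_back(LoPos);
    Hi.push_back(LoPos + NumEltsInLane / 2);
  }
}

int main() {
  std::vector<int> Lo, Hi;
  buildUnpackMasks(4, 32, Lo, Hi); // v4i32
  assert((Lo == std::vector<int>{0, 4, 1, 5})); // punpckldq interleaves lows
  assert((Hi == std::vector<int>{2, 6, 3, 7})); // punpckhdq interleaves highs
}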
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + SelectionDAG &DAG) { + MVT EltVT = VT.getVectorElementType(); + int NumEltBits = EltVT.getSizeInBits(); + MVT IntEltVT = MVT::getIntegerVT(NumEltBits); + SDValue Zero = DAG.getConstant(0, DL, IntEltVT); + SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, + IntEltVT); + if (EltVT.isFloatingPoint()) { + Zero = DAG.getBitcast(EltVT, Zero); + AllOnes = DAG.getBitcast(EltVT, AllOnes); + } + SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + SDValue V; + for (int i = 0, Size = Mask.size(); i < Size; ++i) { + if (Zeroable[i]) + continue; + if (Mask[i] % Size != i) + return SDValue(); // Not a blend. + if (!V) + V = Mask[i] < Size ? V1 : V2; + else if (V != (Mask[i] < Size ? V1 : V2)) + return SDValue(); // Can only let one input through the mask. + + VMaskOps[i] = AllOnes; + } + if (!V) + return SDValue(); // No non-zeroable elements! + + SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); + V = DAG.getNode(VT.isFloatingPoint() + ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, + DL, VT, V, VMask); + return V; +} + /// \brief Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are @@ -6431,7 +7010,7 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); - MVT EltVT = VT.getScalarType(); + MVT EltVT = VT.getVectorElementType(); int NumEltBits = EltVT.getSizeInBits(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, @@ -6458,22 +7037,62 @@ static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, /// This doesn't do any checks for the availability of instructions for blending /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is -/// that the shuffle mask is in fact a blend. +/// that the shuffle mask is a blend, or convertible into a blend with zero. static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, + SDValue V2, ArrayRef<int> Original, const X86Subtarget *Subtarget, SelectionDAG &DAG) { + bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); + SmallVector<int, 8> Mask(Original.begin(), Original.end()); + SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); + bool ForceV1Zero = false, ForceV2Zero = false; + + // Attempt to generate the binary blend mask. If an input is zero then + // we can use any lane. + // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. unsigned BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Mask[i] >= Size) { - if (Mask[i] != i + Size) - return SDValue(); // Shuffled V2 input! + int M = Mask[i]; + if (M < 0) + continue; + if (M == i) + continue; + if (M == i + Size) { BlendMask |= 1u << i; continue; } - if (Mask[i] >= 0 && Mask[i] != i) - return SDValue(); // Shuffled V1 input! 
+ if (Zeroable[i]) { + if (V1IsZero) { + ForceV1Zero = true; + Mask[i] = i; + continue; + } + if (V2IsZero) { + ForceV2Zero = true; + BlendMask |= 1u << i; + Mask[i] = i + Size; + continue; + } + } + return SDValue(); // Shuffled input! } + + // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. + if (ForceV1Zero) + V1 = getZeroVector(VT, Subtarget, DAG, DL); + if (ForceV2Zero) + V2 = getZeroVector(VT, Subtarget, DAG, DL); + + auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) { + unsigned ScaledMask = 0; + for (int i = 0; i != Size; ++i) + if (BlendMask & (1u << i)) + for (int j = 0; j != Scale; ++j) + ScaledMask |= 1u << (i * Scale + j); + return ScaledMask; + }; + switch (VT.SimpleTy) { case MVT::v2f64: case MVT::v4f32: @@ -6493,12 +7112,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, if (Subtarget->hasAVX2()) { // Scale the blend by the number of 32-bit dwords per element. int Scale = VT.getScalarSizeInBits() / 32; - BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= Size) - for (int j = 0; j < Scale; ++j) - BlendMask |= 1u << (i * Scale + j); - + BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); @@ -6511,12 +7125,7 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. int Scale = 8 / VT.getVectorNumElements(); - BlendMask = 0; - for (int i = 0, Size = Mask.size(); i < Size; ++i) - if (Mask[i] >= Size) - for (int j = 0; j < Scale; ++j) - BlendMask |= 1u << (i * Scale + j); - + BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = DAG.getBitcast(MVT::v8i16, V2); return DAG.getBitcast(VT, @@ -6541,9 +7150,13 @@ static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, // FALLTHROUGH case MVT::v16i8: case MVT::v32i8: { - assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) && + assert((VT.is128BitVector() || Subtarget->hasAVX2()) && "256-bit byte-blends require AVX2 support!"); + // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. + if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) + return Masked; + // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; @@ -6760,11 +7373,11 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, Hi = DAG.getBitcast(AlignVT, Hi); return DAG.getBitcast( - VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo, + VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi, DAG.getConstant(Rotation * Scale, DL, MVT::i8))); } - assert(VT.getSizeInBits() == 128 && + assert(VT.is128BitVector() && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); @@ -6785,92 +7398,6 @@ static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); } -/// \brief Compute whether each element of a shuffle is zeroable. -/// -/// A "zeroable" vector shuffle element is one which can be lowered to zero. -/// Either it is an undef element in the shuffle mask, the element of the input -/// referenced is undef, or the element of the input referenced is known to be -/// zero. 
Many x86 shuffles can zero lanes cheaply and we often want to handle -/// as many lanes with this technique as possible to simplify the remaining -/// shuffle. -static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask, - SDValue V1, SDValue V2) { - SmallBitVector Zeroable(Mask.size(), false); - - while (V1.getOpcode() == ISD::BITCAST) - V1 = V1->getOperand(0); - while (V2.getOpcode() == ISD::BITCAST) - V2 = V2->getOperand(0); - - bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); - bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); - - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - int M = Mask[i]; - // Handle the easy cases. - if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { - Zeroable[i] = true; - continue; - } - - // If this is an index into a build_vector node (which has the same number - // of elements), dig out the input value and use it. - SDValue V = M < Size ? V1 : V2; - if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) - continue; - - SDValue Input = V.getOperand(M % Size); - // The UNDEF opcode check really should be dead code here, but not quite - // worth asserting on (it isn't invalid, just unexpected). - if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) - Zeroable[i] = true; - } - - return Zeroable; -} - -/// \brief Try to emit a bitmask instruction for a shuffle. -/// -/// This handles cases where we can model a blend exactly as a bitmask due to -/// one of the inputs being zeroable. -static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { - MVT EltVT = VT.getScalarType(); - int NumEltBits = EltVT.getSizeInBits(); - MVT IntEltVT = MVT::getIntegerVT(NumEltBits); - SDValue Zero = DAG.getConstant(0, DL, IntEltVT); - SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, - IntEltVT); - if (EltVT.isFloatingPoint()) { - Zero = DAG.getBitcast(EltVT, Zero); - AllOnes = DAG.getBitcast(EltVT, AllOnes); - } - SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); - SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); - SDValue V; - for (int i = 0, Size = Mask.size(); i < Size; ++i) { - if (Zeroable[i]) - continue; - if (Mask[i] % Size != i) - return SDValue(); // Not a blend. - if (!V) - V = Mask[i] < Size ? V1 : V2; - else if (V != (Mask[i] < Size ? V1 : V2)) - return SDValue(); // Can only let one input through the mask. - - VMaskOps[i] = AllOnes; - } - if (!V) - return SDValue(); // No non-zeroable elements! - - SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); - V = DAG.getNode(VT.isFloatingPoint() - ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, - DL, VT, V, VMask); - return V; -} - /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and @@ -6982,7 +7509,7 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, // Determine the extraction length from the part of the // lower half that isn't zeroable. int Len = HalfSize; - for (; Len >= 0; --Len) + for (; Len > 0; --Len) if (!Zeroable[Len - 1]) break; assert(Len > 0 && "Zeroable shuffle mask"); @@ -6997,8 +7524,9 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, SDValue &V = (M < Size ? V1 : V2); M = M % Size; - // All mask elements must be in the lower half. 
- if (M > HalfSize) + // The extracted elements must start at a valid index and all mask + // elements must be in the lower half. + if (i > M || M >= HalfSize) return SDValue(); if (Idx < 0 || (Src == V && Idx == (M - i))) { @@ -7095,64 +7623,104 @@ static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, /// /// Given a specific number of elements, element bit width, and extension /// stride, produce either a zero or any extension based on the available -/// features of the subtarget. +/// features of the subtarget. The extended elements are consecutive and +/// begin and can start from an offseted element index in the input; to +/// avoid excess shuffling the offset must either being in the bottom lane +/// or at the start of a higher lane. All extended elements must be from +/// the same lane. static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( - SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV, + SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); - int NumElements = VT.getVectorNumElements(); int EltBits = VT.getScalarSizeInBits(); + int NumElements = VT.getVectorNumElements(); + int NumEltsPerLane = 128 / EltBits; + int OffsetLane = Offset / NumEltsPerLane; assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Only 8, 16, and 32 bit elements can be extended."); assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); + assert(0 <= Offset && "Extension offset must be positive."); + assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && + "Extension offset must be in the first lane or start an upper lane."); + + // Check that an index is in same lane as the base offset. + auto SafeOffset = [&](int Idx) { + return OffsetLane == (Idx / NumEltsPerLane); + }; + + // Shift along an input so that the offset base moves to the first element. + auto ShuffleOffset = [&](SDValue V) { + if (!Offset) + return V; + + SmallVector<int, 8> ShMask((unsigned)NumElements, -1); + for (int i = 0; i * Scale < NumElements; ++i) { + int SrcIdx = i + Offset; + ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1; + } + return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); + }; // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. if (Subtarget->hasSSE41()) { + // Not worth offseting 128-bit vectors if scale == 2, a pattern using + // PUNPCK will catch this in a later shuffle match. + if (Offset && Scale == 2 && VT.is128BitVector()) + return SDValue(); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); - return DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV)); + InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV)); + return DAG.getBitcast(VT, InputV); } + assert(VT.is128BitVector() && "Only 128-bit vectors can be extended."); + // For any extends we can cheat for larger element sizes and use shuffle // instructions that can fold with a load and/or copy. if (AnyExt && EltBits == 32) { - int PSHUFDMask[4] = {0, -1, 1, -1}; + int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? 
Offset + 1 : -1, + -1}; return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } if (AnyExt && EltBits == 16 && Scale > 2) { - int PSHUFDMask[4] = {0, -1, 0, -1}; + int PSHUFDMask[4] = {Offset / 2, -1, + SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); - int PSHUFHWMask[4] = {1, -1, -1, -1}; + int PSHUFWMask[4] = {1, -1, -1, -1}; + unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); return DAG.getBitcast( - VT, DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, + VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, InputV), - getV4X86ShuffleImm8ForMask(PSHUFHWMask, DL, DAG))); + getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); } // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes // to 64-bits. if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) { assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); - assert(VT.getSizeInBits() == 128 && "Unexpected vector width!"); + assert(VT.is128BitVector() && "Unexpected vector width!"); + int LoIdx = Offset * EltBits; SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(0, DL, MVT::i8))); - if (isUndefInRange(Mask, NumElements/2, NumElements/2)) + DAG.getConstant(LoIdx, DL, MVT::i8))); + + if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || + !SafeOffset(Offset + 1)) return DAG.getNode(ISD::BITCAST, DL, VT, Lo); - SDValue Hi = - DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, - DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, - DAG.getConstant(EltBits, DL, MVT::i8), - DAG.getConstant(EltBits, DL, MVT::i8))); + int HiIdx = (Offset + 1) * EltBits; + SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, + DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, + DAG.getConstant(EltBits, DL, MVT::i8), + DAG.getConstant(HiIdx, DL, MVT::i8))); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } @@ -7163,9 +7731,11 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { assert(NumElements == 16 && "Unexpected byte vector width!"); SDValue PSHUFBMask[16]; - for (int i = 0; i < 16; ++i) - PSHUFBMask[i] = - DAG.getConstant((i % Scale == 0) ? i / Scale : 0x80, DL, MVT::i8); + for (int i = 0; i < 16; ++i) { + int Idx = Offset + (i / Scale); + PSHUFBMask[i] = DAG.getConstant( + (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); + } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast(VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, @@ -7173,13 +7743,30 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( MVT::v16i8, PSHUFBMask))); } + // If we are extending from an offset, ensure we start on a boundary that + // we can unpack from. + int AlignToUnpack = Offset % (NumElements / Scale); + if (AlignToUnpack) { + SmallVector<int, 8> ShMask((unsigned)NumElements, -1); + for (int i = AlignToUnpack; i < NumElements; ++i) + ShMask[i - AlignToUnpack] = i; + InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); + Offset -= AlignToUnpack; + } + // Otherwise emit a sequence of unpacks. 
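The do/while loop that follows interleaves the input with zeros (or with undef for an any-extend) once per doubling of the element width, so an extension by a factor of Scale costs log2(Scale) unpacks; the new Offset handling simply pre-shifts to an unpackable boundary and switches to the high-half unpack when the source elements sit in the upper half. A minimal standalone model of the zero-extend case (plain C++ on a 16-byte array standing in for a v16i8 to v4i32 extension; not the DAG code itself):

#include <cstdint>
#include <cstdio>
#include <vector>

// Interleave the low half of A with the low half of B (the UNPCKL pattern
// for byte elements within one 128-bit lane).
std::vector<uint8_t> unpackLo(const std::vector<uint8_t> &A,
                              const std::vector<uint8_t> &B) {
  std::vector<uint8_t> R;
  for (size_t i = 0; i < A.size() / 2; ++i) {
    R.push_back(A[i]);
    R.push_back(B[i]);
  }
  return R;
}

int main() {
  std::vector<uint8_t> V = {1, 2, 3, 4, 5, 6, 7, 8,
                            9, 10, 11, 12, 13, 14, 15, 16};
  std::vector<uint8_t> Zero(16, 0);
  // Two rounds of unpack-with-zero turn bytes 0..3 into zero-extended
  // 32-bit little-endian values: Scale == 4, so log2(4) == 2 unpacks.
  std::vector<uint8_t> R = unpackLo(unpackLo(V, Zero), Zero);
  for (uint8_t b : R) std::printf("%u ", (unsigned)b);
  std::printf("\n"); // 1 0 0 0 2 0 0 0 3 0 0 0 4 0 0 0
}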
do { + unsigned UnpackLoHi = X86ISD::UNPCKL; + if (Offset >= (NumElements / 2)) { + UnpackLoHi = X86ISD::UNPCKH; + Offset -= (NumElements / 2); + } + MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) : getZeroVector(InputVT, Subtarget, DAG, DL); InputV = DAG.getBitcast(InputVT, InputV); - InputV = DAG.getNode(X86ISD::UNPCKL, DL, InputVT, InputV, Ext); + InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); Scale /= 2; EltBits *= 2; NumElements /= 2; @@ -7205,7 +7792,9 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Bits = VT.getSizeInBits(); + int NumLanes = Bits / 128; int NumElements = VT.getVectorNumElements(); + int NumEltsPerLane = NumElements / NumLanes; assert(VT.getScalarSizeInBits() <= 32 && "Exceeds 32-bit integer zero extension limit"); assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); @@ -7215,8 +7804,11 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( auto Lower = [&](int Scale) -> SDValue { SDValue InputV; bool AnyExt = true; + int Offset = 0; + int Matches = 0; for (int i = 0; i < NumElements; ++i) { - if (Mask[i] == -1) + int M = Mask[i]; + if (M == -1) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { // Each of the extended elements need to be zeroable. @@ -7230,14 +7822,29 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( // Each of the base elements needs to be consecutive indices into the // same input vector. - SDValue V = Mask[i] < NumElements ? V1 : V2; - if (!InputV) + SDValue V = M < NumElements ? V1 : V2; + M = M % NumElements; + if (!InputV) { InputV = V; - else if (InputV != V) + Offset = M - (i / Scale); + } else if (InputV != V) return SDValue(); // Flip-flopping inputs. - if (Mask[i] % NumElements != i / Scale) + // Offset must start in the lowest 128-bit lane or at the start of an + // upper lane. + // FIXME: Is it ever worth allowing a negative base offset? + if (!((0 <= Offset && Offset < NumEltsPerLane) || + (Offset % NumEltsPerLane) == 0)) + return SDValue(); + + // If we are offsetting, all referenced entries must come from the same + // lane. + if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) + return SDValue(); + + if ((M % NumElements) != (Offset + (i / Scale))) return SDValue(); // Non-consecutive strided elements. + Matches++; } // If we fail to find an input, we have a zero-shuffle which should always @@ -7246,8 +7853,13 @@ static SDValue lowerVectorShuffleAsZeroOrAnyExtend( if (!InputV) return SDValue(); + // If we are offsetting, don't extend if we only match a single input, we + // can always do better by using a basic PSHUF or PUNPCK. + if (Offset != 0 && Matches < 2) + return SDValue(); + return lowerVectorShuffleAsSpecificZeroOrAnyExtend( - DL, VT, Scale, AnyExt, InputV, Mask, Subtarget, DAG); + DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. @@ -7355,8 +7967,9 @@ static SDValue lowerVectorShuffleAsElementInsertion( // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. 
- if (SDValue V2S = getScalarValueForVectorElement( - V2, Mask[V2Index] - Mask.size(), DAG)) { + SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), + DAG); + if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || EltVT == MVT::i16) { @@ -7431,11 +8044,65 @@ static SDValue lowerVectorShuffleAsElementInsertion( return V2; } +/// \brief Try to lower broadcast of a single - truncated - integer element, +/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. +/// +/// This assumes we have AVX2. +static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0, + int BroadcastIdx, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(Subtarget->hasAVX2() && + "We can only lower integer broadcasts with AVX2!"); + + EVT EltVT = VT.getVectorElementType(); + EVT V0VT = V0.getValueType(); + + assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); + assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); + + EVT V0EltVT = V0VT.getVectorElementType(); + if (!V0EltVT.isInteger()) + return SDValue(); + + const unsigned EltSize = EltVT.getSizeInBits(); + const unsigned V0EltSize = V0EltVT.getSizeInBits(); + + // This is only a truncation if the original element type is larger. + if (V0EltSize <= EltSize) + return SDValue(); + + assert(((V0EltSize % EltSize) == 0) && + "Scalar type sizes must all be powers of 2 on x86!"); + + const unsigned V0Opc = V0.getOpcode(); + const unsigned Scale = V0EltSize / EltSize; + const unsigned V0BroadcastIdx = BroadcastIdx / Scale; + + if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && + V0Opc != ISD::BUILD_VECTOR) + return SDValue(); + + SDValue Scalar = V0.getOperand(V0BroadcastIdx); + + // If we're extracting non-least-significant bits, shift so we can truncate. + // Hopefully, we can fold away the trunc/srl/load into the broadcast. + // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer + // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. + if (const int OffsetIdx = BroadcastIdx % Scale) + Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, + DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType())); + + return DAG.getNode(X86ISD::VBROADCAST, DL, VT, + DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); +} + /// \brief Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. +/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, ArrayRef<int> Mask, const X86Subtarget *Subtarget, @@ -7476,7 +8143,7 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, int BeginIdx = (int)ConstantIdx->getZExtValue(); int EndIdx = - BeginIdx + (int)VInner.getValueType().getVectorNumElements(); + BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { BroadcastIdx -= BeginIdx; V = VInner; @@ -7491,6 +8158,15 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. 
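The truncated-broadcast helper added above reduces "broadcast lane k of a value bitcast from wider elements" to scalar arithmetic: take source element k/Scale, shift it right by (k%Scale)*EltSize bits, truncate, then broadcast. A standalone sketch of just that arithmetic (plain C++; the constants are illustrative and assume x86's little-endian lane numbering, they are not taken from a particular test case):

#include <cstdint>
#include <cstdio>

// Broadcasting 16-bit lane BroadcastIdx of a vector that was bitcast from
// 64-bit elements: Scale = 64 / 16 = 4 sub-lanes per source element.
uint16_t truncBroadcastScalar(const uint64_t *Src, unsigned BroadcastIdx) {
  const unsigned EltSize = 16, V0EltSize = 64;
  const unsigned Scale = V0EltSize / EltSize;          // 4
  uint64_t Scalar = Src[BroadcastIdx / Scale];         // containing element
  unsigned OffsetIdx = BroadcastIdx % Scale;           // sub-lane within it
  if (OffsetIdx)
    Scalar >>= OffsetIdx * EltSize;                    // the SRL in the patch
  return (uint16_t)Scalar;                             // the TRUNCATE
}

int main() {
  uint64_t Src[2] = {0x4444333322221111ULL, 0x8888777766665555ULL};
  // Lane 6 of the little-endian v8i16 view is the third 16-bit chunk of
  // Src[1], i.e. 0x7777.
  std::printf("%#x\n", (unsigned)truncBroadcastScalar(Src, 6)); // 0x7777
}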
+ // First, look through bitcast: if the original value has a larger element + // type than the shuffle, the broadcast element is in essence truncated. + // Make that explicit to ease folding. + if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) + if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( + DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) + return TruncBroadcast; + + // Also check the simpler case, where we can directly reuse the scalar. if (V.getOpcode() == ISD::BUILD_VECTOR || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); @@ -7499,6 +8175,20 @@ static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, // Only AVX2 has register broadcasts. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); + } else if (MayFoldLoad(V) && !cast<LoadSDNode>(V)->isVolatile()) { + // If we are broadcasting a load that is only used by the shuffle + // then we can reduce the vector load to the broadcasted scalar load. + LoadSDNode *Ld = cast<LoadSDNode>(V); + SDValue BaseAddr = Ld->getOperand(1); + EVT AddrVT = BaseAddr.getValueType(); + EVT SVT = VT.getScalarType(); + unsigned Offset = BroadcastIdx * SVT.getStoreSize(); + SDValue NewAddr = DAG.getNode( + ISD::ADD, DL, AddrVT, BaseAddr, + DAG.getConstant(Offset, DL, AddrVT)); + V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Ld->getMemOperand(), Offset, SVT.getStoreSize())); } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { // We can't broadcast from a vector register without AVX2, and we can only // broadcast from the zero-element of a vector register. @@ -7595,9 +8285,10 @@ static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. -static SDValue lowerVectorShuffleAsUnpack(SDLoc DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { +static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef<int> Mask, + SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "This routine only supports integer vectors."); assert(!isSingleInputShuffleMask(Mask) && @@ -7774,10 +8465,9 @@ static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2f64, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) + return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, @@ -7869,10 +8559,9 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. 
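The long hard-coded isShuffleEquivalent lists being deleted throughout this file are replaced by lowerVectorShuffleWithUNPCK, which derives the UNPCKL/UNPCKH reference masks per 128-bit lane from a single formula. A standalone reproduction of that formula (plain C++), printing the v4i32 and v8i32 cases so they can be compared with the literals removed in these hunks ({0,4,1,5}/{2,6,3,7} and {0,8,1,9,4,12,5,13}/{2,10,3,11,6,14,7,15}):

#include <cstdio>
#include <vector>

// Reference masks for UNPCKL/UNPCKH: elements interleave within each
// 128-bit lane, with the second operand's elements numbered from NumElts.
void unpackMasks(int NumElts, int EltBits) {
  int NumEltsInLane = 128 / EltBits;
  std::vector<int> Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2);
    Lo.push_back(LoPos);
    Hi.push_back(LoPos + NumEltsInLane / 2);
  }
  for (int M : Lo) std::printf("%d ", M);
  std::printf("| ");
  for (int M : Hi) std::printf("%d ", M);
  std::printf("\n");
}

int main() {
  unpackMasks(4, 32); // v4i32: 0 4 1 5 | 2 6 3 7
  unpackMasks(8, 32); // v8i32: 0 8 1 9 4 12 5 13 | 2 10 3 11 6 14 7 15
}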
- if (isShuffleEquivalent(V1, V2, Mask, {0, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v2i64, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) + return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. @@ -8077,14 +8766,9 @@ static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, } // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) + return V; // Otherwise fall back to a SHUFPS lowering strategy. return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); @@ -8161,14 +8845,9 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 6, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 5, 1})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {6, 2, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) + return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. @@ -8184,8 +8863,8 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, Mask, DAG); // Try to lower by permuting the inputs into an unpack instruction. - if (SDValue Unpack = - lowerVectorShuffleAsUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG)) + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, + V2, Mask, DAG)) return Unpack; // We implement this with SHUFPS because it can blend from two vectors. 
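Both the removed literal checks and the new helper rest on the same equivalence rule: an undef mask entry (-1) matches anything, every other entry must match the reference pattern exactly. A minimal standalone version of that comparison (plain C++; the real isShuffleEquivalent in this file also canonicalises identical operands and build_vector scalars, which is omitted here):

#include <vector>

// Returns true if Mask is compatible with the reference pattern Expected,
// treating -1 entries in Mask as wildcards.
bool maskMatches(const std::vector<int> &Mask,
                 const std::vector<int> &Expected) {
  if (Mask.size() != Expected.size())
    return false;
  for (size_t i = 0; i < Mask.size(); ++i)
    if (Mask[i] >= 0 && Mask[i] != Expected[i])
      return false;
  return true;
}

int main() {
  // <0, u, 1, 5> is still an UNPCKL pattern for v4i32 ({0, 4, 1, 5}).
  return maskMatches({0, -1, 1, 5}, {0, 4, 1, 5}) ? 0 : 1;
}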
@@ -8218,7 +8897,7 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, static SDValue lowerV8I16GeneralSingleInputVectorShuffle( SDLoc DL, MVT VT, SDValue V, MutableArrayRef<int> Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - assert(VT.getScalarType() == MVT::i16 && "Bad input type!"); + assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); assert(Mask.size() == 8 && "Shuffle mask length doen't match!"); @@ -8286,16 +8965,18 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( assert(AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); + bool ThreeAInputs = AToAInputs.size() == 3; + // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. int ADWord, BDWord; - int &TripleDWord = AToAInputs.size() == 3 ? ADWord : BDWord; - int &OneInputDWord = AToAInputs.size() == 3 ? BDWord : ADWord; - int TripleInputOffset = AToAInputs.size() == 3 ? AOffset : BOffset; - ArrayRef<int> TripleInputs = AToAInputs.size() == 3 ? AToAInputs : BToAInputs; - int OneInput = AToAInputs.size() == 3 ? BToAInputs[0] : AToAInputs[0]; + int &TripleDWord = ThreeAInputs ? ADWord : BDWord; + int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; + int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; + ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; + int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); @@ -8364,8 +9045,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle( FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); } else { assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); - int APinnedIdx = - AToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; + int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); } } @@ -8751,10 +9431,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Shift; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V1, Mask, {0, 0, 1, 1, 2, 2, 3, 3})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V1); - if (isShuffleEquivalent(V1, V1, Mask, {4, 4, 5, 5, 6, 6, 7, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + return V; // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, @@ -8798,10 +9477,9 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Masked; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 2, 10, 3, 11})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 12, 5, 13, 6, 14, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) + return V; // Try to use byte rotation instructions. 
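The byte-rotation path referenced here (whose PALIGNR operand order is also corrected earlier in this diff) hinges on recognising that a mask is a rotation: every defined entry i must equal (Start + i) modulo the element count for one consistent Start, which then scales to the PALIGNR byte immediate. A hedged standalone sketch of that detection for the single-input case only, assuming the usual convention result[i] = input[Mask[i]] (the in-tree matcher also handles two-input rotations):

#include <vector>

// If Mask (undef == -1) is a rotation of a single NumElts-element input,
// return the PALIGNR byte immediate (rotation * bytes-per-element),
// otherwise return -1. EltBytes would be 1 for v16i8, 2 for v8i16, ...
int matchRotationImm(const std::vector<int> &Mask, int EltBytes) {
  int NumElts = (int)Mask.size();
  int Start = -1;
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] < 0)
      continue;
    int Candidate = (Mask[i] - i + NumElts) % NumElts;
    if (Start < 0)
      Start = Candidate;
    else if (Start != Candidate)
      return -1; // not a consistent rotation
  }
  return Start < 0 ? -1 : Start * EltBytes;
}

int main() {
  // v8i16 mask <3,4,5,6,7,0,1,2> is a rotation by 3 -> PALIGNR imm 6.
  return matchRotationImm({3, 4, 5, 6, 7, 0, 1, 2}, 2) == 6 ? 0 : 1;
}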
if (SDValue Rotate = lowerVectorShuffleAsByteRotate( @@ -8812,8 +9490,8 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG)) return BitBlend; - if (SDValue Unpack = - lowerVectorShuffleAsUnpack(DL, MVT::v8i16, V1, V2, Mask, DAG)) + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, + V2, Mask, DAG)) return Unpack; // If we can't directly blend but can use PSHUFB, that will be better as it @@ -9037,17 +9715,14 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return V; } + if (SDValue Masked = + lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, DAG)) + return Masked; + // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {// Low half. - 0, 16, 1, 17, 2, 18, 3, 19, - // High half. - 4, 20, 5, 21, 6, 22, 7, 23})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {// Low half. - 8, 24, 9, 25, 10, 26, 11, 27, - // High half. - 12, 28, 13, 29, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG)) + return V; // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly // with PSHUFB. It is important to do this before we attempt to generate any @@ -9086,8 +9761,8 @@ static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, // FIXME: It might be worth trying to detect if the unpack-feeding // shuffles will both be pshufb, in which case we shouldn't bother with // this. - if (SDValue Unpack = - lowerVectorShuffleAsUnpack(DL, MVT::v16i8, V1, V2, Mask, DAG)) + if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack( + DL, MVT::v16i8, V1, V2, Mask, DAG)) return Unpack; } @@ -9296,7 +9971,7 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, int NumElements = VT.getVectorNumElements(); int SplitNumElements = NumElements / 2; - MVT ScalarVT = VT.getScalarType(); + MVT ScalarVT = VT.getVectorElementType(); MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2); // Rather than splitting build-vectors, just build two narrower build @@ -9308,7 +9983,7 @@ static SDValue splitAndLowerVectorShuffle(SDLoc DL, MVT VT, SDValue V1, MVT OrigVT = V.getSimpleValueType(); int OrigNumElements = OrigVT.getVectorNumElements(); int OrigSplitNumElements = OrigNumElements / 2; - MVT OrigScalarVT = OrigVT.getScalarType(); + MVT OrigScalarVT = OrigVT.getVectorElementType(); MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2); SDValue LoV, HiV; @@ -9478,7 +10153,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(SDLoc DL, MVT VT, ArrayRef<int> Mask, SelectionDAG &DAG) { // FIXME: This should probably be generalized for 512-bit vectors as well. - assert(VT.getSizeInBits() == 256 && "Only for 256-bit vector shuffles!"); + assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int LaneSize = Mask.size() / 2; // If there are only inputs from one 128-bit lane, splitting will in fact be @@ -9682,6 +10357,108 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); } +/// Lower shuffles where an entire half of a 256-bit vector is UNDEF. +/// This allows for fast cases such as subvector extraction/insertion +/// or shuffling smaller vector types which can lower more efficiently. 
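The lowerVectorShuffleWithUndefHalf routine introduced right after this comment works purely on the mask: if one half of the result is entirely undef, the live half can be computed as a narrower shuffle of extracted half-vectors and reinserted. A standalone sketch of the mask-side bookkeeping only (plain C++; source halves are numbered as in the patch, 0 = lo(V1), 1 = hi(V1), 2 = lo(V2), 3 = hi(V2), and the patch's restriction to halves 0 and 2 is not enforced here):

#include <cstdio>
#include <vector>

// For a 2*Half-element shuffle whose upper half is undef, rewrite the lower
// half as a Half-element mask plus, per lane, which source half it reads.
bool narrowUndefUpperHalf(const std::vector<int> &Mask, int Half,
                          std::vector<int> &HalfMask,
                          std::vector<int> &HalfSrcs) {
  for (int i = Half; i < 2 * Half; ++i)
    if (Mask[i] >= 0)
      return false; // upper half is not all undef
  for (int i = 0; i < Half; ++i) {
    int M = Mask[i];
    HalfMask.push_back(M < 0 ? -1 : M % Half);
    HalfSrcs.push_back(M < 0 ? -1 : M / Half);
  }
  return true;
}

int main() {
  // v8i32-style shuffle <0, 9, 1, 8, u, u, u, u>: uses lo(V1) and lo(V2).
  std::vector<int> HalfMask, HalfSrcs;
  bool OK = narrowUndefUpperHalf({0, 9, 1, 8, -1, -1, -1, -1}, 4,
                                 HalfMask, HalfSrcs);
  std::printf("%d: ", (int)OK);
  for (size_t i = 0; i < HalfMask.size(); ++i)
    std::printf("(%d,%d) ", HalfSrcs[i], HalfMask[i]);
  std::printf("\n"); // 1: (0,0) (2,1) (0,1) (2,0)
}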
+static SDValue lowerVectorShuffleWithUndefHalf(SDLoc DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + assert(VT.getSizeInBits() == 256 && "Expected 256-bit vector"); + + unsigned NumElts = VT.getVectorNumElements(); + unsigned HalfNumElts = NumElts / 2; + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts); + + bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts); + bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts); + if (!UndefLower && !UndefUpper) + return SDValue(); + + // Upper half is undef and lower half is whole upper subvector. + // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> + if (UndefUpper && + isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(HalfNumElts, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(0, DL)); + } + + // Lower half is undef and upper half is whole lower subvector. + // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> + if (UndefLower && + isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { + SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, + DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, + DAG.getIntPtrConstant(HalfNumElts, DL)); + } + + // AVX2 supports efficient immediate 64-bit element cross-lane shuffles. + if (UndefLower && Subtarget->hasAVX2() && + (VT == MVT::v4f64 || VT == MVT::v4i64)) + return SDValue(); + + // If the shuffle only uses the lower halves of the input operands, + // then extract them and perform the 'half' shuffle at half width. + // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u> + int HalfIdx1 = -1, HalfIdx2 = -1; + SmallVector<int, 8> HalfMask; + unsigned Offset = UndefLower ? HalfNumElts : 0; + for (unsigned i = 0; i != HalfNumElts; ++i) { + int M = Mask[i + Offset]; + if (M < 0) { + HalfMask.push_back(M); + continue; + } + + // Determine which of the 4 half vectors this element is from. + // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. + int HalfIdx = M / HalfNumElts; + + // Only shuffle using the lower halves of the inputs. + // TODO: Investigate usefulness of shuffling with upper halves. + if (HalfIdx != 0 && HalfIdx != 2) + return SDValue(); + + // Determine the element index into its half vector source. + int HalfElt = M % HalfNumElts; + + // We can shuffle with up to 2 half vectors, set the new 'half' + // shuffle mask accordingly. + if (-1 == HalfIdx1 || HalfIdx1 == HalfIdx) { + HalfMask.push_back(HalfElt); + HalfIdx1 = HalfIdx; + continue; + } + if (-1 == HalfIdx2 || HalfIdx2 == HalfIdx) { + HalfMask.push_back(HalfElt + HalfNumElts); + HalfIdx2 = HalfIdx; + continue; + } + + // Too many half vectors referenced. + return SDValue(); + } + assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); + + auto GetHalfVector = [&](int HalfIdx) { + if (HalfIdx < 0) + return DAG.getUNDEF(HalfVT); + SDValue V = (HalfIdx < 2 ? 
V1 : V2); + HalfIdx = (HalfIdx % 2) * HalfNumElts; + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, + DAG.getIntPtrConstant(HalfIdx, DL)); + }; + + SDValue Half1 = GetHalfVector(HalfIdx1); + SDValue Half2 = GetHalfVector(HalfIdx2); + SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, + DAG.getIntPtrConstant(Offset, DL)); +} + /// \brief Test whether the specified input (0 or 1) is in-place blended by the /// given mask. /// @@ -9776,16 +10553,10 @@ static SDValue lowerV4F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, DAG); } - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4f64, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4f64, V2, V1); + // Use dedicated unpack instructions for masks that match their pattern. + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) + return V; if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) @@ -9876,14 +10647,9 @@ static SDValue lowerV4I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Shift; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 4, 2, 6})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 5, 3, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {4, 0, 6, 2})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i64, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {5, 1, 7, 3})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i64, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) + return V; // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. However, if we have AVX2 and either inputs are already in place, @@ -9941,14 +10707,9 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) + return V; // Otherwise, fall back to a SHUFPS sequence. Here it is important that we // have already handled any direct blends. 
We also need to squash the @@ -9974,9 +10735,7 @@ static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, if (Subtarget->hasAVX2()) return DAG.getNode( X86ISD::VPERMV, DL, MVT::v8f32, - DAG.getBitcast(MVT::v8f32, DAG.getNode(ISD::BUILD_VECTOR, DL, - MVT::v8i32, VPermMask)), - V1); + DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask), V1); // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, @@ -10041,14 +10800,9 @@ static SDValue lowerV8I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 1, 9, 4, 12, 5, 13})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 10, 3, 11, 6, 14, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {8, 0, 9, 1, 12, 4, 13, 5})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i32, V2, V1); - if (isShuffleEquivalent(V1, V2, Mask, {10, 2, 11, 3, 14, 6, 15, 7})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i32, V2, V1); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) + return V; } // Try to use shift instructions. @@ -10115,18 +10869,9 @@ static SDValue lowerV16I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane: - 0, 16, 1, 17, 2, 18, 3, 19, - // Second 128-bit lane: - 8, 24, 9, 25, 10, 26, 11, 27})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane: - 4, 20, 5, 21, 6, 22, 7, 23, - // Second 128-bit lane: - 12, 28, 13, 29, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) + return V; // Try to use shift instructions. if (SDValue Shift = @@ -10215,22 +10960,9 @@ static SDValue lowerV32I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, return Blend; // Use dedicated unpack instructions for masks that match their pattern. - // Note that these are repeated 128-bit lane unpacks, not unpacks across all - // 256-bit lanes. - if (isShuffleEquivalent( - V1, V2, Mask, - {// First 128-bit lane: - 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, - // Second 128-bit lane: - 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v32i8, V1, V2); - if (isShuffleEquivalent( - V1, V2, Mask, - {// First 128-bit lane: - 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, - // Second 128-bit lane: - 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v32i8, V1, V2); + if (SDValue V = + lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) + return V; // Try to use shift instructions. if (SDValue Shift = @@ -10296,12 +11028,17 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, DL, VT, V1, V2, Mask, Subtarget, DAG)) return Insertion; - // There is a really nice hard cut-over between AVX1 and AVX2 that means we can - // check for those subtargets here and avoid much of the subtarget querying in - // the per-vector-type lowering routines. 
With AVX1 we have essentially *zero* - // ability to manipulate a 256-bit vector with integer types. Since we'll use - // floating point types there eventually, just immediately cast everything to - // a float and operate entirely in that domain. + // Handle special cases where the lower or upper half is UNDEF. + if (SDValue V = + lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) + return V; + + // There is a really nice hard cut-over between AVX1 and AVX2 that means we + // can check for those subtargets here and avoid much of the subtarget + // querying in the per-vector-type lowering routines. With AVX1 we have + // essentially *zero* ability to manipulate a 256-bit vector with integer + // types. Since we'll use floating point types there eventually, just + // immediately cast everything to a float and operate entirely in that domain. if (VT.isInteger() && !Subtarget->hasAVX2()) { int ElementBits = VT.getScalarSizeInBits(); if (ElementBits < 32) @@ -10334,6 +11071,57 @@ static SDValue lower256BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, } } +/// \brief Try to lower a vector shuffle as a 128-bit shuffles. +static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, + ArrayRef<int> Mask, + SDValue V1, SDValue V2, + SelectionDAG &DAG) { + assert(VT.getScalarSizeInBits() == 64 && + "Unexpected element type size for 128bit shuffle."); + + // To handle 256 bit vector requires VLX and most probably + // function lowerV2X128VectorShuffle() is better solution. + assert(VT.is512BitVector() && "Unexpected vector size for 128bit shuffle."); + + SmallVector<int, 4> WidenedMask; + if (!canWidenShuffleElements(Mask, WidenedMask)) + return SDValue(); + + // Form a 128-bit permutation. + // Convert the 64-bit shuffle mask selection values into 128-bit selection + // bits defined by a vshuf64x2 instruction's immediate control byte. + unsigned PermMask = 0, Imm = 0; + unsigned ControlBitsNum = WidenedMask.size() / 2; + + for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) { + if (WidenedMask[i] == SM_SentinelZero) + return SDValue(); + + // Use first element in place of undef mask. + Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i]; + PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum); + } + + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getConstant(PermMask, DL, MVT::i8)); +} + +static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG) { + + assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV"); + + MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); + + SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); + if (isSingleInputShuffleMask(Mask)) + return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); + + return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); +} + /// \brief Handle lowering of 8-lane 64-bit floating point shuffles. static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, @@ -10345,21 +11133,21 @@ static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. 
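lowerV4X128VectorShuffle, added just above, encodes a widened 4-entry mask (one entry per 128-bit block: 0-3 for V1's blocks, 4-7 for V2's) into the 2-bits-per-block immediate of VSHUF64x2. A standalone reproduction of that packing (plain C++; undef entries default to block 0, matching the patch):

#include <cstdio>

// Pack a widened 4-entry 128-bit-block mask into a VSHUF64x2-style
// immediate: 2 control bits per destination block, reduced modulo 4 because
// the low two results select among V1's blocks and the high two among V2's.
unsigned shuf128Imm(const int WidenedMask[4]) {
  unsigned PermMask = 0, ControlBitsNum = 2; // WidenedMask.size() / 2
  for (int i = 0; i < 4; ++i) {
    unsigned Imm = WidenedMask[i] < 0 ? 0 : (unsigned)WidenedMask[i];
    PermMask |= (Imm % 4) << (i * ControlBitsNum);
  }
  return PermMask;
}

int main() {
  // A v8f64 mask <2,3, 0,1, 12,13, 10,11> widens to blocks {1, 0, 6, 5}.
  int Widened[4] = {1, 0, 6, 5};
  std::printf("%#x\n", shuf128Imm(Widened)); // 0x61 == 0b01'10'00'01
}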
- if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2); + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Shuf128; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit floating point shuffles. static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); @@ -10367,22 +11155,11 @@ static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 0, 16, 1, 17, 4, 20, 5, 21, - // Second 128-bit lane. - 8, 24, 9, 25, 12, 28, 13, 29})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 2, 18, 3, 19, 6, 22, 7, 23, - // Second 128-bit lane. - 10, 26, 11, 27, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) + return Unpck; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 8-lane 64-bit integer shuffles. @@ -10396,21 +11173,21 @@ static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - // X86 has dedicated unpack instructions that can handle specific blend - // operations: UNPCKH and UNPCKL. - if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2); + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Shuf128; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Unpck; + + return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); } /// \brief Handle lowering of 16-lane 32-bit integer shuffles. 
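Where no fixed-pattern instruction matches, the new AVX-512 paths above fall back to lowerVectorShuffleWithPERMV, which simply materialises the shuffle mask as a constant index vector and lets VPERMV/VPERMV3 perform a fully general permute. A standalone model of the two-source case (plain C++; indices address the concatenation of the two sources, and undef entries are mapped to index 0 purely for the illustration):

#include <cstdio>
#include <vector>

// Model of a two-source variable permute (VPERMV3-style): each result lane
// is selected by an index into the concatenation of the two sources.
std::vector<int> permv3(const std::vector<int> &V1, const std::vector<int> &V2,
                        const std::vector<int> &Mask) {
  std::vector<int> R;
  for (int M : Mask) {
    int Idx = M < 0 ? 0 : M; // undef lane: any value works, use 0
    R.push_back(Idx < (int)V1.size() ? V1[Idx] : V2[Idx - (int)V1.size()]);
  }
  return R;
}

int main() {
  auto R = permv3({10, 11, 12, 13}, {20, 21, 22, 23}, {3, 4, -1, 6});
  for (int X : R) std::printf("%d ", X); // 13 20 10 22
  std::printf("\n");
}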
static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { SDLoc DL(Op); assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); @@ -10418,22 +11195,11 @@ static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, ArrayRef<int> Mask = SVOp->getMask(); assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); - // Use dedicated unpack instructions for masks that match their pattern. - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 0, 16, 1, 17, 4, 20, 5, 21, - // Second 128-bit lane. - 8, 24, 9, 25, 12, 28, 13, 29})) - return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, - {// First 128-bit lane. - 2, 18, 3, 19, 6, 22, 7, 23, - // Second 128-bit lane. - 10, 26, 11, 27, 14, 30, 15, 31})) - return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2); + if (SDValue Unpck = + lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) + return Unpck; - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); } /// \brief Handle lowering of 32-lane 16-bit integer shuffles. @@ -10448,8 +11214,7 @@ static SDValue lowerV32I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); assert(Subtarget->hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); - // FIXME: Implement direct support for this type! - return splitAndLowerVectorShuffle(DL, MVT::v32i16, V1, V2, Mask, DAG); + return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); } /// \brief Handle lowering of 64-lane 8-bit integer shuffles. @@ -10517,6 +11282,60 @@ static SDValue lower512BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); } +// Lower vXi1 vector shuffles. +// There is no a dedicated instruction on AVX-512 that shuffles the masks. +// The only way to shuffle bits is to sign-extend the mask vector to SIMD +// vector, shuffle and then truncate it back. 
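A quick standalone illustration of the extend/shuffle/truncate idea used by the vXi1 lowering that follows (plain C++ with an 8-bit mask standing in for v8i1 and bytes standing in for the sign-extended SIMD lanes; not the DAG code itself):

#include <cstdint>
#include <cstdio>

// Shuffle the bits of an 8-bit "v8i1" by widening each bit to a byte lane,
// permuting the lanes, then packing the lanes back down to bits.
uint8_t shuffleMaskBits(uint8_t Bits, const int Mask[8]) {
  uint8_t Lanes[8], Shuffled[8], Out = 0;
  for (int i = 0; i < 8; ++i)
    Lanes[i] = (Bits >> i) & 1 ? 0xFF : 0x00;       // "sign-extend" each bit
  for (int i = 0; i < 8; ++i)
    Shuffled[i] = Mask[i] < 0 ? 0 : Lanes[Mask[i]]; // the vector shuffle
  for (int i = 0; i < 8; ++i)
    Out |= (uint8_t)((Shuffled[i] & 1) << i);       // truncate back to bits
  return Out;
}

int main() {
  const int Mask[8] = {7, 6, 5, 4, 3, 2, 1, 0};     // reverse the mask bits
  std::printf("%#04x\n", (unsigned)shuffleMaskBits(0x0F, Mask)); // 0xf0
}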
+static SDValue lower1BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, + MVT VT, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + SDLoc DL(Op); + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + ArrayRef<int> Mask = SVOp->getMask(); + assert(Subtarget->hasAVX512() && + "Cannot lower 512-bit vectors w/o basic ISA!"); + MVT ExtVT; + switch (VT.SimpleTy) { + default: + llvm_unreachable("Expected a vector of i1 elements"); + case MVT::v2i1: + ExtVT = MVT::v2i64; + break; + case MVT::v4i1: + ExtVT = MVT::v4i32; + break; + case MVT::v8i1: + ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL + break; + case MVT::v16i1: + ExtVT = MVT::v16i32; + break; + case MVT::v32i1: + ExtVT = MVT::v32i16; + break; + case MVT::v64i1: + ExtVT = MVT::v64i8; + break; + } + + if (ISD::isBuildVectorAllZeros(V1.getNode())) + V1 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V1.getNode())) + V1 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); + + if (V2.isUndef()) + V2 = DAG.getUNDEF(ExtVT); + else if (ISD::isBuildVectorAllZeros(V2.getNode())) + V2 = getZeroVector(ExtVT, Subtarget, DAG, DL); + else if (ISD::isBuildVectorAllOnes(V2.getNode())) + V2 = getOnesVector(ExtVT, Subtarget, DAG, DL); + else + V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); + return DAG.getNode(ISD::TRUNCATE, DL, VT, + DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask)); +} /// \brief Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 @@ -10533,8 +11352,10 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); SDLoc dl(Op); + bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); - assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); + assert((VT.getSizeInBits() != 64 || Is1BitVector) && + "Can't lower MMX shuffles"); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; @@ -10572,7 +11393,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, // elements wider than 64 bits, but it might be interesting to form i128 // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector<int, 16> WidenedMask; - if (VT.getScalarSizeInBits() < 64 && + if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && canWidenShuffleElements(Mask, WidenedMask)) { MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) @@ -10640,17 +11461,17 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, } // For each vector width, delegate to a specialized lowering routine. - if (VT.getSizeInBits() == 128) + if (VT.is128BitVector()) return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); - if (VT.getSizeInBits() == 256) + if (VT.is256BitVector()) return lower256BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); - // Force AVX-512 vectors to be scalarized for now. - // FIXME: Implement AVX-512 support! 
- if (VT.getSizeInBits() == 512) + if (VT.is512BitVector()) return lower512BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); + if (Is1BitVector) + return lower1BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } @@ -10661,11 +11482,16 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, unsigned &MaskValue) { MaskValue = 0; unsigned NumElems = BuildVector->getNumOperands(); + // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. + // We don't handle the >2 lanes case right now. unsigned NumLanes = (NumElems - 1) / 8 + 1; + if (NumLanes > 2) + return false; + unsigned NumElemsInLane = NumElems / NumLanes; - // Blend for v16i16 should be symetric for the both lanes. + // Blend for v16i16 should be symmetric for the both lanes. for (unsigned i = 0; i < NumElemsInLane; ++i) { SDValue EltCond = BuildVector->getOperand(i); SDValue SndLaneEltCond = @@ -10673,20 +11499,25 @@ static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector, int Lane1Cond = -1, Lane2Cond = -1; if (isa<ConstantSDNode>(EltCond)) - Lane1Cond = !isZero(EltCond); + Lane1Cond = !isNullConstant(EltCond); if (isa<ConstantSDNode>(SndLaneEltCond)) - Lane2Cond = !isZero(SndLaneEltCond); + Lane2Cond = !isNullConstant(SndLaneEltCond); + unsigned LaneMask = 0; if (Lane1Cond == Lane2Cond || Lane2Cond < 0) // Lane1Cond != 0, means we want the first argument. // Lane1Cond == 0, means we want the second argument. // The encoding of this argument is 0 for the first argument, 1 // for the second. Therefore, invert the condition. - MaskValue |= !Lane1Cond << i; + LaneMask = !Lane1Cond << i; else if (Lane1Cond < 0) - MaskValue |= !Lane2Cond << i; + LaneMask = !Lane2Cond << i; else return false; + + MaskValue |= LaneMask; + if (NumLanes == 2) + MaskValue |= LaneMask << NumElemsInLane; } return true; } @@ -10711,7 +11542,8 @@ static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) { SDValue CondElt = CondBV->getOperand(i); Mask.push_back( - isa<ConstantSDNode>(CondElt) ? i + (isZero(CondElt) ? Size : 0) : -1); + isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0) + : -1); } return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask); } @@ -10776,9 +11608,8 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { } if (VT.getSizeInBits() == 16) { - unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); // If Idx is 0, it's cheaper to do a move instead of a pextrw. - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return DAG.getNode( ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, @@ -10801,8 +11632,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { return SDValue(); SDNode *User = *Op.getNode()->use_begin(); if ((User->getOpcode() != ISD::STORE || - (isa<ConstantSDNode>(Op.getOperand(1)) && - cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && + isNullConstant(Op.getOperand(1))) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); @@ -10900,10 +11730,11 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, MVT EltVT = VecVT.getVectorElementType(); unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); + assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); - //if (IdxVal >= NumElems/2) - // IdxVal -= NumElems/2; - IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk; + // Find IdxVal modulo ElemsPerChunk. 
Since ElemsPerChunk is a power of 2 + // this can be done with a mask. + IdxVal &= ElemsPerChunk - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, DAG.getConstant(IdxVal, dl, MVT::i32)); } @@ -10918,8 +11749,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // TODO: handle v16i8. if (VT.getSizeInBits() == 16) { SDValue Vec = Op.getOperand(0); - unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), @@ -10951,8 +11781,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught // to match extract_elt for f64. - unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - if (Idx == 0) + if (isNullConstant(Op.getOperand(1))) return Op; // UNPCKHPD the element to the lowest double word, then movsd. @@ -11039,7 +11868,9 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Insert the element into the desired chunk. unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits(); - unsigned IdxIn128 = IdxVal - (IdxVal / NumEltsIn128) * NumEltsIn128; + assert(isPowerOf2_32(NumEltsIn128)); + // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. + unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, DAG.getConstant(IdxIn128, dl, MVT::i32)); @@ -11078,8 +11909,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. - const Function *F = DAG.getMachineFunction().getFunction(); - bool MinSize = F->hasFnAttribute(Attribute::MinSize); + bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather @@ -11199,14 +12029,25 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, // --> load32 addr if ((IdxVal == OpVT.getVectorNumElements() / 2) && Vec.getOpcode() == ISD::INSERT_SUBVECTOR && - OpVT.is256BitVector() && SubVecVT.is128BitVector() && - !Subtarget->isUnalignedMem32Slow()) { - SDValue SubVec2 = Vec.getOperand(1); - if (auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2))) { - if (Idx2->getZExtValue() == 0) { - SDValue Ops[] = { SubVec2, SubVec }; - if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) - return Ld; + OpVT.is256BitVector() && SubVecVT.is128BitVector()) { + auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2)); + if (Idx2 && Idx2->getZExtValue() == 0) { + SDValue SubVec2 = Vec.getOperand(1); + // If needed, look through a bitcast to get to the load. 
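A tiny check of the modulo-by-mask identity both index rewrites above rely on; this is a sketch outside the patch and assumes, as the new asserts do, that the chunk size is a power of two:

#include <cassert>

// For a power-of-two chunk size, idx % chunk == idx & (chunk - 1).
static unsigned idxInChunk(unsigned IdxVal, unsigned ElemsPerChunk) {
  assert((ElemsPerChunk & (ElemsPerChunk - 1)) == 0 && "must be a power of 2");
  return IdxVal & (ElemsPerChunk - 1);   // e.g. 13 & (4 - 1) == 1 == 13 % 4
}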
+ if (SubVec2.getNode() && SubVec2.getOpcode() == ISD::BITCAST) + SubVec2 = SubVec2.getOperand(0); + + if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) { + bool Fast; + unsigned Alignment = FirstLd->getAlignment(); + unsigned AS = FirstLd->getAddressSpace(); + const X86TargetLowering *TLI = Subtarget->getTargetLowering(); + if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), + OpVT, AS, Alignment, &Fast) && Fast) { + SDValue Ops[] = { SubVec2, SubVec }; + if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false)) + return Ld; + } } } } @@ -11218,37 +12059,9 @@ static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, if (OpVT.is512BitVector() && SubVecVT.is256BitVector()) return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl); - if (OpVT.getVectorElementType() == MVT::i1) { - if (IdxVal == 0 && Vec.getOpcode() == ISD::UNDEF) // the operation is legal - return Op; - SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); - SDValue Undef = DAG.getUNDEF(OpVT); - unsigned NumElems = OpVT.getVectorNumElements(); - SDValue ShiftBits = DAG.getConstant(NumElems/2, dl, MVT::i8); - - if (IdxVal == OpVT.getVectorNumElements() / 2) { - // Zero upper bits of the Vec - Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); - - SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, - SubVec, ZeroIdx); - Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); - return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); - } - if (IdxVal == 0) { - SDValue Vec2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, - SubVec, ZeroIdx); - // Zero upper bits of the Vec2 - Vec2 = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec2, ShiftBits); - Vec2 = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec2, ShiftBits); - // Zero lower bits of the Vec - Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); - // Merge them together - return DAG.getNode(ISD::OR, dl, OpVT, Vec, Vec2); - } - } + if (OpVT.getVectorElementType() == MVT::i1) + return Insert1BitVector(Op, DAG); + return SDValue(); } @@ -11363,7 +12176,8 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // load. if (isGlobalStubReference(OpFlag)) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); return Result; } @@ -11430,7 +12244,8 @@ X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl, // load. if (isGlobalStubReference(OpFlags)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); // If there was a non-zero offset that we didn't fold, create an explicit // addition for it. @@ -11587,7 +12402,8 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, } Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, - MachinePointerInfo::getGOT(), false, false, false, 0); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + false, false, false, 0); } // The address of the thread local variable is the add of the thread @@ -11599,10 +12415,18 @@ SDValue X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); + + // Cygwin uses emutls. 
+ // FIXME: It may be EmulatedTLS-generic also for X86-Android. + if (Subtarget->isTargetWindowsCygwin()) + return LowerToTLSEmulatedModel(GA, DAG); + const GlobalValue *GV = GA->getGlobal(); auto PtrVT = getPointerTy(DAG.getDataLayout()); if (Subtarget->isTargetELF()) { + if (DAG.getTarget().Options.EmulatedTLS) + return LowerToTLSEmulatedModel(GA, DAG); TLSModel::Model model = DAG.getTarget().getTLSModel(GV); switch (model) { case TLSModel::GeneralDynamic: @@ -11830,10 +12654,10 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, auto PtrVT = getPointerTy(MF.getDataLayout()); int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); - SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), - StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + SDValue Chain = DAG.getStore( + DAG.getEntryNode(), dl, Op.getOperand(0), StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), false, + false, 0); return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); } @@ -11855,10 +12679,9 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, MachineMemOperand *MMO; if (FI) { int SSFI = FI->getIndex(); - MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, ByteSize, ByteSize); + MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, ByteSize, ByteSize); } else { MMO = cast<LoadSDNode>(StackSlot)->getMemOperand(); StackSlot = StackSlot.getOperand(1); @@ -11884,16 +12707,16 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue Ops[] = { Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag }; - MachineMemOperand *MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, SSFISize, SSFISize); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOStore, SSFISize, SSFISize); Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, Ops, Op.getValueType(), MMO); - Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, false, 0); + Result = DAG.getLoad( + Op.getValueType(), DL, Chain, StackSlot, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + false, false, false, 0); } return Result; @@ -11937,16 +12760,19 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, // Load the 64-bit value into an XMM register. 
SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); - SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + SDValue CLod0 = + DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); - SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + SDValue CLod1 = + DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); + // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); SDValue Result; @@ -11996,10 +12822,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); // Subtract the bias. + // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); // Handle final rounding. - EVT DestVT = Op.getValueType(); + MVT DestVT = Op.getSimpleValueType(); if (DestVT.bitsLT(MVT::f64)) return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, @@ -12025,14 +12852,23 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); // return (float4) lo + fhi; + // We shouldn't use it when unsafe-fp-math is enabled though: we might later + // reassociate the two FADDs, and if we do that, the algorithm fails + // spectacularly (PR24512). + // FIXME: If we ever have some kind of Machine FMF, this should be marked + // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because + // there's also the MachineCombiner reassociations happening on Machine IR. + if (DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + SDLoc DL(Op); SDValue V = Op->getOperand(0); - EVT VecIntVT = V.getValueType(); + MVT VecIntVT = V.getSimpleValueType(); bool Is128 = VecIntVT == MVT::v4i32; - EVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; + MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; // If we convert to something else than the supported type, e.g., to v4f64, // abort early. - if (VecFloatVT != Op->getValueType(0)) + if (VecFloatVT != Op->getSimpleValueType(0)) return SDValue(); unsigned NumElts = VecIntVT.getVectorNumElements(); @@ -12070,7 +12906,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, SDValue Low, High; if (Subtarget.hasSSE41()) { - EVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; + MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); @@ -12108,6 +12944,7 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); + // TODO: Are there any fast-math-flags to propagate here? 
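A single-lane model of the conversion trick quoted in the comments of lowerUINT_TO_FP_vXi32; this is an illustrative sketch, not part of the patch. The 0x4b000000 low-half bias comes from the comment above, while 0x53000000 is assumed here as the matching high-half bias, i.e. the float encoding of 2^39:

#include <cstdint>
#include <cstring>

// Build 2^23 + lo and 2^39 + hi*2^16 by OR-ing the two halves of v into float
// bit patterns, cancel the biases, and let the final add perform the only
// rounding. Reassociating the two adds breaks this, which is why the code
// above bails out under unsafe-fp-math (PR24512).
static float u32ToF32Model(uint32_t V) {
  uint32_t LoBits = 0x4B000000u | (V & 0xFFFFu);   // float bits of 2^23 + lo
  uint32_t HiBits = 0x53000000u | (V >> 16);       // assumed bias: 2^39 + hi*2^16
  float FLo, FHi;
  std::memcpy(&FLo, &LoBits, sizeof(float));
  std::memcpy(&FHi, &HiBits, sizeof(float));
  FHi -= 0x1.0p39f + 0x1.0p23f;                    // fhi = hi*2^16 - 2^23, exact
  return FLo + FHi;                                // rounds once: (float)V
}

For example, u32ToF32Model(0xFFFFFFFFu) evaluates to 4294967296.0f, the correctly rounded float for 2^32 - 1, because every step before the final add is exact.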
SDValue FHigh = DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd); // return (float4) lo + fhi; @@ -12137,11 +12974,10 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, return lowerUINT_TO_FP_vXi32(Op, DAG, *Subtarget); case MVT::v16i8: case MVT::v16i16: - if (Subtarget->hasAVX512()) - return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); + assert(Subtarget->hasAVX512()); + return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); } - llvm_unreachable(nullptr); } SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, @@ -12150,7 +12986,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); - if (Op.getValueType().isVector()) + if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG); // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't @@ -12161,6 +12997,14 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, MVT SrcVT = N0.getSimpleValueType(); MVT DstVT = Op.getSimpleValueType(); + + if (Subtarget->hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && + (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget->is64Bit()))) { + // Conversions from unsigned i32 to f32/f64 are legal, + // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. + return Op; + } + if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) return LowerUINT_TO_FP_i64(Op, DAG); if (SrcVT == MVT::i32 && X86ScalarSSEf64) @@ -12193,10 +13037,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // we must be careful to do the computation in x87 extended precision, not // in SSE. (The generic code can't know it's OK to do this, or how to.) int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); - MachineMemOperand *MMO = - DAG.getMachineFunction() - .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, 8, 8); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), + MachineMemOperand::MOLoad, 8, 8); SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; @@ -12223,24 +13066,52 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, // Load the value out, extending it from f32 to f80. // FIXME: Avoid the extend by constructing the right constant pool? - SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), - FudgePtr, MachinePointerInfo::getConstantPool(), - MVT::f32, false, false, false, 4); + SDValue Fudge = DAG.getExtLoad( + ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, + false, false, false, 4); // Extend everything to 80 bits to force it to be done on x87. + // TODO: Are there any fast-math-flags to propagate here? SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0, dl)); } +// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation +// is legal, or has an fp128 or f16 source (which needs to be promoted to f32), +// just return an <SDValue(), SDValue()> pair. +// Otherwise it is assumed to be a conversion from one of f32, f64 or f80 +// to i16, i32 or i64, and we lower it to a legal sequence. +// If lowered to the final integer result we return a <result, SDValue()> pair. 
+// Otherwise we lower it to a sequence ending with a FIST, return a +// <FIST, StackSlot> pair, and the caller is responsible for loading +// the final integer result from StackSlot. std::pair<SDValue,SDValue> -X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, - bool IsSigned, bool IsReplace) const { +X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, + bool IsSigned, bool IsReplace) const { SDLoc DL(Op); EVT DstTy = Op.getValueType(); + EVT TheVT = Op.getOperand(0).getValueType(); auto PtrVT = getPointerTy(DAG.getDataLayout()); - if (!IsSigned && !isIntegerTypeFTOL(DstTy)) { + if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { + // f16 must be promoted before using the lowering in this routine. + // fp128 does not use this lowering. + return std::make_pair(SDValue(), SDValue()); + } + + // If using FIST to compute an unsigned i64, we'll need some fixup + // to handle values above the maximum signed i64. A FIST is always + // used for the 32-bit subtarget, but also for f80 on a 64-bit target. + bool UnsignedFixup = !IsSigned && + DstTy == MVT::i64 && + (!Subtarget->is64Bit() || + !isScalarFPTypeInSSEReg(TheVT)); + + if (!IsSigned && DstTy != MVT::i64 && !Subtarget->hasAVX512()) { + // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. + // The low 32 bits of the fist result will have the correct uint32 result. assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); DstTy = MVT::i64; } @@ -12258,42 +13129,87 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) return std::make_pair(SDValue(), SDValue()); - // We lower FP->int64 either into FISTP64 followed by a load from a temporary - // stack slot, or into the FTOL runtime function. + // We lower FP->int64 into FISTP64 followed by a load from a temporary + // stack slot. MachineFunction &MF = DAG.getMachineFunction(); unsigned MemSize = DstTy.getSizeInBits()/8; int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); unsigned Opc; - if (!IsSigned && isIntegerTypeFTOL(DstTy)) - Opc = X86ISD::WIN_FTOL; - else - switch (DstTy.getSimpleVT().SimpleTy) { - default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); - case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; - case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; - case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; - } + switch (DstTy.getSimpleVT().SimpleTy) { + default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); + case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; + case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; + case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; + } SDValue Chain = DAG.getEntryNode(); SDValue Value = Op.getOperand(0); - EVT TheVT = Op.getOperand(0).getValueType(); + SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. + + if (UnsignedFixup) { + // + // Conversion to unsigned i64 is implemented with a select, + // depending on whether the source value fits in the range + // of a signed i64. Let Thresh be the FP equivalent of + // 0x8000000000000000ULL. + // + // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000; + // FistSrc = (Value < Thresh) ? Value : (Value - Thresh); + // Fist-to-mem64 FistSrc + // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent + // to XOR'ing the high 32 bits with Adjust. + // + // Being a power of 2, Thresh is exactly representable in all FP formats. 
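A scalar model of the UnsignedFixup flow described in the comment block above, given as a sketch rather than part of the patch. It mirrors the select, subtract and XOR steps; the cast stands in for the FIST store, which in the real sequence rounds according to the FP control word instead of truncating:

#include <cstdint>

// Values >= 2^63 cannot go through a signed FIST directly, so subtract the
// threshold first and put the lost top bit back by XOR-ing the high word.
static uint64_t fpToU64Model(double Value) {
  const double Thresh = 9223372036854775808.0;     // 2^63, exactly representable
  uint32_t Adjust = (Value < Thresh) ? 0u : 0x80000000u;
  double FistSrc  = (Value < Thresh) ? Value : Value - Thresh;
  uint64_t Fist   = (uint64_t)(int64_t)FistSrc;    // stands in for FIST-to-mem64
  return Fist ^ ((uint64_t)Adjust << 32);          // XOR high 32 bits with Adjust
}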
+ // For X87 we'd like to use the smallest FP type for this constant, but + // for DAG type consistency we have to match the FP operand type. + + APFloat Thresh(APFloat::IEEEsingle, APInt(32, 0x5f000000)); + LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK; + bool LosesInfo = false; + if (TheVT == MVT::f64) + // The rounding mode is irrelevant as the conversion should be exact. + Status = Thresh.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, + &LosesInfo); + else if (TheVT == MVT::f80) + Status = Thresh.convert(APFloat::x87DoubleExtended, + APFloat::rmNearestTiesToEven, &LosesInfo); + + assert(Status == APFloat::opOK && !LosesInfo && + "FP conversion should have been exact"); + + SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); + + SDValue Cmp = DAG.getSetCC(DL, + getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Adjust = DAG.getSelect(DL, MVT::i32, Cmp, + DAG.getConstant(0, DL, MVT::i32), + DAG.getConstant(0x80000000, DL, MVT::i32)); + SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal); + Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(), + *DAG.getContext(), TheVT), + Value, ThreshVal, ISD::SETLT); + Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub); + } + // FIXME This causes a redundant load/store if the SSE-class value is already // in memory, such as if it is on the callstack. if (isScalarFPTypeInSSEReg(TheVT)) { assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); Chain = DAG.getStore(Chain, DL, Value, StackSlot, - MachinePointerInfo::getFixedStack(SSFI), - false, false, 0); + MachinePointerInfo::getFixedStack(MF, SSFI), false, + false, 0); SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(TheVT) }; MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOLoad, MemSize, MemSize); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOLoad, MemSize, MemSize); Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO); Chain = Value.getValue(1); SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); @@ -12301,28 +13217,52 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, } MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, MemSize, MemSize); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, MemSize, MemSize); + + if (UnsignedFixup) { + + // Insert the FIST, load its result as two i32's, + // and XOR the high i32 with Adjust. + + SDValue FistOps[] = { Chain, Value, StackSlot }; + SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), + FistOps, DstTy, MMO); + + SDValue Low32 = DAG.getLoad(MVT::i32, DL, FIST, StackSlot, + MachinePointerInfo(), + false, false, false, 0); + SDValue HighAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackSlot, + DAG.getConstant(4, DL, PtrVT)); - if (Opc != X86ISD::WIN_FTOL) { + SDValue High32 = DAG.getLoad(MVT::i32, DL, FIST, HighAddr, + MachinePointerInfo(), + false, false, false, 0); + High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust); + + if (Subtarget->is64Bit()) { + // Join High32 and Low32 into a 64-bit result. 
+ // (High32 << 32) | Low32 + Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32); + High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32); + High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32, + DAG.getConstant(32, DL, MVT::i8)); + SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32); + return std::make_pair(Result, SDValue()); + } + + SDValue ResultOps[] = { Low32, High32 }; + + SDValue pair = IsReplace + ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps) + : DAG.getMergeValues(ResultOps, DL); + return std::make_pair(pair, SDValue()); + } else { // Build the FP_TO_INT*_IN_MEM SDValue Ops[] = { Chain, Value, StackSlot }; SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other), Ops, DstTy, MMO); return std::make_pair(FIST, StackSlot); - } else { - SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL, - DAG.getVTList(MVT::Other, MVT::Glue), - Chain, Value); - SDValue eax = DAG.getCopyFromReg(ftol, DL, X86::EAX, - MVT::i32, ftol.getValue(1)); - SDValue edx = DAG.getCopyFromReg(eax.getValue(1), DL, X86::EDX, - MVT::i32, eax.getValue(2)); - SDValue Ops[] = { eax, edx }; - SDValue pair = IsReplace - ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops) - : DAG.getMergeValues(Ops, DL); - return std::make_pair(pair, SDValue()); } } @@ -12333,7 +13273,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); - if (VT.is512BitVector() || InVT.getScalarType() == MVT::i1) + if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In); // Optimize vectors in AVX mode: @@ -12426,6 +13366,62 @@ static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } +static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue In = Op.getOperand(0); + MVT InVT = In.getSimpleValueType(); + + assert(VT.getVectorElementType() == MVT::i1 && "Unexected vector type."); + + // Shift LSB to MSB and use VPMOVB2M - SKX. + unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && + Subtarget->hasBWI()) || // legal, will go to VPMOVB2M, VPMOVW2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() <= 16 && Subtarget->hasBWI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVB2M, VPMOVW2M + // Shift packed bytes not supported natively, bitcast to dword + MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT, + DAG.getBitcast(ExtVT, In), + DAG.getConstant(ShiftInx, DL, ExtVT)); + ShiftNode = DAG.getBitcast(InVT, ShiftNode); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + if ((InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && + Subtarget->hasDQI()) || // legal, will go to VPMOVD2M, VPMOVQ2M + ((InVT.is256BitVector() || InVT.is128BitVector()) && + InVT.getScalarSizeInBits() >= 32 && Subtarget->hasDQI() && + Subtarget->hasVLX())) { // legal, will go to VPMOVD2M, VPMOVQ2M + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode); + } + + // Shift LSB to MSB, extend if necessary and use TESTM. 
+ unsigned NumElts = InVT.getVectorNumElements(); + if (InVT.getSizeInBits() < 512 && + (InVT.getScalarType() == MVT::i8 || InVT.getScalarType() == MVT::i16 || + !Subtarget->hasVLX())) { + assert((NumElts == 8 || NumElts == 16) && "Unexected vector type."); + + // TESTD/Q should be used (if BW supported we use CVT2MASK above), + // so vector should be extended to packed dword/qword. + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); + InVT = ExtVT; + ShiftInx = InVT.getScalarSizeInBits() - 1; + } + + SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In, + DAG.getConstant(ShiftInx, DL, InVT)); + return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode); +} + SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); @@ -12443,42 +13439,17 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); - // move vector to mask - truncate solution for SKX - if (VT.getVectorElementType() == MVT::i1) { - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() <= 16 && - Subtarget->hasBWI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVW2M - if (InVT.is512BitVector() && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI()) - return Op; // legal, will go to VPMOVD2M, VPMOVQ2M - if ((InVT.is256BitVector() || InVT.is128BitVector()) - && InVT.getScalarSizeInBits() >= 32 && - Subtarget->hasDQI() && Subtarget->hasVLX()) - return Op; // legal, will go to VPMOVB2M, VPMOVQ2M - } - if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) { - if (VT.getVectorElementType().getSizeInBits() >=8) - return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); - - assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type"); - unsigned NumElts = InVT.getVectorNumElements(); - assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type"); - if (InVT.getSizeInBits() < 512) { - MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64; - In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); - InVT = ExtVT; - } - - SDValue OneV = - DAG.getConstant(APInt::getSignBit(InVT.getScalarSizeInBits()), DL, InVT); - SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In); - return DAG.getNode(X86ISD::TESTM, DL, VT, And, And); - } + if (VT.getVectorElementType() == MVT::i1) + return LowerTruncateVecI1(Op, DAG, Subtarget); + // vpmovqb/w/d, vpmovdb/w, vpmovwb + if (Subtarget->hasAVX512()) { + // word to byte only under BWI + if (InVT == MVT::v16i16 && !Subtarget->hasBWI()) // v16i16 -> v16i8 + return DAG.getNode(X86ISD::VTRUNC, DL, VT, + DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In)); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); + } if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { // On AVX2, v4i64 -> v4i32 becomes VPERMD. if (Subtarget->hasInt256()) { @@ -12583,7 +13554,8 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, /*IsSigned=*/ true, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. - if (!FIST.getNode()) return Op; + if (!FIST.getNode()) + return Op; if (StackSlot.getNode()) // Load the result. 
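A small model of the "shift LSB to MSB" step that LowerTruncateVecI1 performs before handing the value to VPMOVB2M/VPMOVW2M/VPMOVD2M or TESTM; a sketch only, shown for a hypothetical v16i32 input rather than any type the patch handles specially:

#include <cstdint>

// Truncating a vector of integers to a bit mask keeps only each element's
// LSB. Shifting that bit into the sign position is what lets a sign-bit
// instruction (or TESTM of the shifted value with itself) produce the mask.
static uint16_t truncToMask16(const uint32_t In[16]) {
  uint16_t K = 0;
  for (int i = 0; i < 16; ++i) {
    uint32_t Shifted = In[i] << 31;        // LSB moved to the MSB
    if (Shifted & 0x80000000u)             // sign bit set <=> LSB was 1
      K |= (uint16_t)(1u << i);
  }
  return K;
}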
@@ -12600,7 +13572,9 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ false, /*IsReplace=*/ false); SDValue FIST = Vals.first, StackSlot = Vals.second; - assert(FIST.getNode() && "Unexpected failure"); + // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. + if (!FIST.getNode()) + return Op; if (StackSlot.getNode()) // Load the result. @@ -12643,6 +13617,8 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); + // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to // decide if we should generate a 16-byte constant mask when we only need 4 or // 8 bytes for the scalar case. @@ -12650,11 +13626,16 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { MVT LogicVT; MVT EltVT; unsigned NumElts; - + if (VT.isVector()) { LogicVT = VT; EltVT = VT.getVectorElementType(); NumElts = VT.getVectorNumElements(); + } else if (IsF128) { + // SSE instructions are used for optimized f128 logical operations. + LogicVT = MVT::f128; + EltVT = VT; + NumElts = 1; } else { // There are no scalar bitwise logical SSE/AVX instructions, so we // generate a 16-byte vector constant and logic op even for the scalar case. @@ -12675,9 +13656,10 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CPIdx = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); - SDValue Mask = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, Alignment); + SDValue Mask = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, Alignment); SDValue Op0 = Op.getOperand(0); bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); @@ -12685,7 +13667,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR; SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; - if (VT.isVector()) + if (VT.isVector() || IsF128) return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); // For the scalar case extend to a 128-bit vector, perform the logic op, @@ -12704,6 +13686,7 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT SrcVT = Op1.getSimpleValueType(); + bool IsF128 = (VT == MVT::f128); // If second operand is smaller, extend it first. if (SrcVT.bitsLT(VT)) { @@ -12718,13 +13701,16 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. + assert((VT == MVT::f64 || VT == MVT::f32 || IsF128) && + "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = - VT == MVT::f64 ? APFloat::IEEEdouble : APFloat::IEEEsingle; + VT == MVT::f64 ? APFloat::IEEEdouble : + (IsF128 ? APFloat::IEEEquad : APFloat::IEEEsingle); const unsigned SizeInBits = VT.getSizeInBits(); SmallVector<Constant *, 4> CV( - VT == MVT::f64 ? 2 : 4, + VT == MVT::f64 ? 2 : (IsF128 ? 1 : 4), ConstantFP::get(*Context, APFloat(Sem, APInt(SizeInBits, 0)))); // First, clear all bits but the sign bit from the second operand (sign). 
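One scalar f64 lane of the three LogicOp choices picked in LowerFABSorFNEG above, as an illustrative sketch outside the patch: FABS clears the sign bit with an AND, FNEG flips it with an XOR, and the negated-abs case sets it with an OR.

#include <cstdint>
#include <cstring>

enum class SignOp { Abs, Neg, NegAbs };

static double applySignOp(double X, SignOp Op) {
  const uint64_t SignMask = 0x8000000000000000ULL;
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof(X));
  switch (Op) {
  case SignOp::Abs:    Bits &= ~SignMask; break;  // X86ISD::FAND with ~sign mask
  case SignOp::Neg:    Bits ^=  SignMask; break;  // X86ISD::FXOR with sign mask
  case SignOp::NegAbs: Bits |=  SignMask; break;  // X86ISD::FOR with sign mask
  }
  double Result;
  std::memcpy(&Result, &Bits, sizeof(Result));
  return Result;
}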
@@ -12737,11 +13723,13 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { // Perform all logic operations as 16-byte vectors because there are no // scalar FP logic instructions in SSE. This allows load folding of the // constants into the logic instructions. - MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; - SDValue Mask1 = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); - Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); + MVT LogicVT = (VT == MVT::f64) ? MVT::v2f64 : (IsF128 ? MVT::f128 : MVT::v4f32); + SDValue Mask1 = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); + if (!IsF128) + Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op1); SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op1, Mask1); // Next, clear the sign bit from the first operand (magnitude). @@ -12750,8 +13738,9 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { APFloat APF = Op0CN->getValueAPF(); // If the magnitude is a positive zero, the sign bit alone is enough. if (APF.isPosZero()) - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? SignBit : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, SignBit, + DAG.getIntPtrConstant(0, dl)); APF.clearSign(); CV[0] = ConstantFP::get(*Context, APF); } else { @@ -12761,18 +13750,21 @@ static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { } C = ConstantVector::get(CV); CPIdx = DAG.getConstantPool(C, PtrVT, 16); - SDValue Val = DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, - MachinePointerInfo::getConstantPool(), - false, false, false, 16); + SDValue Val = + DAG.getLoad(LogicVT, dl, DAG.getEntryNode(), CPIdx, + MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), + false, false, false, 16); // If the magnitude operand wasn't a constant, we need to AND out the sign. if (!isa<ConstantFPSDNode>(Op0)) { - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); + if (!IsF128) + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Op0); Val = DAG.getNode(X86ISD::FAND, dl, LogicVT, Op0, Val); } // OR the magnitude value with the sign bit. Val = DAG.getNode(X86ISD::FOR, dl, LogicVT, Val, SignBit); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, - DAG.getIntPtrConstant(0, dl)); + return IsF128 ? Val : + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcVT, Val, + DAG.getIntPtrConstant(0, dl)); } static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { @@ -12859,7 +13851,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget, return SDValue(); } - EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; // Cast all vectors into TestVT for PTEST. for (unsigned i = 0, e = VecIns.size(); i < e; ++i) @@ -12999,14 +13991,14 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { // An add of one will be selected as an INC. - if (C->getAPIntValue() == 1 && !Subtarget->slowIncDec()) { + if (C->isOne() && !Subtarget->slowIncDec()) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. 
- if (C->getAPIntValue().isAllOnesValue() && !Subtarget->slowIncDec()) { + if (C->isAllOnesValue() && !Subtarget->slowIncDec()) { Opcode = X86ISD::DEC; NumOperands = 1; break; @@ -13135,13 +14127,11 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl, /// equivalent. SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl, SelectionDAG &DAG) const { - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) { - if (C->getAPIntValue() == 0) - return EmitTest(Op0, X86CC, dl, DAG); + if (isNullConstant(Op1)) + return EmitTest(Op0, X86CC, dl, DAG); - if (Op0.getValueType() == MVT::i1) - llvm_unreachable("Unexpected comparison operation for MVT::i1 operands"); - } + assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) && + "Unexpected comparison operation for MVT::i1 operands"); if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 || Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) { @@ -13150,8 +14140,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, // if we're optimizing for size, however, as that'll allow better folding // of memory operations. if (Op0.getValueType() != MVT::i32 && Op0.getValueType() != MVT::i64 && - !DAG.getMachineFunction().getFunction()->hasFnAttribute( - Attribute::MinSize) && + !DAG.getMachineFunction().getFunction()->optForMinSize() && !Subtarget->isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; @@ -13188,6 +14177,9 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW, DAG.getConstant(8, dl, MVT::i8)); SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl); + + // Some 64-bit targets lack SAHF support, but they do support FCOMI. + assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl); } @@ -13261,13 +14253,8 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, /// This is because we still need one division to calculate the reciprocal and /// then we need two multiplies by that reciprocal as replacements for the /// original divisions. -bool X86TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { - return NumUsers > 1; -} - -static bool isAllOnes(SDValue V) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V); - return C && C->isAllOnesValue(); +unsigned X86TargetLowering::combineRepeatedFPDivisors() const { + return 2; } /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node @@ -13285,8 +14272,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, if (Op1.getOpcode() == ISD::SHL) std::swap(Op0, Op1); if (Op0.getOpcode() == ISD::SHL) { - if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) - if (And00C->getZExtValue() == 1) { + if (isOneConstant(Op0.getOperand(0))) { // If we looked past a truncate, check that it's only truncating away // known zeros. 
unsigned BitWidth = Op0.getValueSizeInBits(); @@ -13423,7 +14409,7 @@ static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - assert(Op0.getValueType().getVectorElementType() == MVT::i1 && + assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 && "Unexpected type for boolean compare operation"); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0, @@ -13467,8 +14453,8 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG, MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); - assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 8 && - Op.getValueType().getScalarType() == MVT::i1 && + assert(Op0.getSimpleValueType().getVectorElementType().getSizeInBits() >= 8 && + Op.getSimpleValueType().getVectorElementType() == MVT::i1 && "Cannot set masked compare for this operation"); ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); @@ -13515,7 +14501,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG) for (unsigned i = 0; i < n; ++i) { ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i)); - if (!Elt || Elt->isOpaque() || Elt->getValueType(0) != EVT) + if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT) return SDValue(); // Avoid underflow. @@ -13606,13 +14592,13 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (VT.is256BitVector() && !Subtarget->hasInt256()) return Lower256IntVSETCC(Op, DAG); - EVT OpVT = Op1.getValueType(); + MVT OpVT = Op1.getSimpleValueType(); if (OpVT.getVectorElementType() == MVT::i1) return LowerBoolVSETCC_AVX512(Op, DAG); bool MaskResult = (VT.getVectorElementType() == MVT::i1); if (Subtarget->hasAVX512()) { - if (Op1.getValueType().is512BitVector() || + if (Op1.getSimpleValueType().is512BitVector() || (Subtarget->hasBWI() && Subtarget->hasVLX()) || (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32)) return LowerIntVSETCC_AVX512(Op, DAG, Subtarget); @@ -13628,6 +14614,33 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC)); } + // Lower using XOP integer comparisons. + if ((VT == MVT::v16i8 || VT == MVT::v8i16 || + VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget->hasXOP()) { + // Translate compare code to XOP PCOM compare mode. + unsigned CmpMode = 0; + switch (SetCCOpcode) { + default: llvm_unreachable("Unexpected SETCC condition"); + case ISD::SETULT: + case ISD::SETLT: CmpMode = 0x00; break; + case ISD::SETULE: + case ISD::SETLE: CmpMode = 0x01; break; + case ISD::SETUGT: + case ISD::SETGT: CmpMode = 0x02; break; + case ISD::SETUGE: + case ISD::SETGE: CmpMode = 0x03; break; + case ISD::SETEQ: CmpMode = 0x04; break; + case ISD::SETNE: CmpMode = 0x05; break; + } + + // Are we comparing unsigned or signed integers? + unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) + ? X86ISD::VPCOMU : X86ISD::VPCOM; + + return DAG.getNode(Opc, dl, VT, Op0, Op1, + DAG.getConstant(CmpMode, dl, MVT::i8)); + } + // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. @@ -13777,7 +14790,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // Since SSE has no unsigned integer comparisons, we need to flip the sign // bits of the inputs before performing those operations. 
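One i32 lane of the FlipSigns trick applied just below, sketched outside the patch: XOR-ing both inputs with the sign bit maps unsigned order onto signed order, so the signed-only SSE greater-than compare yields the unsigned result.

#include <cstdint>

static bool unsignedGreaterViaSigned(uint32_t A, uint32_t B) {
  const uint32_t SignBit = 0x80000000u;   // APInt::getSignBit(32)
  int32_t SA = (int32_t)(A ^ SignBit);    // shift the unsigned range into
  int32_t SB = (int32_t)(B ^ SignBit);    // signed order
  return SA > SB;                         // equals (A > B) as unsigned
}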
if (FlipSigns) { - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl, VT); Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); @@ -13818,11 +14831,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && - Op1.getOpcode() == ISD::Constant && - cast<ConstantSDNode>(Op1)->isNullValue() && + isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { if (VT == MVT::i1) return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); return NewSetCC; @@ -13831,17 +14842,14 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of // these. - if (Op1.getOpcode() == ISD::Constant && - (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || - cast<ConstantSDNode>(Op1)->isNullValue()) && + if ((isOneConstant(Op1) || isNullConstant(Op1)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // If the input is a setcc, then reuse the input setcc or use a new one with // the inverted condition. if (Op0.getOpcode() == X86ISD::SETCC) { X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); - bool Invert = (CC == ISD::SETNE) ^ - cast<ConstantSDNode>(Op1)->isNullValue(); + bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); if (!Invert) return Op0; @@ -13854,8 +14862,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } } - if ((Op0.getValueType() == MVT::i1) && (Op1.getOpcode() == ISD::Constant) && - (cast<ConstantSDNode>(Op1)->getZExtValue() == 1) && + if ((Op0.getValueType() == MVT::i1) && isOneConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); @@ -13876,6 +14883,23 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return SetCC; } +SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue Carry = Op.getOperand(2); + SDValue Cond = Op.getOperand(3); + SDLoc DL(Op); + + assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only."); + X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get()); + + assert(Carry.getOpcode() != ISD::CARRY_FALSE); + SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); + SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry); + return DAG.getNode(X86ISD::SETCC, DL, Op.getValueType(), + DAG.getConstant(CC, DL, MVT::i8), Cmp.getValue(1)); +} + // isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
static bool isX86LogicalCmp(SDValue Op) { unsigned Opc = Op.getNode()->getOpcode(); @@ -13918,7 +14942,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Op1 = Op.getOperand(1); SDValue Op2 = Op.getOperand(2); SDLoc DL(Op); - EVT VT = Op1.getValueType(); + MVT VT = Op1.getSimpleValueType(); SDValue CC; // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops @@ -13927,7 +14951,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (Cond.getOpcode() == ISD::SETCC && ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) || (Subtarget->hasSSE1() && VT == MVT::f32)) && - VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) { + VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); int SSECC = translateX86FSETCC( cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); @@ -13961,12 +14985,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Convert to vectors, do a VSELECT, and convert back to scalar. // All of the conversions should be optimized away. - EVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; + MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); - EVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; + MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; VCmp = DAG.getBitcast(VCmpVT, VCmp); SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2); @@ -13980,26 +15004,26 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } } - if (VT.isVector() && VT.getScalarType() == MVT::i1) { - SDValue Op1Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) - Op1Scalar = ConvertI1VectorToInterger(Op1, DAG); - else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) - Op1Scalar = Op1.getOperand(0); - SDValue Op2Scalar; - if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) - Op2Scalar = ConvertI1VectorToInterger(Op2, DAG); - else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) - Op2Scalar = Op2.getOperand(0); - if (Op1Scalar.getNode() && Op2Scalar.getNode()) { - SDValue newSelect = DAG.getNode(ISD::SELECT, DL, - Op1Scalar.getValueType(), - Cond, Op1Scalar, Op2Scalar); - if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) - return DAG.getBitcast(VT, newSelect); - SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, - DAG.getIntPtrConstant(0, DL)); + if (VT.isVector() && VT.getVectorElementType() == MVT::i1) { + SDValue Op1Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode())) + Op1Scalar = ConvertI1VectorToInteger(Op1, DAG); + else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0)) + Op1Scalar = Op1.getOperand(0); + SDValue Op2Scalar; + if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode())) + Op2Scalar = ConvertI1VectorToInteger(Op2, DAG); + else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0)) + Op2Scalar = Op2.getOperand(0); + if (Op1Scalar.getNode() && Op2Scalar.getNode()) { + SDValue newSelect = DAG.getNode(ISD::SELECT, DL, + Op1Scalar.getValueType(), + Cond, Op1Scalar, Op2Scalar); + if (newSelect.getValueSizeInBits() == VT.getSizeInBits()) + return DAG.getBitcast(VT, newSelect); + 
SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec, + DAG.getIntPtrConstant(0, DL)); } } @@ -14026,22 +15050,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y if (Cond.getOpcode() == X86ISD::SETCC && Cond.getOperand(1).getOpcode() == X86ISD::CMP && - isZero(Cond.getOperand(1).getOperand(1))) { + isNullConstant(Cond.getOperand(1).getOperand(1))) { SDValue Cmp = Cond.getOperand(1); unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue(); - if ((isAllOnes(Op1) || isAllOnes(Op2)) && + if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { - SDValue Y = isAllOnes(Op2) ? Op1 : Op2; + SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; SDValue CmpOp0 = Cmp.getOperand(0); // Apply further optimizations for special cases // (select (x != 0), -1, 0) -> neg & sbb // (select (x == 0), 0, -1) -> neg & sbb - if (ConstantSDNode *YC = dyn_cast<ConstantSDNode>(Y)) - if (YC->isNullValue() && - (isAllOnes(Op1) == (CondCode == X86::COND_NE))) { + if (isNullConstant(Y) && + (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) { SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, DAG.getConstant(0, DL, @@ -14061,11 +15084,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp); - if (isAllOnes(Op1) != (CondCode == X86::COND_E)) + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E)) Res = DAG.getNOT(DL, Res, Res.getValueType()); - ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); - if (!N2C || !N2C->isNullValue()) + if (!isNullConstant(Op2)) Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y); return Res; } @@ -14073,11 +15095,9 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // Look past (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); - if (C && C->getAPIntValue() == 1) - Cond = Cond.getOperand(0); - } + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. @@ -14136,15 +15156,14 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } if (addTest) { - // Look pass the truncate if the high bits are known zero. + // Look past the truncate if the high bits are known zero. if (isTruncWithZeroHighBitsInput(Cond, DAG)) - Cond = Cond.getOperand(0); + Cond = Cond.getOperand(0); // We know the result of AND is compared against zero. Try to match // it to BT. 
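What the "neg & sbb" special case noted above computes, reduced to one scalar value as a sketch rather than the emitted machine code: NEG sets the carry flag exactly when its operand is non-zero, and SBB of a register with itself spreads that single flag into 0 or all-ones without a branch.

#include <cstdint>

static uint32_t allOnesIfNonZero(uint32_t X) {
  uint32_t Carry = (X != 0);   // CF after NEG X
  return 0u - Carry;           // SBB r, r  ==  -(CF)  ==  0 or 0xFFFFFFFF
}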
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; @@ -14166,11 +15185,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && - (isAllOnes(Op1) || isAllOnes(Op2)) && (isZero(Op1) || isZero(Op2))) { + (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && + (isNullConstant(Op1) || isNullConstant(Op2))) { SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), DAG.getConstant(X86::COND_B, DL, MVT::i8), Cond); - if (isAllOnes(Op1) != (CondCode == X86::COND_B)) + if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) return DAG.getNOT(DL, Res, Res.getValueType()); return Res; } @@ -14256,8 +15276,8 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, MVT InVT = In.getSimpleValueType(); assert(VT.getSizeInBits() == InVT.getSizeInBits()); - MVT InSVT = InVT.getScalarType(); - assert(VT.getScalarType().getScalarSizeInBits() > InSVT.getScalarSizeInBits()); + MVT InSVT = InVT.getVectorElementType(); + assert(VT.getVectorElementType().getSizeInBits() > InSVT.getSizeInBits()); if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) return SDValue(); @@ -14276,7 +15296,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, // As SRAI is only available on i16/i32 types, we expand only up to i32 // and handle i64 separately. - while (CurrVT != VT && CurrVT.getScalarType() != MVT::i32) { + while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) { Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr); MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2); CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2); @@ -14286,7 +15306,7 @@ static SDValue LowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SDValue SignExt = Curr; if (CurrVT != InVT) { unsigned SignExtShift = - CurrVT.getScalarSizeInBits() - InSVT.getScalarSizeInBits(); + CurrVT.getVectorElementType().getSizeInBits() - InSVT.getSizeInBits(); SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr, DAG.getConstant(SignExtShift, dl, MVT::i8)); } @@ -14346,7 +15366,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget, SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]); - MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), + MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), VT.getVectorNumElements()/2); OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo); @@ -14470,7 +15490,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, // memory. In practice, we ''widen'' MemVT. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), - loadRegZize / MemVT.getScalarType().getSizeInBits()); + loadRegZize / MemVT.getScalarSizeInBits()); assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() && "Invalid vector type"); @@ -14518,29 +15538,12 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget, return Sext; } - // Otherwise we'll shuffle the small elements in the high bits of the - // larger type and perform an arithmetic shift. If the shift is not legal - // it's better to scalarize. 
- assert(TLI.isOperationLegalOrCustom(ISD::SRA, RegVT) && - "We can't implement a sext load without an arithmetic right shift!"); - - // Redistribute the loaded elements into the different locations. - SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); - for (unsigned i = 0; i != NumElems; ++i) - ShuffleVec[i * SizeRatio + SizeRatio - 1] = i; - - SDValue Shuff = DAG.getVectorShuffle( - WideVecVT, dl, SlicedVec, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); - - Shuff = DAG.getBitcast(RegVT, Shuff); - - // Build the arithmetic shift. - unsigned Amt = RegVT.getVectorElementType().getSizeInBits() - - MemVT.getVectorElementType().getSizeInBits(); - Shuff = - DAG.getNode(ISD::SRA, dl, RegVT, Shuff, - DAG.getConstant(Amt, dl, RegVT)); + // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest + // lanes. + assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) && + "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!"); + SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); return Shuff; } @@ -14577,11 +15580,9 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { static bool isXor1OfSetCC(SDValue Op) { if (Op.getOpcode() != ISD::XOR) return false; - ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); - if (N1C && N1C->getAPIntValue() == 1) { + if (isOneConstant(Op.getOperand(1))) return Op.getOperand(0).getOpcode() == X86ISD::SETCC && - Op.getOperand(0).hasOneUse(); - } + Op.getOperand(0).hasOneUse(); return false; } @@ -14597,8 +15598,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (Cond.getOpcode() == ISD::SETCC) { // Check for setcc([su]{add,sub,mul}o == 0). if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && - isa<ConstantSDNode>(Cond.getOperand(1)) && - cast<ConstantSDNode>(Cond.getOperand(1))->isNullValue() && + isNullConstant(Cond.getOperand(1)) && Cond.getOperand(0).getResNo() == 1 && (Cond.getOperand(0).getOpcode() == ISD::SADDO || Cond.getOperand(0).getOpcode() == ISD::UADDO || @@ -14625,11 +15625,9 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // Look pass (and (setcc_carry (cmp ...)), 1). if (Cond.getOpcode() == ISD::AND && - Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); - if (C && C->getAPIntValue() == 1) - Cond = Cond.getOperand(0); - } + Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && + isOneConstant(Cond.getOperand(1))) + Cond = Cond.getOperand(0); // If condition flag is set by a X86ISD::CMP, then use it as the condition // setting operand in place of the X86ISD::SETCC. 
@@ -14673,16 +15671,14 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { switch (CondOpcode) { case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break; case ISD::SADDO: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { X86Opcode = X86ISD::INC; X86Cond = X86::COND_O; break; } X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break; case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break; case ISD::SSUBO: - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O; break; } @@ -14844,8 +15840,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { // We know the result of AND is compared against zero. Try to match // it to BT. if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); - if (NewSetCC.getNode()) { + if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; @@ -14877,54 +15872,40 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SplitStack; SDLoc dl(Op); + // Get the inputs. + SDNode *Node = Op.getNode(); + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); + EVT VT = Node->getValueType(0); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + + bool Is64Bit = Subtarget->is64Bit(); + MVT SPTy = getPointerTy(DAG.getDataLayout()); + + SDValue Result; if (!Lower) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDNode* Node = Op.getNode(); - unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore(); assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" - " not tell us which reg is the stack pointer!"); + " not tell us which reg is the stack pointer!"); EVT VT = Node->getValueType(0); - SDValue Tmp1 = SDValue(Node, 0); - SDValue Tmp2 = SDValue(Node, 1); SDValue Tmp3 = Node->getOperand(2); - SDValue Chain = Tmp1.getOperand(0); - - // Chain the dynamic stack allocation so that it doesn't modify the stack - // pointer when other instructions are using the stack. - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), - SDLoc(Node)); - SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlign = TFI.getStackAlignment(); - Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value + Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value if (Align > StackAlign) - Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, - DAG.getConstant(-(uint64_t)Align, dl, VT)); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain - - Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), - DAG.getIntPtrConstant(0, dl, true), SDValue(), - SDLoc(Node)); - - SDValue Ops[2] = { Tmp1, Tmp2 }; - return DAG.getMergeValues(Ops, dl); - } - - // Get the inputs. 
- SDValue Chain = Op.getOperand(0); - SDValue Size = Op.getOperand(1); - unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); - EVT VT = Op.getNode()->getValueType(0); - - bool Is64Bit = Subtarget->is64Bit(); - MVT SPTy = getPointerTy(DAG.getDataLayout()); - - if (SplitStack) { + Result = DAG.getNode(ISD::AND, dl, VT, Result, + DAG.getConstant(-(uint64_t)Align, dl, VT)); + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain + } else if (SplitStack) { MachineRegisterInfo &MRI = MF.getRegInfo(); if (Is64Bit) { @@ -14942,10 +15923,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); - SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, + Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, DAG.getRegister(Vreg, SPTy)); - SDValue Ops1[2] = { Value, Chain }; - return DAG.getMergeValues(Ops1, dl); } else { SDValue Flag; const unsigned Reg = (Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX); @@ -14967,9 +15946,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); } - SDValue Ops1[2] = { SP, Chain }; - return DAG.getMergeValues(Ops1, dl); + Result = SP; } + + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + SDValue Ops[2] = {Result, Chain}; + return DAG.getMergeValues(Ops, dl); } SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { @@ -14980,7 +15964,8 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); SDLoc DL(Op); - if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { + if (!Subtarget->is64Bit() || + Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); @@ -15019,10 +16004,11 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MemOps.push_back(Store); // Store ptr to reg_save_area. - FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(8, DL)); + FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( + Subtarget->isTarget64BitLP64() ? 8 : 4, DL)); SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); - Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, - MachinePointerInfo(SV, 16), false, false, 0); + Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, MachinePointerInfo( + SV, Subtarget->isTarget64BitLP64() ? 16 : 12), false, false, 0); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } @@ -15030,10 +16016,13 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); - assert((Subtarget->isTargetLinux() || - Subtarget->isTargetDarwin()) && - "Unhandled target in LowerVAARG"); assert(Op.getNode()->getNumOperands() == 4); + + MachineFunction &MF = DAG.getMachineFunction(); + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + // The Win64 ABI uses char* instead of a structure. 
+ return DAG.expandVAArg(Op.getNode()); + SDValue Chain = Op.getOperand(0); SDValue SrcPtr = Op.getOperand(1); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); @@ -15061,8 +16050,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. assert(!Subtarget->useSoftFloat() && - !(DAG.getMachineFunction().getFunction()->hasFnAttribute( - Attribute::NoImplicitFloat)) && + !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget->hasSSE1()); } @@ -15091,8 +16079,14 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - // X86-64 va_list is a struct { i32, i32, i8*, i8* }. + // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, + // where a va_list is still an i8*. assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); + if (Subtarget->isCallingConvWin64( + DAG.getMachineFunction().getFunction()->getCallingConv())) + // Probably a Win64 va_copy. + return DAG.expandVACopy(Op.getNode()); + SDValue Chain = Op.getOperand(0); SDValue DstPtr = Op.getOperand(1); SDValue SrcPtr = Op.getOperand(2); @@ -15230,72 +16224,126 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT, // The return type has to be a 128-bit type with the same element // type as the input type. MVT EltVT = VT.getVectorElementType(); - EVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); + MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits()); ShAmt = DAG.getBitcast(ShVT, ShAmt); return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } -/// \brief Return (and \p Op, \p Mask) for compare instructions or -/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the -/// necessary casting for \p Mask when lowering masking intrinsics. -static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, - SDValue PreservedSrc, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { - EVT VT = Op.getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), - MVT::i1, VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDLoc dl(Op); +/// \brief Return Mask with the necessary casting or extending +/// for \p Mask according to \p MaskVT when lowering masking intrinsics +static SDValue getMaskNode(SDValue Mask, MVT MaskVT, + const X86Subtarget *Subtarget, + SelectionDAG &DAG, SDLoc dl) { - assert(MaskVT.isSimple() && "invalid mask type"); + if (MaskVT.bitsGT(Mask.getSimpleValueType())) { + // Mask should be extended + Mask = DAG.getNode(ISD::ANY_EXTEND, dl, + MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask); + } - if (isAllOnes(Mask)) - return Op; + if (Mask.getSimpleValueType() == MVT::i64 && Subtarget->is32Bit()) { + if (MaskVT == MVT::v64i1) { + assert(Subtarget->hasBWI() && "Expected AVX512BW target!"); + // In case 32bit mode, bitcast i64 is illegal, extend/split it. + SDValue Lo, Hi; + Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(0, dl, MVT::i32)); + Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask, + DAG.getConstant(1, dl, MVT::i32)); + + Lo = DAG.getBitcast(MVT::v32i1, Lo); + Hi = DAG.getBitcast(MVT::v32i1, Hi); + return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); + } else { + // MaskVT require < 64bit. Truncate mask (should succeed in any case), + // and bitcast. 
+ MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits()); + return DAG.getBitcast(MaskVT, + DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask)); + } + + } else { + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); + } +} - switch (Op.getOpcode()) { - default: break; - case X86ISD::PCMPEQM: - case X86ISD::PCMPGTM: - case X86ISD::CMPM: - case X86ISD::CMPMU: - return DAG.getNode(ISD::AND, dl, VT, Op, VMask); - } - if (PreservedSrc.getOpcode() == ISD::UNDEF) - PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc); +/// \brief Return (and \p Op, \p Mask) for compare instructions or +/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the +/// necessary casting or extending for \p Mask when lowering masking intrinsics +static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, + SDValue PreservedSrc, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + unsigned OpcodeSelect = ISD::VSELECT; + SDLoc dl(Op); + + if (isAllOnesConstant(Mask)) + return Op; + + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + + switch (Op.getOpcode()) { + default: break; + case X86ISD::PCMPEQM: + case X86ISD::PCMPGTM: + case X86ISD::CMPM: + case X86ISD::CMPMU: + return DAG.getNode(ISD::AND, dl, VT, Op, VMask); + case X86ISD::VFPCLASS: + case X86ISD::VFPCLASSS: + return DAG.getNode(ISD::OR, dl, VT, Op, VMask); + case X86ISD::VTRUNC: + case X86ISD::VTRUNCS: + case X86ISD::VTRUNCUS: + // We can't use ISD::VSELECT here because it is not always "Legal" + // for the destination type. For example vpmovqb require only AVX512 + // and vselect that can operate on byte element type require BWI + OpcodeSelect = X86ISD::SELECT; + break; + } + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); } /// \brief Creates an SDNode for a predicated scalar operation. /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). -/// The mask is comming as MVT::i8 and it should be truncated +/// The mask is coming as MVT::i8 and it should be truncated /// to MVT::i1 while lowering masking intrinsics. /// The main difference between ScalarMaskingNode and VectorMaskingNode is using -/// "X86select" instead of "vselect". We just can't create the "vselect" node for -/// a scalar instruction. +/// "X86select" instead of "vselect". We just can't create the "vselect" node +/// for a scalar instruction. 
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - if (isAllOnes(Mask)) - return Op; + if (isAllOnesConstant(Mask)) + return Op; - EVT VT = Op.getValueType(); - SDLoc dl(Op); - // The mask should be of type MVT::i1 - SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); + MVT VT = Op.getSimpleValueType(); + SDLoc dl(Op); + // The mask should be of type MVT::i1 + SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask); - if (PreservedSrc.getOpcode() == ISD::UNDEF) - PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); + if (Op.getOpcode() == X86ISD::FSETCC) + return DAG.getNode(ISD::AND, dl, VT, Op, IMask); + if (Op.getOpcode() == X86ISD::VFPCLASS || + Op.getOpcode() == X86ISD::VFPCLASSS) + return DAG.getNode(ISD::OR, dl, VT, Op, IMask); + + if (PreservedSrc.getOpcode() == ISD::UNDEF) + PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc); } static int getSEHRegistrationNodeSize(const Function *Fn) { @@ -15309,15 +16357,16 @@ static int getSEHRegistrationNodeSize(const Function *Fn) { case EHPersonality::MSVC_CXX: return 16; default: break; } - report_fatal_error("can only recover FP for MSVC EH personality functions"); + report_fatal_error( + "can only recover FP for 32-bit MSVC EH personality functions"); } -/// When the 32-bit MSVC runtime transfers control to us, either to an outlined +/// When the MSVC runtime transfers control to us, either to an outlined /// function or when returning to a parent frame after catching an exception, we /// recover the parent frame pointer by doing arithmetic on the incoming EBP. /// Here's the math: /// RegNodeBase = EntryEBP - RegNodeSize -/// ParentFP = RegNodeBase - RegNodeFrameOffset +/// ParentFP = RegNodeBase - ParentFrameOffset /// Subtracting RegNodeSize takes us to the offset of the registration node, and /// subtracting the offset (negative on x86) takes us back to the parent FP. static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, @@ -15334,29 +16383,35 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, if (!Fn->hasPersonalityFn()) return EntryEBP; - int RegNodeSize = getSEHRegistrationNodeSize(Fn); - // Get an MCSymbol that will ultimately resolve to the frame offset of the EH - // registration. + // registration, or the .set_setframe offset. MCSymbol *OffsetSym = MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( GlobalValue::getRealLinkageName(Fn->getName())); SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); - SDValue RegNodeFrameOffset = + SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); + // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after + // prologue to RBP in the parent function. 
+ const X86Subtarget &Subtarget = + static_cast<const X86Subtarget &>(DAG.getSubtarget()); + if (Subtarget.is64Bit()) + return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); + + int RegNodeSize = getSEHRegistrationNodeSize(Fn); // RegNodeBase = EntryEBP - RegNodeSize - // ParentFP = RegNodeBase - RegNodeFrameOffset + // ParentFP = RegNodeBase - ParentFrameOffset SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, DAG.getConstant(RegNodeSize, dl, PtrVT)); - return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, RegNodeFrameOffset); + return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); } static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); if (IntrData) { switch(IntrData->Type) { @@ -15365,6 +16420,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget case INTR_TYPE_2OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case INTR_TYPE_2OP_IMM8: + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), + DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(2))); case INTR_TYPE_3OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -15376,28 +16434,53 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue RoundingMode; + // We always add a rounding mode to the Node. + // If the rounding mode is not specified, we add the + // "current direction" mode. if (Op.getNumOperands() == 4) - RoundingMode = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + RoundingMode = + DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); else RoundingMode = Op.getOperand(4); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - unsigned Round = cast<ConstantSDNode>(RoundingMode)->getZExtValue(); - if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) + if (IntrWithRoundingModeOpcode != 0) + if (cast<ConstantSDNode>(RoundingMode)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), Src, RoundingMode), Mask, PassThru, Subtarget, DAG); - } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src, RoundingMode), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_1OP_MASK: { SDValue Src = Op.getOperand(1); - SDValue Passthru = Op.getOperand(2); + SDValue PassThru = Op.getOperand(2); SDValue Mask = Op.getOperand(3); + // We add rounding mode to the Node when + // - RM Opcode is specified and + // - RM is not "current direction".
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(4); + unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue(); + if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) { + return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, Op.getValueType(), + Src, Rnd), + Mask, PassThru, Subtarget, DAG); + } + } return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), - Mask, Passthru, Subtarget, DAG); + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_SCALAR_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue passThru = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), + Mask, passThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); @@ -15405,7 +16488,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src0 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); // There are 2 kinds of intrinsics in this group: - // (1) With supress-all-exceptions (sae) or rounding mode- 6 operands + // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands // (2) With rounding mode and sae - 7 operands. if (Op.getNumOperands() == 6) { SDValue Sae = Op.getOperand(5); @@ -15421,11 +16504,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget RoundingMode, Sae), Mask, Src0, Subtarget, DAG); } - case INTR_TYPE_2OP_MASK: { + case INTR_TYPE_2OP_MASK: + case INTR_TYPE_2OP_IMM8_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); + + if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK) + Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2); + // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. @@ -15440,8 +16528,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Mask, PassThru, Subtarget, DAG); } } - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - Src1,Src2), + // TODO: Intrinsics should have fast-math-flags to propagate. + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_2OP_MASK_RM: { @@ -15449,7 +16537,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src2 = Op.getOperand(2); SDValue PassThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - // We specify 2 possible modes for intrinsics, with/without rounding modes. + // We specify 2 possible modes for intrinsics, with/without rounding + // modes. // First, we check if the intrinsic have rounding mode (6 operands), // if not, we set rounding mode to "current". 
SDValue Rnd; @@ -15461,12 +16550,56 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Src1, Src2, Rnd), Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_3OP_MASK: { + case INTR_TYPE_3OP_SCALAR_MASK_RM: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); + SDValue Sae = Op.getOperand(6); + + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, + Src2, Src3, Sae), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_MASK_RM: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Imm = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + // We specify 2 possible modes for intrinsics, with/without rounding + // modes. + // First, we check if the intrinsic have rounding mode (7 operands), + // if not, we set rounding mode to "current". + SDValue Rnd; + if (Op.getNumOperands() == 7) + Rnd = Op.getOperand(6); + else + Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Imm, Rnd), + Mask, PassThru, Subtarget, DAG); + } + case INTR_TYPE_3OP_IMM8_MASK: + case INTR_TYPE_3OP_MASK: + case INSERT_SUBVEC: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue PassThru = Op.getOperand(4); + SDValue Mask = Op.getOperand(5); + + if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK) + Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3); + else if (IntrData->Type == INSERT_SUBVEC) { + // imm should be adapted to ISD::INSERT_SUBVECTOR behavior + assert(isa<ConstantSDNode>(Src3) && "Expected a ConstantSDNode here!"); + unsigned Imm = cast<ConstantSDNode>(Src3)->getZExtValue(); + Imm *= Src2.getSimpleValueType().getVectorNumElements(); + Src3 = DAG.getTargetConstant(Imm, dl, MVT::i32); + } + // We specify 2 possible opcodes for intrinsics with rounding modes. // First, we check if the intrinsic may have non-default rounding mode, // (IntrData->Opc1 != 0), then we check the rounding mode operand. 
@@ -15486,7 +16619,27 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Mask, PassThru, Subtarget, DAG); } case VPERM_3OP_MASKZ: - case VPERM_3OP_MASK: + case VPERM_3OP_MASK:{ + // Src2 is the PassThru + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = SDValue(); + + // set PassThru element + if (IntrData->Type == VPERM_3OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + else + PassThru = DAG.getBitcast(VT, Src2); + + // Swap Src1 and Src2 in the node creation + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Src2, Src1, Src3), + Mask, PassThru, Subtarget, DAG); + } case FMA_OP_MASK3: case FMA_OP_MASKZ: case FMA_OP_MASK: { @@ -15494,11 +16647,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); SDValue PassThru = SDValue(); // set PassThru element - if (IntrData->Type == VPERM_3OP_MASKZ || IntrData->Type == FMA_OP_MASKZ) + if (IntrData->Type == FMA_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); else if (IntrData->Type == FMA_OP_MASK3) PassThru = Src3; @@ -15523,6 +16676,50 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget Src1, Src2, Src3), Mask, PassThru, Subtarget, DAG); } + case TERLOG_OP_MASK: + case TERLOG_OP_MASKZ: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4)); + SDValue Mask = Op.getOperand(5); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = Src1; + // Set PassThru element. + if (IntrData->Type == TERLOG_OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + Src1, Src2, Src3, Src4), + Mask, PassThru, Subtarget, DAG); + } + case FPCLASS: { + // FPclass intrinsics with mask + SDValue Src1 = Op.getOperand(1); + MVT VT = Src1.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue Imm = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm); + SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, + DAG.getTargetConstant(0, dl, MaskVT), + Subtarget, DAG); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), FPclassMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case FPCLASSS: { + SDValue Src1 = Op.getOperand(1); + SDValue Imm = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm); + SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, + DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); + return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, FPclassMask); + } case CMP_MASK: case CMP_MASK_CC: { // Comparison intrinsics with masks. 
@@ -15534,12 +16731,11 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget // (v2i1 (and (PCMPEQM %a, %b), // (extract_subvector // (v8i1 (bitcast %mask)), 0))), 0)))) - EVT VT = Op.getOperand(1).getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); + MVT VT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); SDValue Cmp; if (IntrData->Type == CMP_MASK_CC) { SDValue CC = Op.getOperand(3); @@ -15573,6 +16769,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } + case CMP_MASK_SCALAR_CC: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3)); + SDValue Mask = Op.getOperand(4); + + SDValue Cmp; + if (IntrData->Opc1 != 0) { + SDValue Rnd = Op.getOperand(5); + if (cast<ConstantSDNode>(Rnd)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd); + } + //default rounding mode + if(!Cmp.getNode()) + Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC); + + SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, + DAG.getTargetConstant(0, dl, + MVT::i1), + Subtarget, DAG); + + return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i8, + DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, CmpMask), + DAG.getValueType(MVT::i1)); + } case COMI: { // Comparison intrinsics ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; SDValue LHS = Op.getOperand(1); @@ -15584,6 +16806,24 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget DAG.getConstant(X86CC, dl, MVT::i8), Cond); return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); } + case COMI_RM: { // Comparison intrinsics with Sae + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + SDValue CC = Op.getOperand(3); + SDValue Sae = Op.getOperand(4); + auto ComiType = TranslateX86ConstCondToX86CC(CC); + // choose between ordered and unordered (comi/ucomi) + unsigned comiOp = std::get<0>(ComiType) ? 
IntrData->Opc0 : IntrData->Opc1; + SDValue Cond; + if (cast<ConstantSDNode>(Sae)->getZExtValue() != + X86::STATIC_ROUNDING::CUR_DIRECTION) + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS, Sae); + else + Cond = DAG.getNode(comiOp, dl, MVT::i32, LHS, RHS); + SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(std::get<1>(ComiType), dl, MVT::i8), Cond); + return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); + } case VSHIFT: return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), Op.getOperand(1), Op.getOperand(2), DAG); @@ -15598,27 +16838,75 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget SDValue Mask = Op.getOperand(3); SDValue DataToCompress = Op.getOperand(1); SDValue PassThru = Op.getOperand(2); - if (isAllOnes(Mask)) // return data as is + if (isAllOnesConstant(Mask)) // return data as is return Op.getOperand(1); return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress), Mask, PassThru, Subtarget, DAG); } + case BROADCASTM: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + Mask = DAG.getBitcast(MaskVT, Mask); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); + } case BLEND: { SDValue Mask = Op.getOperand(3); - EVT VT = Op.getValueType(); - EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - VT.getVectorNumElements()); - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); - SDLoc dl(Op); - SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, - DAG.getBitcast(BitcastVT, Mask), - DAG.getIntPtrConstant(0, dl)); + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); return DAG.getNode(IntrData->Opc0, dl, VT, VMask, Op.getOperand(1), Op.getOperand(2)); } + case KUNPCK: { + MVT VT = Op.getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); + + SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); + SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); + // Arguments should be swapped. 
+ SDValue Res = DAG.getNode(IntrData->Opc0, dl, + MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), + Src2, Src1); + return DAG.getBitcast(VT, Res); + } + case CONVERT_TO_MASK: { + MVT SrcVT = Op.getOperand(1).getSimpleValueType(); + MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); + + SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT, + Op.getOperand(1)); + SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, + DAG.getUNDEF(BitcastVT), CvtMask, + DAG.getIntPtrConstant(0, dl)); + return DAG.getBitcast(Op.getValueType(), Res); + } + case CONVERT_MASK_TO_VEC: { + SDValue Mask = Op.getOperand(1); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + return DAG.getNode(IntrData->Opc0, dl, VT, VMask); + } + case BRCST_SUBVEC_TO_VEC: { + SDValue Src = Op.getOperand(1); + SDValue Passthru = Op.getOperand(2); + SDValue Mask = Op.getOperand(3); + EVT resVT = Passthru.getValueType(); + SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT, + DAG.getUNDEF(resVT), Src, + DAG.getIntPtrConstant(0, dl)); + SDValue immVal; + if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector()) + immVal = DAG.getConstant(0x44, dl, MVT::i8); + else + immVal = DAG.getConstant(0, dl, MVT::i8); + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, + subVec, subVec, immVal), + Mask, Passthru, Subtarget, DAG); + } default: break; } @@ -15832,23 +17120,17 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget * Subtarget) { SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - if (!C) - llvm_unreachable("Invalid scale type"); - unsigned ScaleVal = C->getZExtValue(); - if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8) - llvm_unreachable("Valid scale values are 1, 2, 4, 8"); - + auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); - EVT MaskVT = MVT::getVectorVT(MVT::i1, + MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else { - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. 
@@ -15860,7 +17142,7 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); if (Src.getOpcode() == ISD::UNDEF) - Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); + Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain}; SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops); SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; @@ -15871,25 +17153,19 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - if (!C) - llvm_unreachable("Invalid scale type"); - unsigned ScaleVal = C->getZExtValue(); - if (ScaleVal > 2 && ScaleVal != 4 && ScaleVal != 8) - llvm_unreachable("Valid scale values are 1, 2, 4, 8"); - + auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - EVT MaskVT = MVT::getVectorVT(MVT::i1, + MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); if (MaskC) MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), dl, MaskVT); else { - EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - Mask.getValueType().getSizeInBits()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements // are extracted by EXTRACT_SUBVECTOR. @@ -15907,12 +17183,11 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain) { SDLoc dl(Op); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp); - assert(C && "Invalid scale type"); + auto *C = cast<ConstantSDNode>(ScaleOp); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); - EVT MaskVT = + MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); SDValue MaskInReg; ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask); @@ -16034,64 +17309,59 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Results, DL); } -static SDValue LowerSEHRESTOREFRAME(SDValue Op, const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); - const Function *Fn = MF.getFunction(); - SDLoc dl(Op); SDValue Chain = Op.getOperand(0); + SDValue RegNode = Op.getOperand(2); + WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); + if (!EHInfo) + report_fatal_error("EH registrations only live in functions using WinEH"); + + // Cast the operand to an alloca, and remember the frame index. + auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode); + if (!FINode) + report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca"); + EHInfo->EHRegNodeFrameIndex = FINode->getIndex(); + + // Return the chain operand without making any DAG nodes. 
+ return Chain; +} - assert(Subtarget->getFrameLowering()->hasFP(MF) && - "using llvm.x86.seh.restoreframe requires a frame pointer"); - - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - MVT VT = TLI.getPointerTy(DAG.getDataLayout()); - - const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); - unsigned FrameReg = - RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); - unsigned SPReg = RegInfo->getStackRegister(); - unsigned SlotSize = RegInfo->getSlotSize(); +/// \brief Lower intrinsics for TRUNCATE_TO_MEM case +/// return truncate Store/MaskedStore Node +static SDValue LowerINTRINSIC_TRUNCATE_TO_MEM(const SDValue & Op, + SelectionDAG &DAG, + MVT ElementType) { + SDLoc dl(Op); + SDValue Mask = Op.getOperand(4); + SDValue DataToTruncate = Op.getOperand(3); + SDValue Addr = Op.getOperand(2); + SDValue Chain = Op.getOperand(0); - // Get incoming EBP. - SDValue IncomingEBP = - DAG.getCopyFromReg(Chain, dl, FrameReg, VT); + MVT VT = DataToTruncate.getSimpleValueType(); + MVT SVT = MVT::getVectorVT(ElementType, VT.getVectorNumElements()); - // SP is saved in the first field of every registration node, so load - // [EBP-RegNodeSize] into SP. - int RegNodeSize = getSEHRegistrationNodeSize(Fn); - SDValue SPAddr = DAG.getNode(ISD::ADD, dl, VT, IncomingEBP, - DAG.getConstant(-RegNodeSize, dl, VT)); - SDValue NewSP = - DAG.getLoad(VT, dl, Chain, SPAddr, MachinePointerInfo(), false, false, - false, VT.getScalarSizeInBits() / 8); - Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); - - if (!RegInfo->needsStackRealignment(MF)) { - // Adjust EBP to point back to the original frame position. - SDValue NewFP = recoverFramePointer(DAG, Fn, IncomingEBP); - Chain = DAG.getCopyToReg(Chain, dl, FrameReg, NewFP); - } else { - assert(RegInfo->hasBasePointer(MF) && - "functions with Win32 EH must use frame or base pointer register"); + if (isAllOnesConstant(Mask)) // return just a truncate store + return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, + MachinePointerInfo(), SVT, false, false, + SVT.getScalarSizeInBits()/8); - // Reload the base pointer (ESI) with the adjusted incoming EBP. - SDValue NewBP = recoverFramePointer(DAG, Fn, IncomingEBP); - Chain = DAG.getCopyToReg(Chain, dl, RegInfo->getBaseRegister(), NewBP); + MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); + MVT BitcastVT = MVT::getVectorVT(MVT::i1, + Mask.getSimpleValueType().getSizeInBits()); + // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements + // are extracted by EXTRACT_SUBVECTOR. + SDValue VMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, + DAG.getBitcast(BitcastVT, Mask), + DAG.getIntPtrConstant(0, dl)); - // Reload the spilled EBP value, now that the stack and base pointers are - // set up. - X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - X86FI->setHasSEHFramePtrSave(true); - int FI = MF.getFrameInfo()->CreateSpillStackObject(SlotSize, SlotSize); - X86FI->setSEHFramePtrSaveIndex(FI); - SDValue NewFP = DAG.getLoad(VT, dl, Chain, DAG.getFrameIndex(FI, VT), - MachinePointerInfo(), false, false, false, - VT.getScalarSizeInBits() / 8); - Chain = DAG.getCopyToReg(NewFP, dl, FrameReg, NewFP); - } + MachineMemOperand *MMO = DAG.getMachineFunction(). 
+ getMachineMemOperand(MachinePointerInfo(), + MachineMemOperand::MOStore, SVT.getStoreSize(), + SVT.getScalarSizeInBits()/8); - return Chain; + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, + VMask, SVT, MMO, true); } static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, @@ -16100,16 +17370,14 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo); if (!IntrData) { - if (IntNo == llvm::Intrinsic::x86_seh_restoreframe) - return LowerSEHRESTOREFRAME(Op, Subtarget, DAG); + if (IntNo == llvm::Intrinsic::x86_seh_ehregnode) + return MarkEHRegistrationNode(Op, DAG); return SDValue(); } SDLoc dl(Op); switch(IntrData->Type) { - default: - llvm_unreachable("Unknown Intrinsic Type"); - break; + default: llvm_unreachable("Unknown Intrinsic Type"); case RDSEED: case RDRAND: { // Emit the node with the right value type. @@ -16214,8 +17482,8 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); - EVT VT = DataToCompress.getValueType(); - if (isAllOnes(Mask)) // return just a store + MVT VT = DataToCompress.getSimpleValueType(); + if (isAllOnesConstant(Mask)) // return just a store return DAG.getStore(Chain, dl, DataToCompress, Addr, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); @@ -16227,15 +17495,21 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget, MachinePointerInfo(), false, false, VT.getScalarSizeInBits()/8); } + case TRUNCATE_TO_MEM_VI8: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i8); + case TRUNCATE_TO_MEM_VI16: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i16); + case TRUNCATE_TO_MEM_VI32: + return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32); case EXPAND_FROM_MEM: { SDLoc dl(Op); SDValue Mask = Op.getOperand(4); SDValue PassThru = Op.getOperand(3); SDValue Addr = Op.getOperand(2); SDValue Chain = Op.getOperand(0); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); - if (isAllOnes(Mask)) // return just a load + if (isAllOnesConstant(Mask)) // return just a load return DAG.getLoad(VT, dl, Chain, Addr, MachinePointerInfo(), false, false, false, VT.getScalarSizeInBits()/8); @@ -16359,6 +17633,21 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); } +unsigned X86TargetLowering::getExceptionPointerRegister( + const Constant *PersonalityFn) const { + if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) + return Subtarget->isTarget64BitLP64() ? X86::RDX : X86::EDX; + + return Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX; +} + +unsigned X86TargetLowering::getExceptionSelectorRegister( + const Constant *PersonalityFn) const { + // Funclet personalities don't use selectors (the runtime does the selection). + assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))); + return Subtarget->isTarget64BitLP64() ? 
X86::RDX : X86::EDX; +} + SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Offset = Op.getOperand(1); @@ -16497,9 +17786,11 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) - if (Attrs.hasAttribute(Idx, Attribute::InReg)) + if (Attrs.hasAttribute(Idx, Attribute::InReg)) { + auto &DL = DAG.getDataLayout(); // FIXME: should only count parameters that are lowered to integers. - InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; + InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; + } if (InRegCount > 2) { report_fatal_error("Nest register in use - reduce number of inreg" @@ -16588,8 +17879,8 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), - MachineMemOperand::MOStore, 2, 2); + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI), + MachineMemOperand::MOStore, 2, 2); SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, @@ -16623,12 +17914,75 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } -static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { +/// \brief Lower a vector CTLZ using a natively supported vector CTLZ instruction. +// +// 1. i32/i64 128/256-bit vectors (native support requires VLX) are expanded +// to a 512-bit vector. +// 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction +// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, +// split the vector, perform the operation on its Lo and Hi parts and +// concatenate the results. +static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); - EVT OpVT = VT; + MVT EltVT = VT.getVectorElementType(); + unsigned NumElems = VT.getVectorNumElements(); + + if (EltVT == MVT::i64 || EltVT == MVT::i32) { + // Extend to a 512-bit vector. + assert((VT.is256BitVector() || VT.is128BitVector()) && + "Unsupported value type for operation"); + + MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits()); + SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, + DAG.getUNDEF(NewVT), + Op.getOperand(0), + DAG.getIntPtrConstant(0, dl)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512); + + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode, + DAG.getIntPtrConstant(0, dl)); + } + + assert((EltVT == MVT::i8 || EltVT == MVT::i16) && + "Unsupported element type"); + + if (16 < NumElems) { + // Split the vector; its Lo and Hi parts will be handled in the next iteration. + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl); + MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2); + + Lo = DAG.getNode(Op.getOpcode(), dl, OutVT, Lo); + Hi = DAG.getNode(Op.getOpcode(), dl, OutVT, Hi); + + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); + } + + MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); + + assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && + "Unsupported value type for operation"); + + // Use the natively supported vector instruction vplzcntd.
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); + SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op); + SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode); + SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT); + + return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); +} + +static SDValue LowerCTLZ(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + MVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); SDLoc dl(Op); + if (VT.isVector() && Subtarget->hasAVX512()) + return LowerVectorCTLZ_AVX512(Op, DAG); + Op = Op.getOperand(0); if (VT == MVT::i8) { // Zero extend to i32 since there is not an i8 bsr. @@ -16658,7 +18012,8 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { return Op; } -static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { +static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -16686,13 +18041,39 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); - unsigned NumBits = VT.getSizeInBits(); + unsigned NumBits = VT.getScalarSizeInBits(); SDLoc dl(Op); - Op = Op.getOperand(0); + + if (VT.isVector()) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + SDValue N0 = Op.getOperand(0); + SDValue Zero = DAG.getConstant(0, dl, VT); + + // lsb(x) = (x & -x) + SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0, + DAG.getNode(ISD::SUB, dl, VT, Zero, N0)); + + // cttz_undef(x) = (width - 1) - ctlz(lsb) + if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF && + TLI.isOperationLegal(ISD::CTLZ, VT)) { + SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT); + return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne, + DAG.getNode(ISD::CTLZ, dl, VT, LSB)); + } + + // cttz(x) = ctpop(lsb - 1) + SDValue One = DAG.getConstant(1, dl, VT); + return DAG.getNode(ISD::CTPOP, dl, VT, + DAG.getNode(ISD::SUB, dl, VT, LSB, One)); + } + + assert(Op.getOpcode() == ISD::CTTZ && + "Only scalar CTTZ requires custom lowering"); // Issue a bsf (scan bits forward) which also sets EFLAGS. SDVTList VTs = DAG.getVTList(VT, MVT::i32); - Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); + Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0)); // If src is zero (i.e. bsf sets ZF), returns NumBits. SDValue Ops[] = { @@ -16753,6 +18134,13 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { return Lower256IntArith(Op, DAG); } +static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) { + assert(Op.getSimpleValueType().is256BitVector() && + Op.getSimpleValueType().isInteger() && + "Only handle AVX 256-bit vector integer operation"); + return Lower256IntArith(Op, DAG); +} + static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc dl(Op); @@ -16885,7 +18273,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, SDValue AhiBlo = Ahi; SDValue AloBhi = Bhi; // Bit cast to 32-bit vectors for MULUDQ - EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : + MVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : (VT == MVT::v4i64) ? 
MVT::v8i32 : MVT::v16i32; A = DAG.getBitcast(MulVT, A); B = DAG.getBitcast(MulVT, B); @@ -16962,7 +18350,7 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1); - EVT VT = Op0.getValueType(); + MVT VT = Op0.getSimpleValueType(); SDLoc dl(Op); assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) || @@ -17034,7 +18422,7 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget, return DAG.getMergeValues(Ops, dl); } -// Return true if the requred (according to Opcode) shift-imm form is natively +// Return true if the required (according to Opcode) shift-imm form is natively // supported by the Subtarget static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { @@ -17054,14 +18442,14 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget *Subtarget, } // The shift amount is a variable, but it is the same for all vector lanes. -// These instrcutions are defined together with shift-immediate. +// These instructions are defined together with shift-immediate. static bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { return SupportedVectorShiftWithImm(VT, Subtarget, Opcode); } -// Return true if the requred (according to Opcode) variable-shift form is +// Return true if the required (according to Opcode) variable-shift form is // natively supported by the Subtarget static bool SupportedVectorVarShift(MVT VT, const X86Subtarget *Subtarget, unsigned Opcode) { @@ -17133,27 +18521,37 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, // i64 SRA needs to be performed as partial shifts. if ((VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && - Op.getOpcode() == ISD::SRA) + Op.getOpcode() == ISD::SRA && !Subtarget->hasXOP()) return ArithmeticShiftRight64(ShiftAmt); - if (VT == MVT::v16i8 || (Subtarget->hasInt256() && VT == MVT::v32i8)) { + if (VT == MVT::v16i8 || + (Subtarget->hasInt256() && VT == MVT::v32i8) || + VT == MVT::v64i8) { unsigned NumElts = VT.getVectorNumElements(); MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); - if (Op.getOpcode() == ISD::SHL) { - // Simple i8 add case - if (ShiftAmt == 1) - return DAG.getNode(ISD::ADD, dl, VT, R, R); + // Simple i8 add case + if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) + return DAG.getNode(ISD::ADD, dl, VT, R, R); + + // ashr(R, 7) === cmp_slt(R, 0) + if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { + SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); + return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); + } + + // XOP can shift v16i8 directly instead of as shift v8i16 + mask. + if (VT == MVT::v16i8 && Subtarget->hasXOP()) + return SDValue(); + if (Op.getOpcode() == ISD::SHL) { // Make a large shift. SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R, ShiftAmt, DAG); SHL = DAG.getBitcast(VT, SHL); // Zero out the rightmost bits. - SmallVector<SDValue, 32> V( - NumElts, DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SHL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); + DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT)); } if (Op.getOpcode() == ISD::SRL) { // Make a large shift. 
@@ -17161,24 +18559,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, R, ShiftAmt, DAG); SRL = DAG.getBitcast(VT, SRL); // Zero out the leftmost bits. - SmallVector<SDValue, 32> V( - NumElts, DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, MVT::i8)); return DAG.getNode(ISD::AND, dl, VT, SRL, - DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V)); + DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT)); } if (Op.getOpcode() == ISD::SRA) { - if (ShiftAmt == 7) { - // R s>> 7 === R s< 0 - SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl); - return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); - } - - // R s>> a === ((R u>> a) ^ m) - m + // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); - SmallVector<SDValue, 32> V(NumElts, - DAG.getConstant(128 >> ShiftAmt, dl, - MVT::i8)); - SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V); + + SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); return Res; @@ -17189,35 +18577,51 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } // Special case in 32-bit mode, where i64 is expanded into high and low parts. - if (!Subtarget->is64Bit() && - (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) && - Amt.getOpcode() == ISD::BITCAST && - Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { + if (!Subtarget->is64Bit() && !Subtarget->hasXOP() && + (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64))) { + + // Peek through any splat that was introduced for i64 shift vectorization. + int SplatIndex = -1; + if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode())) + if (SVN->isSplat()) { + SplatIndex = SVN->getSplatIndex(); + Amt = Amt.getOperand(0); + assert(SplatIndex < (int)VT.getVectorNumElements() && + "Splat shuffle referencing second operand"); + } + + if (Amt.getOpcode() != ISD::BITCAST || + Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR) + return SDValue(); + Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / VT.getVectorNumElements(); unsigned RatioInLog2 = Log2_32_Ceil(Ratio); uint64_t ShiftAmt = 0; + unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio); for (unsigned i = 0; i != Ratio; ++i) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i)); + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp)); if (!C) return SDValue(); // 6 == Log2(64) ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2))); } - // Check remaining shift amounts. - for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { - uint64_t ShAmt = 0; - for (unsigned j = 0; j != Ratio; ++j) { - ConstantSDNode *C = - dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); - if (!C) + + // Check remaining shift amounts (if not a splat). 
+ if (SplatIndex < 0) { + for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) { + uint64_t ShAmt = 0; + for (unsigned j = 0; j != Ratio; ++j) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j)); + if (!C) + return SDValue(); + // 6 == Log2(64) + ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); + } + if (ShAmt != ShiftAmt) return SDValue(); - // 6 == Log2(64) - ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2))); } - if (ShAmt != ShiftAmt) - return SDValue(); } if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) @@ -17245,7 +18649,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) { SDValue BaseShAmt; - EVT EltVT = VT.getVectorElementType(); + MVT EltVT = VT.getVectorElementType(); if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) { // Check if this build_vector node is doing a splat. @@ -17262,7 +18666,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, unsigned SplatIdx = (unsigned)SVN->getSplatIndex(); SDValue InVec = Amt.getOperand(0); if (InVec.getOpcode() == ISD::BUILD_VECTOR) { - assert((SplatIdx < InVec.getValueType().getVectorNumElements()) && + assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) && "Unexpected shuffle index found!"); BaseShAmt = InVec.getOperand(SplatIdx); } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { @@ -17327,11 +18731,26 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return V; if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget)) - return V; + return V; if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode())) return Op; + // XOP has 128-bit variable logical/arithmetic shifts. + // +ve/-ve Amt = shift left/right. + if (Subtarget->hasXOP() && + (VT == MVT::v2i64 || VT == MVT::v4i32 || + VT == MVT::v8i16 || VT == MVT::v16i8)) { + if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) { + SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl); + Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); + } + if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) + return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); + if (Op.getOpcode() == ISD::SRA) + return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); + } + // 2i64 vector logical shifts can efficiently avoid scalarization - do the // shifts per-lane and then shuffle the partial results back together. if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) { @@ -17343,6 +18762,19 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); } + // i64 vector arithmetic shift can be emulated with the transform: + // M = lshr(SIGN_BIT, Amt) + // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) + if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget->hasInt256())) && + Op.getOpcode() == ISD::SRA) { + SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT); + SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); + R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); + R = DAG.getNode(ISD::XOR, dl, VT, R, M); + R = DAG.getNode(ISD::SUB, dl, VT, R, M); + return R; + } + // If possible, lower this packed shift into a vector multiply instead of // expanding it into a sequence of scalar shifts. // Do this only if the vector shift count is a constant build_vector. 
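The SRA-by-constant lowerings above (the i8 case with Mask = 128 >> ShiftAmt, and the i64 case further down with M = lshr(SIGN_BIT, Amt)) are both instances of the same sign-extension identity: after a logical shift the shifted-down sign bit sits under the mask, XOR clears it, and the subtraction propagates it into the vacated upper bits. A minimal scalar sketch, outside the patch, with an illustrative helper name:

#include <cassert>
#include <cstdint>

// ashr(R, Amt) == sub(xor(lshr(R, Amt), M), M)  with  M = lshr(SIGN_BIT, Amt).
uint64_t ashr_emulated(uint64_t R, unsigned Amt) {
  uint64_t M = 0x8000000000000000ULL >> Amt;
  return ((R >> Amt) ^ M) - M;
}

int main() {
  // The reference uses the compiler's signed shift, which is arithmetic on all
  // common targets (and guaranteed to be from C++20 onwards).
  for (uint64_t R : {0x8000000000000000ULL, 0xFFFFFFFF00000000ULL, 0x0123456789ABCDEFULL})
    for (unsigned Amt : {1u, 7u, 31u, 63u})
      assert(ashr_emulated(R, Amt) ==
             static_cast<uint64_t>(static_cast<int64_t>(R) >> Amt));
}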
@@ -17351,9 +18783,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, (Subtarget->hasInt256() && VT == MVT::v16i16)) && ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) { SmallVector<SDValue, 8> Elts; - EVT SVT = VT.getScalarType(); + MVT SVT = VT.getVectorElementType(); unsigned SVTBits = SVT.getSizeInBits(); - const APInt &One = APInt(SVTBits, 1); + APInt One(SVTBits, 1); unsigned NumElems = VT.getVectorNumElements(); for (unsigned i=0; i !=NumElems; ++i) { @@ -17364,7 +18796,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, } ConstantSDNode *ND = cast<ConstantSDNode>(Op); - const APInt &C = APInt(SVTBits, ND->getAPIntValue().getZExtValue()); + APInt C(SVTBits, ND->getAPIntValue().getZExtValue()); uint64_t ShAmt = C.getZExtValue(); if (ShAmt >= SVTBits) { Elts.push_back(DAG.getUNDEF(SVT)); @@ -17443,7 +18875,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2)) { // Replace this node with two shifts followed by a MOVSS/MOVSD. - EVT CastVT = MVT::v4i32; + MVT CastVT = MVT::v4i32; SDValue Splat1 = DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT); SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); @@ -17507,7 +18939,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); } - if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget->hasInt256())) { + if (VT == MVT::v16i8 || + (VT == MVT::v32i8 && Subtarget->hasInt256() && !Subtarget->hasXOP())) { MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); unsigned ShiftOpcode = Op->getOpcode(); @@ -17627,7 +19060,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt)); } - if (Subtarget->hasInt256() && VT == MVT::v16i16) { + if (Subtarget->hasInt256() && !Subtarget->hasXOP() && VT == MVT::v16i16) { MVT ExtVT = MVT::v8i32; SDValue Z = getZeroVector(VT, Subtarget, DAG, dl); SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z); @@ -17710,7 +19143,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, if (VT.is256BitVector()) { unsigned NumElems = VT.getVectorNumElements(); MVT EltVT = VT.getVectorElementType(); - EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); + MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2); // Extract the two vectors SDValue V1 = Extract128BitVector(R, 0, DAG, dl); @@ -17743,6 +19176,40 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget, return SDValue(); } +static SDValue LowerRotate(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + SDLoc DL(Op); + SDValue R = Op.getOperand(0); + SDValue Amt = Op.getOperand(1); + + assert(VT.isVector() && "Custom lowering only for vector rotates!"); + assert(Subtarget->hasXOP() && "XOP support required for vector rotates!"); + assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); + + // XOP has 128-bit vector variable + immediate rotates. + // +ve/-ve Amt = rotate left/right. + + // Split 256-bit integers. + if (VT.is256BitVector()) + return Lower256IntArith(Op, DAG); + + assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); + + // Attempt to rotate by immediate. 
+ if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { + if (auto *RotateConst = BVAmt->getConstantSplatNode()) { + uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); + assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); + return DAG.getNode(X86ISD::VPROTI, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); + } + } + + // Use general rotate by variable (per-element). + return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt); +} + static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering @@ -17759,8 +19226,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { case ISD::SADDO: // A subtract of one will be selected as a INC. Note that INC doesn't // set CF, so we can't do this for UADDO. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { BaseOp = X86ISD::INC; Cond = X86::COND_O; break; @@ -17775,8 +19241,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { case ISD::SSUBO: // A subtract of one will be selected as a DEC. Note that DEC doesn't // set CF, so we can't do this for USUBO. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) - if (C->isOne()) { + if (isOneConstant(RHS)) { BaseOp = X86ISD::DEC; Cond = X86::COND_O; break; @@ -17827,7 +19292,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. /// Used to know whether to use cmpxchg8/16b when expanding atomic operations /// (otherwise we leave them alone to become __sync_fetch_and_... calls). -bool X86TargetLowering::needsCmpXchgNb(const Type *MemType) const { +bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { unsigned OpWidth = MemType->getPrimitiveSizeInBits(); if (OpWidth == 64) @@ -17844,21 +19309,23 @@ bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { // Note: this turns large loads into lock cmpxchg8b/16b. // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b. -bool X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { +TargetLowering::AtomicExpansionKind +X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { auto PTy = cast<PointerType>(LI->getPointerOperand()->getType()); - return needsCmpXchgNb(PTy->getElementType()); + return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } -TargetLoweringBase::AtomicRMWExpansionKind +TargetLowering::AtomicExpansionKind X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; - const Type *MemType = AI->getType(); + Type *MemType = AI->getType(); // If the operand is too big, we must see if cmpxchg8/16b is available // and default to library calls otherwise. if (MemType->getPrimitiveSizeInBits() > NativeWidth) { - return needsCmpXchgNb(MemType) ? AtomicRMWExpansionKind::CmpXChg - : AtomicRMWExpansionKind::None; + return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; } AtomicRMWInst::BinOp Op = AI->getOperation(); @@ -17869,14 +19336,14 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::Add: case AtomicRMWInst::Sub: // It's better to use xadd, xsub or xchg for these in all cases. 
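For the XOP rotate lowering above, the per-lane semantics being targeted are an ordinary rotate-left, with a negative amount meaning rotate-right, per the comment in LowerRotate. A scalar reference model for one 32-bit lane (a sketch only, not code from the patch; the modulo-width reduction of the amount is an assumption of this model):

#include <cassert>
#include <cstdint>

uint32_t rotl32(uint32_t X, int Amt) {
  unsigned R = static_cast<unsigned>(Amt) & 31u;  // reduce to the lane width
  if (R == 0)
    return X;
  return (X << R) | (X >> (32u - R));
}

int main() {
  assert(rotl32(0x80000001u, 1) == 0x00000003u);
  assert(rotl32(0x00000003u, -1) == 0x80000001u); // negative amount rotates right
  assert(rotl32(0x12345678u, 0) == 0x12345678u);
}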
- return AtomicRMWExpansionKind::None; + return AtomicExpansionKind::None; case AtomicRMWInst::Or: case AtomicRMWInst::And: case AtomicRMWInst::Xor: // If the atomicrmw's result isn't actually used, we can just add a "lock" // prefix to a normal instruction for these operations. - return !AI->use_empty() ? AtomicRMWExpansionKind::CmpXChg - : AtomicRMWExpansionKind::None; + return !AI->use_empty() ? AtomicExpansionKind::CmpXChg + : AtomicExpansionKind::None; case AtomicRMWInst::Nand: case AtomicRMWInst::Max: case AtomicRMWInst::Min: @@ -17884,7 +19351,7 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { case AtomicRMWInst::UMin: // These always require a non-trivial set of data operations on x86. We must // use a cmpxchg loop. - return AtomicRMWExpansionKind::CmpXChg; + return AtomicExpansionKind::CmpXChg; } } @@ -17898,7 +19365,7 @@ static bool hasMFENCE(const X86Subtarget& Subtarget) { LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget->is64Bit() ? 64 : 32; - const Type *MemType = AI->getType(); + Type *MemType = AI->getType(); // Accesses larger than the native width are turned into cmpxchg/libcalls, so // there is no benefit in turning such RMWs into loads, and it is actually // harmful as it introduces a mfence. @@ -17926,7 +19393,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // lowered to just a load without a fence. A mfence flushes the store buffer, // making the optimization clearly correct. // FIXME: it is required if isAtLeastRelease(Order) but it is not clear - // otherwise, we might be able to be more agressive on relaxed idempotent + // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. if (SynchScope == SingleThread) @@ -18043,7 +19510,7 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, SDValue InVec = Op->getOperand(0); SDLoc dl(Op); unsigned NumElts = SrcVT.getVectorNumElements(); - EVT SVT = SrcVT.getVectorElementType(); + MVT SVT = SrcVT.getVectorElementType(); // Widen the vector in input in the case of MVT::v2i32. // Example: from MVT::v2i32 to MVT::v4i32. @@ -18103,7 +19570,8 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // chunks, thus directly computes the pop count for v2i64 and v4i64. if (EltVT == MVT::i64) { SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); - V = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, V, Zeros); + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); return DAG.getBitcast(VT, V); } @@ -18119,9 +19587,10 @@ static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, // Do the horizontal sums into two v2i64s. Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL); - Low = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); + Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, Low), Zeros); - High = DAG.getNode(X86ISD::PSADBW, DL, ByteVecVT, + High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, DAG.getBitcast(ByteVecVT, High), Zeros); // Merge them together. 
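LowerHorizontalByteSum above leans on the fact that PSADBW against an all-zero vector degenerates into a horizontal add of eight byte lanes into one i64 (the absolute differences |b - 0| are just the bytes themselves). A scalar model of that step, finishing a byte-wise population count the same way the vector code does; the helper names are invented for illustration:

#include <cassert>
#include <cstdint>

// What PSADBW against zero computes for one 64-bit chunk: the sum of its bytes.
uint64_t psadbw_vs_zero(uint64_t Bytes) {
  uint64_t Sum = 0;
  for (int I = 0; I < 8; ++I)
    Sum += (Bytes >> (8 * I)) & 0xFF;
  return Sum;
}

// Byte-wise popcount, standing in for the in-vector LUT/SWAR stage.
uint64_t per_byte_popcount(uint64_t V) {
  uint64_t R = 0;
  for (int I = 0; I < 8; ++I) {
    uint8_t B = (V >> (8 * I)) & 0xFF;
    int C = 0;
    for (; B; B &= B - 1) ++C;
    R |= static_cast<uint64_t>(C) << (8 * I);
  }
  return R;
}

int main() {
  uint64_t V = 0xF0F0F0F0F0F0F0F1ULL;             // 7*4 + 5 set bits
  assert(psadbw_vs_zero(per_byte_popcount(V)) == 33);
}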
@@ -18311,7 +19780,7 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget *Subtarget, static SDValue LowerCTPOP(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { - assert(Op.getValueType().isVector() && + assert(Op.getSimpleValueType().isVector() && "We only do custom lowering for vector population count."); return LowerVectorCTPOP(Op, Subtarget, DAG); } @@ -18357,7 +19826,7 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { - EVT VT = Op.getNode()->getSimpleValueType(0); + MVT VT = Op.getNode()->getSimpleValueType(0); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) @@ -18435,31 +19904,203 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); } +/// Widen a vector input to a vector of NVT. The +/// input vector must have the same element type as NVT. +static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, + bool FillWithZeroes = false) { + // Check if InOp already has the right width. + MVT InVT = InOp.getSimpleValueType(); + if (InVT == NVT) + return InOp; + + if (InOp.isUndef()) + return DAG.getUNDEF(NVT); + + assert(InVT.getVectorElementType() == NVT.getVectorElementType() && + "input and widen element type must match"); + + unsigned InNumElts = InVT.getVectorNumElements(); + unsigned WidenNumElts = NVT.getVectorNumElements(); + assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && + "Unexpected request for vector widening"); + + EVT EltVT = NVT.getVectorElementType(); + + SDLoc dl(InOp); + if (InOp.getOpcode() == ISD::CONCAT_VECTORS && + InOp.getNumOperands() == 2) { + SDValue N1 = InOp.getOperand(1); + if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || + N1.isUndef()) { + InOp = InOp.getOperand(0); + InVT = InOp.getSimpleValueType(); + InNumElts = InVT.getVectorNumElements(); + } + } + if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || + ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { + SmallVector<SDValue, 16> Ops; + for (unsigned i = 0; i < InNumElts; ++i) + Ops.push_back(InOp.getOperand(i)); + + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : + DAG.getUNDEF(EltVT); + for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) + Ops.push_back(FillVal); + return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops); + } + SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : + DAG.getUNDEF(NVT); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, + InOp, DAG.getIntPtrConstant(0, dl)); +} + static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); + // X86 scatter kills mask register, so its type should be added to + // the list of return values. + // If the "scatter" has 2 return values, it is already handled. 
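ExtendToType above, and the masked scatter/gather/load/store widening that follows, depend on one invariant: when a short masked operation is widened to the legal vector width, the extra mask lanes must be filled with zeroes so the new lanes stay inactive, while the extra data lanes may be undef. A scalar emulation of a widened masked store that illustrates the point (illustrative code, not the patch's):

#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>

// A 2-lane masked store widened to 8 lanes: lanes past the original width hold
// arbitrary ("undef") data, but their mask bits are zero, so nothing is stored.
template <std::size_t N>
void masked_store(std::array<int32_t, N> &Mem, const std::array<int32_t, N> &Data,
                  const std::array<bool, N> &Mask) {
  for (std::size_t i = 0; i < N; ++i)
    if (Mask[i])
      Mem[i] = Data[i];
}

int main() {
  std::array<int32_t, 8> Mem{};                                  // zero-initialized memory
  std::array<int32_t, 8> Data{10, 20, -1, -1, -1, -1, -1, -1};   // lanes 2..7 are "undef"
  std::array<bool, 8> Mask{true, false, false, false, false, false, false, false};
  masked_store(Mem, Data, Mask);                                 // only lane 0 is written
  std::cout << Mem[0] << ' ' << Mem[1] << ' ' << Mem[2] << '\n'; // prints: 10 0 0
}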
+ if (Op.getNode()->getNumValues() == 2) + return Op; + MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode()); - EVT VT = N->getValue().getValueType(); + SDValue Src = N->getValue(); + MVT VT = Src.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); SDLoc dl(Op); - // X86 scatter kills mask register, so its type should be added to - // the list of return values - if (N->getNumValues() == 1) { - SDValue Index = N->getIndex(); - if (!Subtarget->hasVLX() && !VT.is512BitVector() && - !Index.getValueType().is512BitVector()) + SDValue NewScatter; + SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Chain = N->getChain(); + SDValue BasePtr = N->getBasePtr(); + MVT MemVT = N->getMemoryVT().getSimpleVT(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) { + // The v2i32 value was promoted to v2i64. + // Now we "redo" the type legalizer's work and widen the original + // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64 + // with a shuffle. + assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) && + "Unexpected memory type"); + int ShuffleMask[] = {0, 2, -1, -1}; + Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src), + DAG.getUNDEF(MVT::v4i32), ShuffleMask); + // Now we have 4 elements instead of 2. + // Expand the index. + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4); + Index = ExtendToType(Index, NewIndexVT, DAG); + + // Expand the mask with zeroes + // Mask may be <2 x i64> or <2 x i1> at this moment + assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) && + "Unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + VT = MVT::v4i32; + } + + unsigned NumElts = VT.getVectorNumElements(); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. 
If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (IndexVT == MVT::v8i32) + // Just extend index Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + else { + // The minimal number of elts in scatter is 8 + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + // Use original index here, do not modify the index twice + Index = ExtendToType(N->getIndex(), NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + // Use the original mask here, do not modify the mask twice + Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true); + + // The value that should be stored + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src = ExtendToType(Src, NewVT, DAG); + } + } + // If the mask is "wide" at this point - truncate it to i1 vector + MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts); + Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask); + + // The mask is killed by scatter, add it to the values + SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other); + SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index}; + NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); + return SDValue(NewScatter.getNode(), 0); +} + +static SDValue LowerMLOAD(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { - SDVTList VTs = DAG.getVTList(N->getMask().getValueType(), MVT::Other); - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), Index }; + MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); + MVT VT = Op.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); - SDValue NewScatter = DAG.getMaskedScatter(VTs, VT, dl, Ops, N->getMemOperand()); - DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); - return SDValue(NewScatter.getNode(), 0); + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + !VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + SDValue Src0 = N->getSrc0(); + Src0 = ExtendToType(Src0, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), + N->getBasePtr(), Mask, Src0, + N->getMemoryVT(), N->getMemOperand(), + N->getExtensionType()); + + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewLoad.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewLoad.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); + } + return Op; +} + +static SDValue LowerMSTORE(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode()); + SDValue DataToStore = N->getValue(); + MVT VT = DataToStore.getSimpleValueType(); + SDValue Mask = N->getMask(); + SDLoc dl(Op); + + if (Subtarget->hasAVX512() && !Subtarget->hasVLX() && + 
!VT.is512BitVector() && Mask.getValueType() == MVT::v8i1) { + // This operation is legal for targets with VLX, but without + // VLX the vector should be widened to 512 bit + unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); + MVT WideDataVT = MVT::getVectorVT(VT.getScalarType(), NumEltsInWideVec); + MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); + DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); + Mask = ExtendToType(Mask, WideMaskVT, DAG, true); + return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), + Mask, N->getMemoryVT(), N->getMemOperand(), + N->isTruncatingStore()); } return Op; } @@ -18470,17 +20111,59 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget *Subtarget, "MGATHER/MSCATTER are supported on AVX-512 arch only"); MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode()); - EVT VT = Op.getValueType(); - assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); SDLoc dl(Op); - + MVT VT = Op.getSimpleValueType(); SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue Src0 = N->getValue(); + MVT IndexVT = Index.getSimpleValueType(); + MVT MaskVT = Mask.getSimpleValueType(); + + unsigned NumElts = VT.getVectorNumElements(); + assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); + if (!Subtarget->hasVLX() && !VT.is512BitVector() && - !Index.getValueType().is512BitVector()) { - Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), Index }; - DAG.UpdateNodeOperands(N, Ops); + !Index.getSimpleValueType().is512BitVector()) { + // AVX512F supports only 512-bit vectors. Or data or index should + // be 512 bit wide. If now the both index and data are 256-bit, but + // the vector contains 8 elements, we just sign-extend the index + if (NumElts == 8) { + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), + N->getOperand(3), Index }; + DAG.UpdateNodeOperands(N, Ops); + return Op; + } + + // Minimal number of elements in Gather + NumElts = 8; + // Index + MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts); + Index = ExtendToType(Index, NewIndexVT, DAG); + if (IndexVT.getScalarType() == MVT::i32) + Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); + + // Mask + MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts); + // At this point we have promoted mask operand + assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type"); + MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts); + Mask = ExtendToType(Mask, ExtMaskVT, DAG, true); + Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask); + + // The pass-thru value + MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts); + Src0 = ExtendToType(Src0, NewVT, DAG); + + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other), + N->getMemoryVT(), dl, Ops, + N->getMemOperand()); + SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewGather.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Exract, NewGather.getValue(1)}; + return DAG.getMergeValues(RetOps, dl); } return Op; } @@ -18572,6 +20255,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); case 
ISD::SETCC: return LowerSETCC(Op, DAG); + case ISD::SETCCE: return LowerSETCCE(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); @@ -18592,12 +20276,14 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); - case ISD::CTLZ: return LowerCTLZ(Op, DAG); - case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); - case ISD::CTTZ: return LowerCTTZ(Op, DAG); + case ISD::CTLZ: return LowerCTLZ(Op, Subtarget, DAG); + case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, Subtarget, DAG); + case ISD::CTTZ: + case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); + case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); @@ -18615,7 +20301,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::SMAX: + case ISD::SMIN: + case ISD::UMAX: + case ISD::UMIN: return LowerMINMAX(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); + case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); + case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); case ISD::GC_TRANSITION_START: @@ -18634,14 +20326,43 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, switch (N->getOpcode()) { default: llvm_unreachable("Do not know how to custom type legalize this operation!"); + case X86ISD::AVG: { + // Legalize types for X86ISD::AVG by expanding vectors. + assert(Subtarget->hasSSE2() && "Requires at least SSE2!"); + + auto InVT = N->getValueType(0); + auto InVTSize = InVT.getSizeInBits(); + const unsigned RegSize = + (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128; + assert((!Subtarget->hasAVX512() || RegSize < 512) && + "512-bit vector requires AVX512"); + assert((!Subtarget->hasAVX2() || RegSize < 256) && + "256-bit vector requires AVX2"); + + auto ElemVT = InVT.getVectorElementType(); + auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, + RegSize / ElemVT.getSizeInBits()); + assert(RegSize % InVT.getSizeInBits() == 0); + unsigned NumConcat = RegSize / InVT.getSizeInBits(); + + SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT)); + Ops[0] = N->getOperand(0); + SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + Ops[0] = N->getOperand(1); + SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); + + SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); + Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, + DAG.getIntPtrConstant(0, dl))); + return; + } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. 
case X86ISD::FMINC: case X86ISD::FMIN: case X86ISD::FMAXC: case X86ISD::FMAX: { EVT VT = N->getValueType(0); - if (VT != MVT::v2f32) - llvm_unreachable("Unexpected type (!= v2f32) on FMIN/FMAX."); + assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX."); SDValue UNDEF = DAG.getUNDEF(VT); SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, N->getOperand(0), UNDEF); @@ -18668,17 +20389,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return; } case ISD::FP_TO_SINT: - // FP_TO_INT*_IN_MEM is not legal for f16 inputs. Do not convert - // (FP_TO_SINT (load f16)) to FP_TO_INT*. - if (N->getOperand(0).getValueType() == MVT::f16) - break; - // fallthrough case ISD::FP_TO_UINT: { bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; - if (!IsSigned && !isIntegerTypeFTOL(SDValue(N, 0).getValueType())) - return; - std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true); SDValue FIST = Vals.first, StackSlot = Vals.second; @@ -18707,6 +20420,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, DAG.getBitcast(MVT::v2i64, VBias)); Or = DAG.getBitcast(MVT::v2f64, Or); + // TODO: Are there any fast-math-flags to propagate here? SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); return; @@ -18740,6 +20454,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results); } } + case ISD::INTRINSIC_WO_CHAIN: { + if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)) + Results.push_back(V); + return; + } case ISD::READCYCLECOUNTER: { return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); @@ -18748,7 +20467,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; - EVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; + MVT HalfT = Regs64bit ? 
MVT::i64 : MVT::i32; SDValue cpInL, cpInH; cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2), DAG.getConstant(0, dl, HalfT)); @@ -18884,6 +20603,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CMOV: return "X86ISD::CMOV"; case X86ISD::BRCOND: return "X86ISD::BRCOND"; case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; + case X86ISD::IRET: return "X86ISD::IRET"; case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; @@ -18910,6 +20630,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::ABS: return "X86ISD::ABS"; + case X86ISD::CONFLICT: return "X86ISD::CONFLICT"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND"; case X86ISD::FMIN: return "X86ISD::FMIN"; @@ -18937,12 +20658,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VZEXT: return "X86ISD::VZEXT"; case X86ISD::VSEXT: return "X86ISD::VSEXT"; case X86ISD::VTRUNC: return "X86ISD::VTRUNC"; - case X86ISD::VTRUNCM: return "X86ISD::VTRUNCM"; + case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS"; + case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS"; case X86ISD::VINSERT: return "X86ISD::VINSERT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::CVTDQ2PD: return "X86ISD::CVTDQ2PD"; case X86ISD::CVTUDQ2PD: return "X86ISD::CVTUDQ2PD"; + case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; @@ -18978,6 +20701,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TESTM: return "X86ISD::TESTM"; case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; + case X86ISD::KTEST: return "X86ISD::KTEST"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; @@ -19000,6 +20724,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UNPCKL: return "X86ISD::UNPCKL"; case X86ISD::UNPCKH: return "X86ISD::UNPCKH"; case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; + case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; @@ -19009,11 +20734,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMV3: return "X86ISD::VPERMV3"; case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3"; case X86ISD::VPERMI: return "X86ISD::VPERMI"; + case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; case X86ISD::VRANGE: return "X86ISD::VRANGE"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::PSADBW: return "X86ISD::PSADBW"; + case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW"; case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; @@ -19022,10 +20749,17 @@ const char 
*X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SFENCE: return "X86ISD::SFENCE"; case X86ISD::LFENCE: return "X86ISD::LFENCE"; case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA"; - case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL"; case X86ISD::SAHF: return "X86ISD::SAHF"; case X86ISD::RDRAND: return "X86ISD::RDRAND"; case X86ISD::RDSEED: return "X86ISD::RDSEED"; + case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; + case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; + case X86ISD::VPROT: return "X86ISD::VPROT"; + case X86ISD::VPROTI: return "X86ISD::VPROTI"; + case X86ISD::VPSHA: return "X86ISD::VPSHA"; + case X86ISD::VPSHL: return "X86ISD::VPSHL"; + case X86ISD::VPCOM: return "X86ISD::VPCOM"; + case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; case X86ISD::FMADD: return "X86ISD::FMADD"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; @@ -19038,7 +20772,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; - case X86ISD::RNDSCALE: return "X86ISD::RNDSCALE"; + case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; + case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; + case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; @@ -19064,6 +20800,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND"; case X86ISD::FP_TO_SINT_RND: return "X86ISD::FP_TO_SINT_RND"; case X86ISD::FP_TO_UINT_RND: return "X86ISD::FP_TO_UINT_RND"; + case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS"; + case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS"; } return nullptr; } @@ -19218,7 +20956,7 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; } bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { - if (!(Subtarget->hasFMA() || Subtarget->hasFMA4() || Subtarget->hasAVX512())) + if (!Subtarget->hasAnyFMA()) return false; VT = VT.getScalarType(); @@ -19253,11 +20991,11 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, return false; // Not for i1 vectors - if (VT.getScalarType() == MVT::i1) + if (VT.getSimpleVT().getScalarType() == MVT::i1) return false; // Very little shuffling can be done for 64-bit vectors right now. - if (VT.getSizeInBits() == 64) + if (VT.getSimpleVT().getSizeInBits() == 64) return false; // We only care that the types being shuffled are legal. The lowering can @@ -19282,8 +21020,7 @@ static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, DebugLoc DL = MI->getDebugLoc(); const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; + MachineFunction::iterator I = ++MBB->getIterator(); // For the v = xbegin(), we generate // @@ -19531,8 +21268,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr *MI, offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); endMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator MBBIter = MBB; - ++MBBIter; + MachineFunction::iterator MBBIter = ++MBB->getIterator(); // Insert the new basic blocks MF->insert(MBBIter, offsetMBB); @@ -19702,8 +21438,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( // stores were performed. 
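A few hunks back, the v2i32 UINT_TO_FP path in ReplaceNodeResults ORs the zero-extended integer into the constant VBias and then subtracts VBias as a double. The bias constant customarily used for this trick is 2^52 (bit pattern 0x4330000000000000); that value is assumed here because the constant's definition lies outside the hunk. A scalarized, self-contained check of the identity (the function name is made up):

#include <cassert>
#include <cstdint>
#include <cstring>

double uint32_to_double_via_bias(uint32_t U) {
  const uint64_t BiasBits = 0x4330000000000000ULL; // bit pattern of 2^52
  uint64_t Bits = BiasBits | static_cast<uint64_t>(U); // OR U into the mantissa
  double D, Bias;
  std::memcpy(&D, &Bits, sizeof(D));
  std::memcpy(&Bias, &BiasBits, sizeof(Bias));
  return D - Bias;                                 // 2^52 + U, minus 2^52
}

int main() {
  for (uint64_t U : {0ull, 1ull, 12345ull, 0x7fffffffull, 0xffffffffull})
    assert(uint32_to_double_via_bias(static_cast<uint32_t>(U)) ==
           static_cast<double>(U));
}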
const BasicBlock *LLVM_BB = MBB->getBasicBlock(); MachineFunction *F = MBB->getParent(); - MachineFunction::iterator MBBIter = MBB; - ++MBBIter; + MachineFunction::iterator MBBIter = ++MBB->getIterator(); MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(MBBIter, XMMSaveMBB); @@ -19727,7 +21462,7 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); - if (!Subtarget->isTargetWin64()) { + if (!Subtarget->isCallingConvWin64(F->getFunction()->getCallingConv())) { // If %al is 0, branch around the XMM save block. BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); @@ -19744,9 +21479,8 @@ X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( // In the XMM save block, save all the XMM argument registers. for (int i = 3, e = MI->getNumOperands() - 1; i != e; ++i) { int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; - MachineMemOperand *MMO = - F->getMachineMemOperand( - MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), + MachineMemOperand *MMO = F->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset), MachineMemOperand::MOStore, /*Size=*/16, /*Align=*/16); BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc)) @@ -19800,6 +21534,39 @@ static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, return true; } +// Return true if it is OK for this CMOV pseudo-opcode to be cascaded +// together with other CMOV pseudo-opcodes into a single basic-block with +// conditional jump around it. +static bool isCMOVPseudo(MachineInstr *MI) { + switch (MI->getOpcode()) { + case X86::CMOV_FR32: + case X86::CMOV_FR64: + case X86::CMOV_GR8: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: + case X86::CMOV_V2F64: + case X86::CMOV_V2I64: + case X86::CMOV_V4F32: + case X86::CMOV_V4F64: + case X86::CMOV_V4I64: + case X86::CMOV_V16F32: + case X86::CMOV_V8F32: + case X86::CMOV_V8F64: + case X86::CMOV_V8I64: + case X86::CMOV_V8I1: + case X86::CMOV_V16I1: + case X86::CMOV_V32I1: + case X86::CMOV_V64I1: + return true; + + default: + return false; + } +} + MachineBasicBlock * X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -19811,8 +21578,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -19823,8 +21589,41 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, MachineBasicBlock *thisMBB = BB; MachineFunction *F = BB->getParent(); - // We also lower double CMOVs: + // This code lowers all pseudo-CMOV instructions. Generally it lowers these + // as described above, by inserting a BB, and then making a PHI at the join + // point to select the true and false operands of the CMOV in the PHI. + // + // The code also handles two different cases of multiple CMOV opcodes + // in a row. + // + // Case 1: + // In this case, there are multiple CMOVs in a row, all which are based on + // the same condition setting (or the exact opposite condition setting). 
+ // In this case we can lower all the CMOVs using a single inserted BB, and + // then make a number of PHIs at the join point to model the CMOVs. The only + // trickiness here, is that in a case like: + // + // t2 = CMOV cond1 t1, f1 + // t3 = CMOV cond1 t2, f2 + // + // when rewriting this into PHIs, we have to perform some renaming on the + // temps since you cannot have a PHI operand refer to a PHI result earlier + // in the same block. The "simple" but wrong lowering would be: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t2(BB1), f2(BB2) + // + // but clearly t2 is not defined in BB1, so that is incorrect. The proper + // renaming is to note that on the path through BB1, t2 is really just a + // copy of t1, and do that renaming, properly generating: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t1(BB1), f2(BB2) + // + // Case 2, we lower cascaded CMOVs such as + // // (CMOV (CMOV F, T, cc1), T, cc2) + // // to two successives branches. For that, we look for another CMOV as the // following instruction. // @@ -19890,19 +21689,42 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // .LBB5_4: // retq // - MachineInstr *NextCMOV = nullptr; + MachineInstr *CascadedCMOV = nullptr; + MachineInstr *LastCMOV = MI; + X86::CondCode CC = X86::CondCode(MI->getOperand(3).getImm()); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); MachineBasicBlock::iterator NextMIIt = std::next(MachineBasicBlock::iterator(MI)); - if (NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && + + // Check for case 1, where there are multiple CMOVs with the same condition + // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the + // number of jumps the most. + + if (isCMOVPseudo(MI)) { + // See if we have a string of CMOVS with the same condition. + while (NextMIIt != BB->end() && + isCMOVPseudo(NextMIIt) && + (NextMIIt->getOperand(3).getImm() == CC || + NextMIIt->getOperand(3).getImm() == OppCC)) { + LastCMOV = &*NextMIIt; + ++NextMIIt; + } + } + + // This checks for case 2, but only do this if we didn't already find + // case 1, as indicated by LastCMOV == MI. + if (LastCMOV == MI && + NextMIIt != BB->end() && NextMIIt->getOpcode() == MI->getOpcode() && NextMIIt->getOperand(2).getReg() == MI->getOperand(2).getReg() && - NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) - NextCMOV = &*NextMIIt; + NextMIIt->getOperand(1).getReg() == MI->getOperand(0).getReg()) { + CascadedCMOV = &*NextMIIt; + } MachineBasicBlock *jcc1MBB = nullptr; - // If we have a double CMOV, we lower it to two successive branches to + // If we have a cascaded CMOV, we lower it to two successive branches to // the same block. EFLAGS is used by both, so mark it as live in the second. - if (NextCMOV) { + if (CascadedCMOV) { jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, jcc1MBB); jcc1MBB->addLiveIn(X86::EFLAGS); @@ -19917,7 +21739,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); - MachineInstr *LastEFLAGSUser = NextCMOV ? NextCMOV : MI; + MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV; if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { copy0MBB->addLiveIn(X86::EFLAGS); @@ -19926,12 +21748,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // Transfer the remainder of BB and its successor edges to sinkMBB. 
sinkMBB->splice(sinkMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(MI)), BB->end()); + std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); // Add the true and fallthrough blocks as its successors. - if (NextCMOV) { - // The fallthrough block may be jcc1MBB, if we have a double CMOV. + if (CascadedCMOV) { + // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV. BB->addSuccessor(jcc1MBB); // In that case, jcc1MBB will itself fallthrough the copy0MBB, and @@ -19946,13 +21768,12 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, BB->addSuccessor(sinkMBB); // Create the conditional branch instruction. - unsigned Opc = - X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); + unsigned Opc = X86::GetCondBranchFromCond(CC); BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); - if (NextCMOV) { + if (CascadedCMOV) { unsigned Opc2 = X86::GetCondBranchFromCond( - (X86::CondCode)NextCMOV->getOperand(3).getImm()); + (X86::CondCode)CascadedCMOV->getOperand(3).getImm()); BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); } @@ -19964,28 +21785,110 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... - MachineInstrBuilder MIB = - BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), - MI->getOperand(0).getReg()) - .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) - .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); + MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); + MachineBasicBlock::iterator MIItEnd = + std::next(MachineBasicBlock::iterator(LastCMOV)); + MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin(); + DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; + MachineInstrBuilder MIB; + + // As we are creating the PHIs, we have to be careful if there is more than + // one. Later CMOVs may reference the results of earlier CMOVs, but later + // PHIs have to reference the individual true/false inputs from earlier PHIs. + // That also means that PHI construction must work forward from earlier to + // later, and that the code must maintain a mapping from earlier PHI's + // destination registers, and the registers that went into the PHI. - // If we have a double CMOV, the second Jcc provides the same incoming + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { + unsigned DestReg = MIIt->getOperand(0).getReg(); + unsigned Op1Reg = MIIt->getOperand(1).getReg(); + unsigned Op2Reg = MIIt->getOperand(2).getReg(); + + // If this CMOV we are generating is the opposite condition from + // the jump we generated, then we have to swap the operands for the + // PHI that is going to be generated. + if (MIIt->getOperand(3).getImm() == OppCC) + std::swap(Op1Reg, Op2Reg); + + if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) + Op1Reg = RegRewriteTable[Op1Reg].first; + + if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) + Op2Reg = RegRewriteTable[Op2Reg].second; + + MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL, + TII->get(X86::PHI), DestReg) + .addReg(Op1Reg).addMBB(copy0MBB) + .addReg(Op2Reg).addMBB(thisMBB); + + // Add this PHI to the rewrite table. + RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); + } + + // If we have a cascaded CMOV, the second Jcc provides the same incoming // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). 
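The renaming problem described in the Case 1 comment above (a later CMOV reading an earlier CMOV's result) is what RegRewriteTable solves: each PHI records which value arrives from each predecessor, and later operands that name an earlier destination are replaced by those recorded values. A toy standalone model of just that bookkeeping, using string register names, with the predecessor labels and the opposite-condition swap simplified away (purely illustrative):

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct Cmov { std::string Dst, TVal, FVal; }; // Dst = cond ? TVal : FVal

int main() {
  std::vector<Cmov> Chain = {{"t2", "t1", "f1"}, {"t3", "t2", "f2"}};
  // Maps an earlier PHI destination to its (true-path, false-path) incoming values.
  std::map<std::string, std::pair<std::string, std::string>> RegRewriteTable;
  for (const Cmov &C : Chain) {
    std::string Op1 = C.TVal, Op2 = C.FVal;
    auto It1 = RegRewriteTable.find(Op1);
    if (It1 != RegRewriteTable.end())
      Op1 = It1->second.first;               // value flowing in on the true path
    auto It2 = RegRewriteTable.find(Op2);
    if (It2 != RegRewriteTable.end())
      Op2 = It2->second.second;              // value flowing in on the false path
    std::cout << C.Dst << " = PHI [" << Op1 << ", thisMBB], ["
              << Op2 << ", copy0MBB]\n";     // prints t3 = PHI [t1, ...], [f2, ...]
    RegRewriteTable[C.Dst] = {Op1, Op2};
  }
}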
- if (NextCMOV) { + if (CascadedCMOV) { MIB.addReg(MI->getOperand(2).getReg()).addMBB(jcc1MBB); // Copy the PHI result to the register defined by the second CMOV. BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), - DL, TII->get(TargetOpcode::COPY), NextCMOV->getOperand(0).getReg()) + DL, TII->get(TargetOpcode::COPY), + CascadedCMOV->getOperand(0).getReg()) .addReg(MI->getOperand(0).getReg()); - NextCMOV->eraseFromParent(); + CascadedCMOV->eraseFromParent(); } - MI->eraseFromParent(); // The pseudo instruction is gone now. + // Now remove the CMOV(s). + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ) + (MIIt++)->eraseFromParent(); + return sinkMBB; } MachineBasicBlock * +X86TargetLowering::EmitLoweredAtomicFP(MachineInstr *MI, + MachineBasicBlock *BB) const { + // Combine the following atomic floating-point modification pattern: + // a.store(reg OP a.load(acquire), release) + // Transform them into: + // OPss (%gpr), %xmm + // movss %xmm, (%gpr) + // Or sd equivalent for 64-bit operations. + unsigned MOp, FOp; + switch (MI->getOpcode()) { + default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP"); + case X86::RELEASE_FADD32mr: MOp = X86::MOVSSmr; FOp = X86::ADDSSrm; break; + case X86::RELEASE_FADD64mr: MOp = X86::MOVSDmr; FOp = X86::ADDSDrm; break; + } + const X86InstrInfo *TII = Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + MachineOperand MSrc = MI->getOperand(0); + unsigned VSrc = MI->getOperand(5).getReg(); + const MachineOperand &Disp = MI->getOperand(3); + MachineOperand ZeroDisp = MachineOperand::CreateImm(0); + bool hasDisp = Disp.isGlobal() || Disp.isImm(); + if (hasDisp && MSrc.isReg()) + MSrc.setIsKill(false); + MachineInstrBuilder MIM = BuildMI(*BB, MI, DL, TII->get(MOp)) + .addOperand(/*Base=*/MSrc) + .addImm(/*Scale=*/1) + .addReg(/*Index=*/0) + .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0) + .addReg(0); + MachineInstr *MIO = BuildMI(*BB, (MachineInstr *)MIM, DL, TII->get(FOp), + MRI.createVirtualRegister(MRI.getRegClass(VSrc))) + .addReg(VSrc) + .addOperand(/*Base=*/MSrc) + .addImm(/*Scale=*/1) + .addReg(/*Index=*/0) + .addDisp(hasDisp ? Disp : ZeroDisp, /*off=*/0) + .addReg(/*Segment=*/0); + MIM.addReg(MIO->getOperand(0).getReg(), RegState::Kill); + MI->eraseFromParent(); // The pseudo instruction is gone now. + return BB; +} + +MachineBasicBlock * X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); @@ -20032,8 +21935,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, sizeVReg = MI->getOperand(1).getReg(), physSPReg = IsLP64 || Subtarget->isTargetNaCl64() ? X86::RSP : X86::ESP; - MachineFunction::iterator MBBIter = BB; - ++MBBIter; + MachineFunction::iterator MBBIter = ++BB->getIterator(); MF->insert(MBBIter, bumpMBB); MF->insert(MBBIter, mallocMBB); @@ -20120,14 +22022,60 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock * X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const { + assert(!Subtarget->isTargetMachO()); DebugLoc DL = MI->getDebugLoc(); + MachineInstr *ResumeMI = Subtarget->getFrameLowering()->emitStackProbe( + *BB->getParent(), *BB, MI, DL, false); + MachineBasicBlock *ResumeBB = ResumeMI->getParent(); + MI->eraseFromParent(); // The pseudo instruction is gone now. 
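The RELEASE_FADD pseudos handled by EmitLoweredAtomicFP above match the source-level shape quoted in its comment: an acquire load, a floating-point add in a register, and a release store back to the same location, which the lowering then folds into a memory-operand ADDSS/ADDSD followed by a MOVSS/MOVSD. Written out at the C++ level (an illustrative example of the pattern, not code from the patch):

#include <atomic>

void atomic_fadd_release(std::atomic<float> &A, float X) {
  // Note: a load/store pair, not an atomic RMW; this is exactly the pattern
  // named in the comment above.
  A.store(A.load(std::memory_order_acquire) + X, std::memory_order_release);
}

int main() {
  std::atomic<float> A{1.0f};
  atomic_fadd_release(A, 2.5f);
  return A.load() == 3.5f ? 0 : 1;
}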
+ return ResumeBB; +} - assert(!Subtarget->isTargetMachO()); +MachineBasicBlock * +X86TargetLowering::EmitLoweredCatchRet(MachineInstr *MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); + MachineBasicBlock *TargetMBB = MI->getOperand(0).getMBB(); + DebugLoc DL = MI->getDebugLoc(); - Subtarget->getFrameLowering()->emitStackProbeCall(*BB->getParent(), *BB, MI, - DL); + assert(!isAsynchronousEHPersonality( + classifyEHPersonality(MF->getFunction()->getPersonalityFn())) && + "SEH does not use catchret!"); - MI->eraseFromParent(); // The pseudo instruction is gone now. + // Only 32-bit EH needs to worry about manually restoring stack pointers. + if (!Subtarget->is32Bit()) + return BB; + + // C++ EH creates a new target block to hold the restore code, and wires up + // the new block to the return destination with a normal JMP_4. + MachineBasicBlock *RestoreMBB = + MF->CreateMachineBasicBlock(BB->getBasicBlock()); + assert(BB->succ_size() == 1); + MF->insert(std::next(BB->getIterator()), RestoreMBB); + RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); + BB->addSuccessor(RestoreMBB); + MI->getOperand(0).setMBB(RestoreMBB); + + auto RestoreMBBI = RestoreMBB->begin(); + BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE)); + BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); + return BB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredCatchPad(MachineInstr *MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + const Constant *PerFn = MF->getFunction()->getPersonalityFn(); + bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); + // Only 32-bit SEH requires special handling for catchpad. + if (IsSEH && Subtarget->is32Bit()) { + const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE)); + } + MI->eraseFromParent(); return BB; } @@ -20149,6 +22097,8 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, // FIXME: The 32-bit calls have non-standard calling conventions. Use a // proper register mask. const uint32_t *RegMask = + Subtarget->is64Bit() ? 
+ Subtarget->getRegisterInfo()->getDarwinTLSCallPreservedMask() : Subtarget->getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); if (Subtarget->is64Bit()) { MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, @@ -20198,8 +22148,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MachineRegisterInfo &MRI = MF->getRegInfo(); const BasicBlock *BB = MBB->getBasicBlock(); - MachineFunction::iterator I = MBB; - ++I; + MachineFunction::iterator I = ++MBB->getIterator(); // Memory Reference MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); @@ -20225,7 +22174,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, // For v = setjmp(buf), we generate // // thisMBB: - // buf[LabelOffset] = restoreMBB + // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB // SjLjSetup restoreMBB // // mainMBB: @@ -20245,6 +22194,7 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, MF->insert(I, mainMBB); MF->insert(I, sinkMBB); MF->push_back(restoreMBB); + restoreMBB->setHasAddressTaken(); MachineInstrBuilder MIB; @@ -20511,35 +22461,44 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return BB; case X86::WIN_ALLOCA: return EmitLoweredWinAlloca(MI, BB); + case X86::CATCHRET: + return EmitLoweredCatchRet(MI, BB); + case X86::CATCHPAD: + return EmitLoweredCatchPad(MI, BB); case X86::SEG_ALLOCA_32: case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB); case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); - case X86::CMOV_GR8: case X86::CMOV_FR32: case X86::CMOV_FR64: - case X86::CMOV_V4F32: + case X86::CMOV_FR128: + case X86::CMOV_GR8: + case X86::CMOV_GR16: + case X86::CMOV_GR32: + case X86::CMOV_RFP32: + case X86::CMOV_RFP64: + case X86::CMOV_RFP80: case X86::CMOV_V2F64: case X86::CMOV_V2I64: - case X86::CMOV_V8F32: + case X86::CMOV_V4F32: case X86::CMOV_V4F64: case X86::CMOV_V4I64: case X86::CMOV_V16F32: + case X86::CMOV_V8F32: case X86::CMOV_V8F64: case X86::CMOV_V8I64: - case X86::CMOV_GR16: - case X86::CMOV_GR32: - case X86::CMOV_RFP32: - case X86::CMOV_RFP64: - case X86::CMOV_RFP80: case X86::CMOV_V8I1: case X86::CMOV_V16I1: case X86::CMOV_V32I1: case X86::CMOV_V64I1: return EmitLoweredSelect(MI, BB); + case X86::RELEASE_FADD32mr: + case X86::RELEASE_FADD64mr: + return EmitLoweredAtomicFP(MI, BB); + case X86::FP32_TO_INT16_IN_MEM: case X86::FP32_TO_INT32_IN_MEM: case X86::FP32_TO_INT64_IN_MEM: @@ -20793,7 +22752,7 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( unsigned Depth) const { // SETCC_CARRY sets the dest to ~0 for true or 0 for false. if (Op.getOpcode() == X86ISD::SETCC_CARRY) - return Op.getValueType().getScalarType().getSizeInBits(); + return Op.getValueType().getScalarSizeInBits(); // Fallback case. 
return 1; @@ -20814,39 +22773,8 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N, return TargetLowering::isGAPlusOffset(N, GA, Offset); } -/// isShuffleHigh128VectorInsertLow - Checks whether the shuffle node is the -/// same as extracting the high 128-bit part of 256-bit vector and then -/// inserting the result into the low part of a new 256-bit vector -static bool isShuffleHigh128VectorInsertLow(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - - // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - for (unsigned i = 0, j = NumElems/2; i != NumElems/2; ++i, ++j) - if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || - SVOp->getMaskElt(j) >= 0) - return false; - - return true; -} - -/// isShuffleLow128VectorInsertHigh - Checks whether the shuffle node is the -/// same as extracting the low 128-bit part of 256-bit vector and then -/// inserting the result into the high part of a new 256-bit vector -static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) { - EVT VT = SVOp->getValueType(0); - unsigned NumElems = VT.getVectorNumElements(); - - // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> - for (unsigned i = NumElems/2, j = 0; i != NumElems; ++i, ++j) - if (!isUndefOrEqual(SVOp->getMaskElt(i), j) || - SVOp->getMaskElt(j) >= 0) - return false; - - return true; -} - /// PerformShuffleCombine256 - Performs shuffle combines for 256-bit vectors. +/// FIXME: This could be expanded to support 512 bit vectors as well. static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget* Subtarget) { @@ -20854,7 +22782,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); - EVT VT = SVOp->getValueType(0); + MVT VT = SVOp->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); if (V1.getOpcode() == ISD::CONCAT_VECTORS && @@ -20920,24 +22848,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG, return DCI.CombineTo(N, InsV); } - //===--------------------------------------------------------------------===// - // Combine some shuffles into subvector extracts and inserts: - // - - // vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> - if (isShuffleHigh128VectorInsertLow(SVOp)) { - SDValue V = Extract128BitVector(V1, NumElems/2, DAG, dl); - SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, 0, DAG, dl); - return DCI.CombineTo(N, InsV); - } - - // vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> - if (isShuffleLow128VectorInsertHigh(SVOp)) { - SDValue V = Extract128BitVector(V1, 0, DAG, dl); - SDValue InsV = Insert128BitVector(DAG.getUNDEF(VT), V, NumElems/2, DAG, dl); - return DCI.CombineTo(N, InsV); - } - return SDValue(); } @@ -20966,10 +22876,22 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, MVT RootVT = Root.getSimpleValueType(); SDLoc DL(Root); - // Just remove no-op shuffle masks. if (Mask.size() == 1) { - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input), - /*AddTo*/ true); + int Index = Mask[0]; + assert((Index >= 0 || Index == SM_SentinelUndef || + Index == SM_SentinelZero) && + "Invalid shuffle index found!"); + + // We may end up with an accumulated mask of size 1 as a result of + // widening of shuffle operands (see function canWidenShuffleElements). 
+ // If the only shuffle index is equal to SM_SentinelZero then propagate + // a zero vector. Otherwise, the combine shuffle mask is a no-op shuffle + // mask, and therefore the entire chain of shuffles can be folded away. + if (Index == SM_SentinelZero) + DCI.CombineTo(Root.getNode(), getZeroVector(RootVT, Subtarget, DAG, DL)); + else + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Input), + /*AddTo*/ true); return true; } @@ -20985,7 +22907,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, // doesn't preclude something switching to the shorter encoding post-RA. // // FIXME: Should teach these routines about AVX vector widths. - if (FloatDomain && VT.getSizeInBits() == 128) { + if (FloatDomain && VT.is128BitVector()) { if (Mask.equals({0, 0}) || Mask.equals({1, 1})) { bool Lo = Mask.equals({0, 0}); unsigned Shuffle; @@ -21049,7 +22971,7 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask, // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK // variants as none of these have single-instruction variants that are // superior to the UNPCK formulation. - if (!FloatDomain && VT.getSizeInBits() == 128 && + if (!FloatDomain && VT.is128BitVector() && (Mask.equals({0, 0, 1, 1, 2, 2, 3, 3}) || Mask.equals({4, 4, 5, 5, 6, 6, 7, 7}) || Mask.equals({0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}) || @@ -21226,26 +23148,28 @@ static bool combineX86ShufflesRecursively(SDValue Op, SDValue Root, // See if we can recurse into the operand to combine more things. switch (Op.getOpcode()) { - case X86ISD::PSHUFB: - HasPSHUFB = true; - case X86ISD::PSHUFD: - case X86ISD::PSHUFHW: - case X86ISD::PSHUFLW: - if (Op.getOperand(0).hasOneUse() && - combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, - HasPSHUFB, DAG, DCI, Subtarget)) - return true; - break; + case X86ISD::PSHUFB: + HasPSHUFB = true; + case X86ISD::PSHUFD: + case X86ISD::PSHUFHW: + case X86ISD::PSHUFLW: + if (Op.getOperand(0).hasOneUse() && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; - case X86ISD::UNPCKL: - case X86ISD::UNPCKH: - assert(Op.getOperand(0) == Op.getOperand(1) && "We only combine unary shuffles!"); - // We can't check for single use, we have to check that this shuffle is the only user. - if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && - combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, - HasPSHUFB, DAG, DCI, Subtarget)) - return true; - break; + case X86ISD::UNPCKL: + case X86ISD::UNPCKH: + assert(Op.getOperand(0) == Op.getOperand(1) && + "We only combine unary shuffles!"); + // We can't check for single use, we have to check that this shuffle is the + // only user. + if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && + combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, + HasPSHUFB, DAG, DCI, Subtarget)) + return true; + break; } // Minor canonicalization of the accumulated shuffle mask to make it easier @@ -21360,8 +23284,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, case X86ISD::UNPCKH: // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword // shuffle into a preceding word shuffle. 
- if (V.getSimpleValueType().getScalarType() != MVT::i8 && - V.getSimpleValueType().getScalarType() != MVT::i16) + if (V.getSimpleValueType().getVectorElementType() != MVT::i8 && + V.getSimpleValueType().getVectorElementType() != MVT::i16) return SDValue(); // Search for a half-shuffle which we can combine with. @@ -21438,7 +23362,8 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask, return V; } -/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw. +/// \brief Search for a combinable shuffle across a chain ending in pshuflw or +/// pshufhw. /// /// We walk up the chain, skipping shuffles of the other half and looking /// through shuffles which switch halves trying to find a shuffle of the same @@ -21520,6 +23445,41 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, Mask = getPSHUFShuffleMask(N); assert(Mask.size() == 4); break; + case X86ISD::UNPCKL: { + // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in + // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE + // moves upper half elements into the lower half part. For example: + // + // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1, + // undef:v16i8 + // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2 + // + // will be combined to: + // + // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1 + + // This is only for 128-bit vectors. From SSE4.1 onward this combine may not + // happen due to advanced instructions. + if (!VT.is128BitVector()) + return SDValue(); + + auto Op0 = N.getOperand(0); + auto Op1 = N.getOperand(1); + if (Op0.getOpcode() == ISD::UNDEF && + Op1.getNode()->getOpcode() == ISD::VECTOR_SHUFFLE) { + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask(); + + unsigned NumElts = VT.getVectorNumElements(); + SmallVector<int, 8> ExpectedMask(NumElts, -1); + std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2, + NumElts / 2); + + auto ShufOp = Op1.getOperand(0); + if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask)) + return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp); + } + return SDValue(); + } default: return SDValue(); } @@ -21535,7 +23495,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, break; case X86ISD::PSHUFLW: case X86ISD::PSHUFHW: - assert(VT.getScalarType() == MVT::i16 && "Bad word shuffle type!"); + assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!"); if (combineRedundantHalfShuffle(N, Mask, DAG, DCI)) return SDValue(); // We combined away this shuffle, so we're done. @@ -21624,14 +23584,19 @@ static SDValue combineShuffleToAddSub(SDNode *N, SelectionDAG &DAG) { return SDValue(); auto *SVN = cast<ShuffleVectorSDNode>(N); - ArrayRef<int> Mask = SVN->getMask(); + SmallVector<int, 8> Mask; + for (int M : SVN->getMask()) + Mask.push_back(M); + SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); - // We require the first shuffle operand to be the SUB node, and the second to - // be the ADD node. - // FIXME: We should support the commuted patterns. - if (V1->getOpcode() != ISD::FSUB || V2->getOpcode() != ISD::FADD) + // We require the first shuffle operand to be the FSUB node, and the second to + // be the FADD node. 
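combineShuffleToAddSub now also accepts the commuted (FADD, FSUB) operand order by swapping the operands and commuting the mask. A scalar sketch of the identity it relies on (the 4-lane width and names are illustrative, not from the patch):

#include <cassert>

int main() {
  float a[4] = {1, 2, 3, 4}, b[4] = {10, 20, 30, 40};
  float sub[4], add[4], blended[4];
  for (int i = 0; i < 4; ++i) {
    sub[i] = a[i] - b[i];
    add[i] = a[i] + b[i];
    // Shuffle mask <0,5,2,7>: even lanes come from the FSUB result and odd
    // lanes from the FADD result -- the lane pattern addsubps computes.
    blended[i] = (i % 2 == 0) ? sub[i] : add[i];
  }
  for (int i = 0; i < 4; ++i)
    assert(blended[i] == ((i % 2) ? a[i] + b[i] : a[i] - b[i]));
  return 0;
}

If the shuffle's inputs arrive as (FADD, FSUB) instead, swapping them while commuting the mask selects exactly the same values, which is what the new branch above does before the existing matching continues.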
+ if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) { + ShuffleVectorSDNode::commuteMask(Mask); + std::swap(V1, V2); + } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD) return SDValue(); // If there are other uses of these operations we can't fold them. @@ -21682,7 +23647,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, return AddSub; // Combine 256-bit vector shuffles. This is only profitable when in AVX mode - if (Subtarget->hasFp256() && VT.is256BitVector() && + if (TLI.isTypeLegal(VT) && Subtarget->hasFp256() && VT.is256BitVector() && N->getOpcode() == ISD::VECTOR_SHUFFLE) return PerformShuffleCombine256(N, DAG, DCI, Subtarget); @@ -21866,21 +23831,45 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, EltNo); } -/// \brief Detect bitcasts between i32 to x86mmx low word. Since MMX types are -/// special and don't usually play with other vector types, it's better to -/// handle them early to be sure we emit efficient code by avoiding -/// store-load conversions. -static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG) { - if (N->getValueType(0) != MVT::x86mmx || - N->getOperand(0)->getOpcode() != ISD::BUILD_VECTOR || - N->getOperand(0)->getValueType(0) != MVT::v2i32) - return SDValue(); +static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); - SDValue V = N->getOperand(0); - ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(1)); - if (C && C->getZExtValue() == 0 && V.getOperand(0).getValueType() == MVT::i32) - return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(V.getOperand(0)), - N->getValueType(0), V.getOperand(0)); + // Detect bitcasts between i32 to x86mmx low word. Since MMX types are + // special and don't usually play with other vector types, it's better to + // handle them early to be sure we emit efficient code by avoiding + // store-load conversions. + if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR && + N0.getValueType() == MVT::v2i32 && + isNullConstant(N0.getOperand(1))) { + SDValue N00 = N0->getOperand(0); + if (N00.getValueType() == MVT::i32) + return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00); + } + + // Convert a bitcasted integer logic operation that has one bitcasted + // floating-point operand and one constant operand into a floating-point + // logic operation. This may create a load of the constant, but that is + // cheaper than materializing the constant in an integer register and + // transferring it to an SSE register or transferring the SSE operand to + // integer register and back. 
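The comment closing the chunk above motivates rewriting an integer AND/OR/XOR whose operands are a bitcasted float and a constant into FAND/FOR/FXOR. The scalar equivalence it relies on, with an invented helper name purely for illustration:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

float abs_via_int_logic(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof bits);   // bitcast f32 -> i32
  bits &= 0x7fffffffu;                   // integer AND with a constant mask
  std::memcpy(&f, &bits, sizeof bits);   // bitcast i32 -> f32
  return f;
}

int main() {
  // Clearing the sign bit through integer logic is the same computation that
  // andps performs directly in the FP domain, without a GPR<->XMM round trip.
  assert(abs_via_int_logic(-2.5f) == std::fabs(-2.5f));
  return 0;
}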
+ unsigned FPOpcode; + switch (N0.getOpcode()) { + case ISD::AND: FPOpcode = X86ISD::FAND; break; + case ISD::OR: FPOpcode = X86ISD::FOR; break; + case ISD::XOR: FPOpcode = X86ISD::FXOR; break; + default: return SDValue(); + } + if (((Subtarget->hasSSE1() && VT == MVT::f32) || + (Subtarget->hasSSE2() && VT == MVT::f64)) && + isa<ConstantSDNode>(N0.getOperand(1)) && + N0.getOperand(0).getOpcode() == ISD::BITCAST && + N0.getOperand(0).getOperand(0).getValueType() == VT) { + SDValue N000 = N0.getOperand(0).getOperand(0); + SDValue FPConst = DAG.getBitcast(VT, N0.getOperand(1)); + return DAG.getNode(FPOpcode, SDLoc(N0), VT, N000, FPConst); + } return SDValue(); } @@ -21910,26 +23899,26 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, InputVector.getNode()->getOperand(0)); // The mmx is indirect: (i64 extract_elt (v1i64 bitcast (x86mmx ...))). - SDValue MMXSrcOp = MMXSrc.getOperand(0); if (MMXSrc.getOpcode() == ISD::EXTRACT_VECTOR_ELT && MMXSrc.hasOneUse() && - MMXSrc.getValueType() == MVT::i64 && MMXSrcOp.hasOneUse() && - MMXSrcOp.getOpcode() == ISD::BITCAST && - MMXSrcOp.getValueType() == MVT::v1i64 && - MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx) - return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), - N->getValueType(0), - MMXSrcOp.getOperand(0)); + MMXSrc.getValueType() == MVT::i64) { + SDValue MMXSrcOp = MMXSrc.getOperand(0); + if (MMXSrcOp.hasOneUse() && MMXSrcOp.getOpcode() == ISD::BITCAST && + MMXSrcOp.getValueType() == MVT::v1i64 && + MMXSrcOp.getOperand(0).getValueType() == MVT::x86mmx) + return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector), + N->getValueType(0), MMXSrcOp.getOperand(0)); + } } EVT VT = N->getValueType(0); - if (VT == MVT::i1 && dyn_cast<ConstantSDNode>(N->getOperand(1)) && + if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) && InputVector.getOpcode() == ISD::BITCAST && - dyn_cast<ConstantSDNode>(InputVector.getOperand(0))) { + isa<ConstantSDNode>(InputVector.getOperand(0))) { uint64_t ExtractedElt = - cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); uint64_t InputValue = - cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue(); + cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue(); uint64_t Res = (InputValue >> ExtractedElt) & 1; return DAG.getConstant(Res, dl, MVT::i1); } @@ -22036,96 +24025,6 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match. 
-static std::pair<unsigned, bool> -matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS, - SelectionDAG &DAG, const X86Subtarget *Subtarget) { - if (!VT.isVector()) - return std::make_pair(0, false); - - bool NeedSplit = false; - switch (VT.getSimpleVT().SimpleTy) { - default: return std::make_pair(0, false); - case MVT::v4i64: - case MVT::v2i64: - if (!Subtarget->hasVLX()) - return std::make_pair(0, false); - break; - case MVT::v64i8: - case MVT::v32i16: - if (!Subtarget->hasBWI()) - return std::make_pair(0, false); - break; - case MVT::v16i32: - case MVT::v8i64: - if (!Subtarget->hasAVX512()) - return std::make_pair(0, false); - break; - case MVT::v32i8: - case MVT::v16i16: - case MVT::v8i32: - if (!Subtarget->hasAVX2()) - NeedSplit = true; - if (!Subtarget->hasAVX()) - return std::make_pair(0, false); - break; - case MVT::v16i8: - case MVT::v8i16: - case MVT::v4i32: - if (!Subtarget->hasSSE2()) - return std::make_pair(0, false); - } - - // SSE2 has only a small subset of the operations. - bool hasUnsigned = Subtarget->hasSSE41() || - (Subtarget->hasSSE2() && VT == MVT::v16i8); - bool hasSigned = Subtarget->hasSSE41() || - (Subtarget->hasSSE2() && VT == MVT::v8i16); - - ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); - - unsigned Opc = 0; - // Check for x CC y ? x : y. - if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && - DAG.isEqualTo(RHS, Cond.getOperand(1))) { - switch (CC) { - default: break; - case ISD::SETULT: - case ISD::SETULE: - Opc = hasUnsigned ? ISD::UMIN : 0; break; - case ISD::SETUGT: - case ISD::SETUGE: - Opc = hasUnsigned ? ISD::UMAX : 0; break; - case ISD::SETLT: - case ISD::SETLE: - Opc = hasSigned ? ISD::SMIN : 0; break; - case ISD::SETGT: - case ISD::SETGE: - Opc = hasSigned ? ISD::SMAX : 0; break; - } - // Check for x CC y ? y : x -- a min/max with reversed arms. - } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && - DAG.isEqualTo(RHS, Cond.getOperand(0))) { - switch (CC) { - default: break; - case ISD::SETULT: - case ISD::SETULE: - Opc = hasUnsigned ? ISD::UMAX : 0; break; - case ISD::SETUGT: - case ISD::SETUGE: - Opc = hasUnsigned ? ISD::UMIN : 0; break; - case ISD::SETLT: - case ISD::SETLE: - Opc = hasSigned ? ISD::SMAX : 0; break; - case ISD::SETGT: - case ISD::SETGE: - Opc = hasSigned ? ISD::SMIN : 0; break; - } - } - - return std::make_pair(Opc, NeedSplit); -} - static SDValue transformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG, const X86Subtarget *Subtarget) { @@ -22189,7 +24088,8 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // ignored in unsafe-math mode). // We also try to create v2f32 min/max nodes, which we later widen to v4f32. if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && - VT != MVT::f80 && (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && + VT != MVT::f80 && VT != MVT::f128 && + (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && (Subtarget->hasSSE2() || (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) { ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); @@ -22535,32 +24435,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } - // Try to match a min/max vector operation. 
- if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) { - std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget); - unsigned Opc = ret.first; - bool NeedSplit = ret.second; - - if (Opc && NeedSplit) { - unsigned NumElems = VT.getVectorNumElements(); - // Extract the LHS vectors - SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL); - SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL); - - // Extract the RHS vectors - SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL); - SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL); - - // Create min/max for each subvector - LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1); - RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2); - - // Merge the result - return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS); - } else if (Opc) - return DAG.getNode(Opc, DL, VT, LHS, RHS); - } - // Simplify vector selection if condition value type matches vselect // operand type if (N->getOpcode() == ISD::VSELECT && CondVT == VT) { @@ -22635,7 +24509,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() && !DCI.isBeforeLegalize() && !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { - unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits(); + unsigned BitWidth = Cond.getValueType().getScalarSizeInBits(); // Don't optimize vector selects that map to mask-registers. if (BitWidth == 1) @@ -22656,14 +24530,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // FIXME: We don't support i16-element blends currently. We could and // should support them by making *all* the bits in the condition be set // rather than just the high bit and using an i8-element blend. - if (VT.getScalarType() == MVT::i16) + if (VT.getVectorElementType() == MVT::i16) return SDValue(); // Dynamic blending was only available from SSE4.1 onward. 
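The blend lowering above only needs the sign bit of each condition element to be meaningful, because that is all a variable blend instruction looks at. A one-lane scalar model of pblendvb-style selection (helper name invented for illustration):

#include <cassert>
#include <cstdint>

uint8_t blend_byte(uint8_t x, uint8_t y, uint8_t mask) {
  return (mask & 0x80) ? y : x;   // the high bit of the mask byte picks the second source
}

int main() {
  assert(blend_byte(1, 2, 0x80) == 2);
  assert(blend_byte(1, 2, 0x7f) == 1);   // lower mask bits are ignored
  return 0;
}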
- if (VT.getSizeInBits() == 128 && !Subtarget->hasSSE41()) + if (VT.is128BitVector() && !Subtarget->hasSSE41()) return SDValue(); // Byte blends are only available in AVX2 - if (VT.getSizeInBits() == 256 && VT.getScalarType() == MVT::i8 && - !Subtarget->hasAVX2()) + if (VT == MVT::v32i8 && !Subtarget->hasAVX2()) return SDValue(); assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); @@ -22773,12 +24646,9 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { SetCC.getOpcode() == ISD::AND) { if (SetCC.getOpcode() == ISD::AND) { int OpIdx = -1; - ConstantSDNode *CS; - if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(0))) && - CS->getZExtValue() == 1) + if (isOneConstant(SetCC.getOperand(0))) OpIdx = 1; - if ((CS = dyn_cast<ConstantSDNode>(SetCC.getOperand(1))) && - CS->getZExtValue() == 1) + if (isOneConstant(SetCC.getOperand(1))) OpIdx = 0; if (OpIdx == -1) break; @@ -22857,8 +24727,7 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd) { if (Cond->getOpcode() == X86ISD::CMP) { - ConstantSDNode *CondOp1C = dyn_cast<ConstantSDNode>(Cond->getOperand(1)); - if (!CondOp1C || !CondOp1C->isNullValue()) + if (!isNullConstant(Cond->getOperand(1))) return false; Cond = Cond->getOperand(0); @@ -23102,106 +24971,15 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); - switch (IntNo) { - default: return SDValue(); - // SSE/AVX/AVX2 blend intrinsics. - case Intrinsic::x86_avx2_pblendvb: - // Don't try to simplify this intrinsic if we don't have AVX2. - if (!Subtarget->hasAVX2()) - return SDValue(); - // FALL-THROUGH - case Intrinsic::x86_avx_blendv_pd_256: - case Intrinsic::x86_avx_blendv_ps_256: - // Don't try to simplify this intrinsic if we don't have AVX. - if (!Subtarget->hasAVX()) - return SDValue(); - // FALL-THROUGH - case Intrinsic::x86_sse41_blendvps: - case Intrinsic::x86_sse41_blendvpd: - case Intrinsic::x86_sse41_pblendvb: { - SDValue Op0 = N->getOperand(1); - SDValue Op1 = N->getOperand(2); - SDValue Mask = N->getOperand(3); - - // Don't try to simplify this intrinsic if we don't have SSE4.1. - if (!Subtarget->hasSSE41()) - return SDValue(); - - // fold (blend A, A, Mask) -> A - if (Op0 == Op1) - return Op0; - // fold (blend A, B, allZeros) -> A - if (ISD::isBuildVectorAllZeros(Mask.getNode())) - return Op0; - // fold (blend A, B, allOnes) -> B - if (ISD::isBuildVectorAllOnes(Mask.getNode())) - return Op1; - - // Simplify the case where the mask is a constant i32 value. - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) { - if (C->isNullValue()) - return Op0; - if (C->isAllOnesValue()) - return Op1; - } - - return SDValue(); - } - - // Packed SSE2/AVX2 arithmetic shift immediate intrinsics. 
- case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_sse2_psrai_d: - case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx2_psrai_d: - case Intrinsic::x86_sse2_psra_w: - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_avx2_psra_w: - case Intrinsic::x86_avx2_psra_d: { - SDValue Op0 = N->getOperand(1); - SDValue Op1 = N->getOperand(2); - EVT VT = Op0.getValueType(); - assert(VT.isVector() && "Expected a vector type!"); - - if (isa<BuildVectorSDNode>(Op1)) - Op1 = Op1.getOperand(0); - - if (!isa<ConstantSDNode>(Op1)) - return SDValue(); - - EVT SVT = VT.getVectorElementType(); - unsigned SVTBits = SVT.getSizeInBits(); - - ConstantSDNode *CND = cast<ConstantSDNode>(Op1); - const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue()); - uint64_t ShAmt = C.getZExtValue(); - - // Don't try to convert this shift into a ISD::SRA if the shift - // count is bigger than or equal to the element size. - if (ShAmt >= SVTBits) - return SDValue(); - - // Trivial case: if the shift count is zero, then fold this - // into the first operand. - if (ShAmt == 0) - return Op0; - - // Replace this packed shift intrinsic with a target independent - // shift dag node. - SDLoc DL(N); - SDValue Splat = DAG.getConstant(C, DL, VT); - return DAG.getNode(ISD::SRA, DL, VT, Op0, Splat); - } - } -} - /// PerformMulCombine - Optimize a single multiply with constant into two /// in order to implement it with two cheaper instructions, e.g. /// LEA + SHL, LEA + LEA. static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + // An imul is usually smaller than the alternative sequence. + if (DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); @@ -23228,9 +25006,11 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, MulAmt1 = 3; MulAmt2 = MulAmt / 3; } + + SDLoc DL(N); + SDValue NewMul; if (MulAmt2 && (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ - SDLoc DL(N); if (isPowerOf2_64(MulAmt2) && !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) @@ -23239,7 +25019,6 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, // is an add. std::swap(MulAmt1, MulAmt2); - SDValue NewMul; if (isPowerOf2_64(MulAmt1)) NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); @@ -23253,10 +25032,31 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, else NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, DAG.getConstant(MulAmt2, DL, VT)); + } + + if (!NewMul) { + assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) + && "Both cases that could cause potential overflows should have " + "already been handled."); + if (isPowerOf2_64(MulAmt - 1)) + // (mul x, 2^N + 1) => (add (shl x, N), x) + NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), + DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt - 1), DL, + MVT::i8))); + else if (isPowerOf2_64(MulAmt + 1)) + // (mul x, 2^N - 1) => (sub (shl x, N), x) + NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, + N->getOperand(0), + DAG.getConstant(Log2_64(MulAmt + 1), + DL, MVT::i8)), N->getOperand(0)); + } + + if (NewMul) // Do not add new nodes to DAG combiner worklist. 
DCI.CombineTo(N, NewMul, false); - } + return SDValue(); } @@ -23272,18 +25072,34 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { N1C && N0.getOpcode() == ISD::AND && N0.getOperand(1).getOpcode() == ISD::Constant) { SDValue N00 = N0.getOperand(0); - if (N00.getOpcode() == X86ISD::SETCC_CARRY || - ((N00.getOpcode() == ISD::ANY_EXTEND || - N00.getOpcode() == ISD::ZERO_EXTEND) && - N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { - APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); - APInt ShAmt = N1C->getAPIntValue(); - Mask = Mask.shl(ShAmt); - if (Mask != 0) { - SDLoc DL(N); - return DAG.getNode(ISD::AND, DL, VT, - N00, DAG.getConstant(Mask, DL, VT)); - } + APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); + APInt ShAmt = N1C->getAPIntValue(); + Mask = Mask.shl(ShAmt); + bool MaskOK = false; + // We can handle cases concerning bit-widening nodes containing setcc_c if + // we carefully interrogate the mask to make sure we are semantics + // preserving. + // The transform is not safe if the result of C1 << C2 exceeds the bitwidth + // of the underlying setcc_c operation if the setcc_c was zero extended. + // Consider the following example: + // zext(setcc_c) -> i32 0x0000FFFF + // c1 -> i32 0x0000FFFF + // c2 -> i32 0x00000001 + // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE + // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE + if (N00.getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if (N00.getOpcode() == ISD::SIGN_EXTEND && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = true; + } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || + N00.getOpcode() == ISD::ANY_EXTEND) && + N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { + MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); + } + if (MaskOK && Mask != 0) { + SDLoc DL(N); + return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); } } @@ -23304,6 +25120,59 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformSRACombine(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + unsigned Size = VT.getSizeInBits(); + + // fold (ashr (shl, a, [56,48,32,24,16]), SarConst) + // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or + // into (lshr, (sext (a), SarConst - [56,48,32,24,16])) + // depending on sign of (SarConst - [56,48,32,24,16]) + + // sexts in X86 are MOVs. The MOVs have the same code size + // as above SHIFTs (only SHIFT on 1 has lower code size). + // However the MOVs have 2 advantages to a SHIFT: + // 1. MOVs can write to a register that differs from source + // 2. 
MOVs accept memory operands + + if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant || + N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || + N0.getOperand(1).getOpcode() != ISD::Constant) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + SDValue N01 = N0.getOperand(1); + APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue(); + APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue(); + EVT CVT = N1.getValueType(); + + if (SarConst.isNegative()) + return SDValue(); + + for (MVT SVT : MVT::integer_valuetypes()) { + unsigned ShiftSize = SVT.getSizeInBits(); + // skipping types without corresponding sext/zext and + // ShlConst that is not one of [56,48,32,24,16] + if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize) + continue; + SDLoc DL(N); + SDValue NN = + DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); + SarConst = SarConst - (Size - ShiftSize); + if (SarConst == 0) + return NN; + else if (SarConst.isNegative()) + return DAG.getNode(ISD::SHL, DL, VT, NN, + DAG.getConstant(-SarConst, DL, CVT)); + else + return DAG.getNode(ISD::SRA, DL, VT, NN, + DAG.getConstant(SarConst, DL, CVT)); + } + return SDValue(); +} + /// \brief Returns a vector of 0s if the node in input is a vector logical /// shift by a constant amount which is known to be bigger than or equal /// to the vector element size in bits. @@ -23321,14 +25190,15 @@ static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG, if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt)) if (auto *AmtSplat = AmtBV->getConstantSplatNode()) { APInt ShiftAmt = AmtSplat->getAPIntValue(); - unsigned MaxAmount = VT.getVectorElementType().getSizeInBits(); + unsigned MaxAmount = + VT.getSimpleVT().getVectorElementType().getSizeInBits(); // SSE2/AVX2 logical shifts always return a vector of 0s // if the shift amount is bigger than or equal to // the element size. The constant shift amount will be // encoded as a 8-bit immediate. if (ShiftAmt.trunc(8).uge(MaxAmount)) - return getZeroVector(VT, Subtarget, DAG, DL); + return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL); } return SDValue(); @@ -23342,6 +25212,10 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, if (SDValue V = PerformSHLCombine(N, DAG)) return V; + if (N->getOpcode() == ISD::SRA) + if (SDValue V = PerformSRACombine(N, DAG)) + return V; + // Try to fold this logical shift into a zero vector. if (N->getOpcode() != ISD::SRA) if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget)) @@ -23537,7 +25411,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, // Set N0 and N1 to hold the inputs to the new wide operation. 
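Both PerformMulCombine and PerformSRACombine above rest on small integer identities: a multiply by 2^N +/- 1 becomes a shift plus an add or sub, and an arithmetic shift right that undoes a left shift is a sign extension of the remaining low bits. A quick scalar check of both (helper names invented; assumes two's-complement wrapping and an arithmetic right shift, as on x86):

#include <cassert>
#include <cstdint>

uint32_t mul9(uint32_t x) { return (x << 3) + x; }   // x * (2^3 + 1)
uint32_t mul7(uint32_t x) { return (x << 3) - x; }   // x * (2^3 - 1)

int32_t sext_low_byte(int32_t x) {
  // (sra (shl x, 24), 24) == sign-extend of the low 8 bits, which the combine
  // turns into a SIGN_EXTEND_INREG (a movsx-style extension) instead.
  return (int32_t)((uint32_t)x << 24) >> 24;
}

int main() {
  for (uint32_t v : {0u, 1u, 123u, 0xdeadbeefu}) {
    assert(mul9(v) == v * 9u);
    assert(mul7(v) == v * 7u);
  }
  for (int32_t v : {0, 1, 127, 128, 255, -1, 4096})
    assert(sext_low_byte(v) == (int32_t)(int8_t)(uint8_t)v);
  return 0;
}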
N0 = N0->getOperand(0); if (RHSConstSplat) { - N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(), + N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(), SDValue(RHSConstSplat, 0)); SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1); N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C); @@ -23552,9 +25426,9 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG, case ISD::ANY_EXTEND: return Op; case ISD::ZERO_EXTEND: { - unsigned InBits = NarrowVT.getScalarType().getSizeInBits(); + unsigned InBits = NarrowVT.getScalarSizeInBits(); APInt Mask = APInt::getAllOnesValue(InBits); - Mask = Mask.zext(VT.getScalarType().getSizeInBits()); + Mask = Mask.zext(VT.getScalarSizeInBits()); return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(Mask, DL, VT)); } @@ -23656,6 +25530,41 @@ static SDValue VectorZextCombine(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(N0.getValueType(), NewShuffle); } +/// If both input operands of a logic op are being cast from floating point +/// types, try to convert this into a floating point logic node to avoid +/// unnecessary moves from SSE to integer registers. +static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + unsigned FPOpcode = ISD::DELETED_NODE; + if (N->getOpcode() == ISD::AND) + FPOpcode = X86ISD::FAND; + else if (N->getOpcode() == ISD::OR) + FPOpcode = X86ISD::FOR; + else if (N->getOpcode() == ISD::XOR) + FPOpcode = X86ISD::FXOR; + + assert(FPOpcode != ISD::DELETED_NODE && + "Unexpected input node for FP logic conversion"); + + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + SDLoc DL(N); + if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && + ((Subtarget->hasSSE1() && VT == MVT::i32) || + (Subtarget->hasSSE2() && VT == MVT::i64))) { + SDValue N00 = N0.getOperand(0); + SDValue N10 = N1.getOperand(0); + EVT N00Type = N00.getValueType(); + EVT N10Type = N10.getValueType(); + if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) { + SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); + return DAG.getBitcast(VT, FPLogic); + } + } + return SDValue(); +} + static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -23668,6 +25577,9 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG, if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -23728,6 +25640,9 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget)) return R; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); @@ -23799,7 +25714,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, if (!Subtarget->hasSSE41()) return SDValue(); - EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8; + MVT BlendVT = (VT == MVT::v4i64) ? 
MVT::v32i8 : MVT::v16i8; X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); @@ -23813,9 +25728,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - MachineFunction &MF = DAG.getMachineFunction(); - bool OptForSize = - MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize); + bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); // SHLD/SHRD instructions have lower register pressure, but on some // platforms they have higher latency than the equivalent @@ -23913,17 +25826,188 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// PerformXorCombine - Attempts to turn XOR nodes into BLSMSK nodes +// Try to turn tests against the signbit in the form of: +// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) +// into: +// SETGT(X, -1) +static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { + // This is only worth doing if the output type is i8. + if (N->getValueType(0) != MVT::i8) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + // We should be performing an xor against a truncated shift. + if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) + return SDValue(); + + // Make sure we are performing an xor against one. + if (!isOneConstant(N1)) + return SDValue(); + + // SetCC on x86 zero extends so only act on this if it's a logical shift. + SDValue Shift = N0.getOperand(0); + if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) + return SDValue(); + + // Make sure we are truncating from one of i16, i32 or i64. + EVT ShiftTy = Shift.getValueType(); + if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) + return SDValue(); + + // Make sure the shift amount extracts the sign bit. + if (!isa<ConstantSDNode>(Shift.getOperand(1)) || + Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1) + return SDValue(); + + // Create a greater-than comparison against -1. + // N.B. Using SETGE against 0 works but we want a canonical looking + // comparison, using SETGT matches up with what TranslateX86CC. + SDLoc DL(N); + SDValue ShiftOp = Shift.getOperand(0); + EVT ShiftOpTy = ShiftOp.getValueType(); + SDValue Cond = DAG.getSetCC(DL, MVT::i8, ShiftOp, + DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); + return Cond; +} + static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) + return RV; + if (Subtarget->hasCMov()) if (SDValue RV = performIntegerAbsCombine(N, DAG)) return RV; + if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) + return FPLogic; + + return SDValue(); +} + +/// This function detects the AVG pattern between vectors of unsigned i8/i16, +/// which is c = (a + b + 1) / 2, and replace this operation with the efficient +/// X86ISD::AVG instruction. 
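detectAVGPattern, whose documentation closes the chunk above, recognizes the widen / add / add-one / shift-right / truncate idiom. Written out for a single unsigned 8-bit lane (helper name invented for illustration), the sequence is exactly the rounding average that pavgb produces in one instruction:

#include <cassert>
#include <cstdint>

uint8_t avg_round_up(uint8_t a, uint8_t b) {
  uint32_t wide = (uint32_t)a + (uint32_t)b + 1;  // zext + add nuw nsw + add 1
  return (uint8_t)(wide >> 1);                    // lshr by 1 + trunc
}

int main() {
  assert(avg_round_up(254, 255) == 255);  // no intermediate overflow
  assert(avg_round_up(0, 1) == 1);        // ties round up
  assert(avg_round_up(10, 20) == 15);
  return 0;
}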
+static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, + const X86Subtarget *Subtarget, SDLoc DL) { + if (!VT.isVector() || !VT.isSimple()) + return SDValue(); + EVT InVT = In.getValueType(); + unsigned NumElems = VT.getVectorNumElements(); + + EVT ScalarVT = VT.getVectorElementType(); + if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && + isPowerOf2_32(NumElems))) + return SDValue(); + + // InScalarVT is the intermediate type in AVG pattern and it should be greater + // than the original input type (i8/i16). + EVT InScalarVT = InVT.getVectorElementType(); + if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits()) + return SDValue(); + + if (Subtarget->hasAVX512()) { + if (VT.getSizeInBits() > 512) + return SDValue(); + } else if (Subtarget->hasAVX2()) { + if (VT.getSizeInBits() > 256) + return SDValue(); + } else { + if (VT.getSizeInBits() > 128) + return SDValue(); + } + + // Detect the following pattern: + // + // %1 = zext <N x i8> %a to <N x i32> + // %2 = zext <N x i8> %b to <N x i32> + // %3 = add nuw nsw <N x i32> %1, <i32 1 x N> + // %4 = add nuw nsw <N x i32> %3, %2 + // %5 = lshr <N x i32> %N, <i32 1 x N> + // %6 = trunc <N x i32> %5 to <N x i8> + // + // In AVX512, the last instruction can also be a trunc store. + + if (In.getOpcode() != ISD::SRL) + return SDValue(); + + // A lambda checking the given SDValue is a constant vector and each element + // is in the range [Min, Max]. + auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { + BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V); + if (!BV || !BV->isConstant()) + return false; + for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i)); + if (!C) + return false; + uint64_t Val = C->getZExtValue(); + if (Val < Min || Val > Max) + return false; + } + return true; + }; + + // Check if each element of the vector is left-shifted by one. + auto LHS = In.getOperand(0); + auto RHS = In.getOperand(1); + if (!IsConstVectorInRange(RHS, 1, 1)) + return SDValue(); + if (LHS.getOpcode() != ISD::ADD) + return SDValue(); + + // Detect a pattern of a + b + 1 where the order doesn't matter. + SDValue Operands[3]; + Operands[0] = LHS.getOperand(0); + Operands[1] = LHS.getOperand(1); + + // Take care of the case when one of the operands is a constant vector whose + // element is in the range [1, 256]. + if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && + Operands[0].getOpcode() == ISD::ZERO_EXTEND && + Operands[0].getOperand(0).getValueType() == VT) { + // The pattern is detected. Subtract one from the constant vector, then + // demote it and emit X86ISD::AVG instruction. + SDValue One = DAG.getConstant(1, DL, InScalarVT); + SDValue Ones = DAG.getNode(ISD::BUILD_VECTOR, DL, InVT, + SmallVector<SDValue, 8>(NumElems, One)); + Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], Ones); + Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]); + return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), + Operands[1]); + } + + if (Operands[0].getOpcode() == ISD::ADD) + std::swap(Operands[0], Operands[1]); + else if (Operands[1].getOpcode() != ISD::ADD) + return SDValue(); + Operands[2] = Operands[1].getOperand(0); + Operands[1] = Operands[1].getOperand(1); + + // Now we have three operands of two additions. Check that one of them is a + // constant vector with ones, and the other two are promoted from i8/i16. 
+ for (int i = 0; i < 3; ++i) { + if (!IsConstVectorInRange(Operands[i], 1, 1)) + continue; + std::swap(Operands[i], Operands[2]); + + // Check if Operands[0] and Operands[1] are results of type promotion. + for (int j = 0; j < 2; ++j) + if (Operands[j].getOpcode() != ISD::ZERO_EXTEND || + Operands[j].getOperand(0).getValueType() != VT) + return SDValue(); + + // The pattern is detected, emit X86ISD::AVG instruction. + return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0), + Operands[1].getOperand(0)); + } + return SDValue(); } @@ -23940,10 +26024,13 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, // For chips with slow 32-byte unaligned loads, break the 32-byte operation // into two 16-byte operations. ISD::LoadExtType Ext = Ld->getExtensionType(); + bool Fast; + unsigned AddressSpace = Ld->getAddressSpace(); unsigned Alignment = Ld->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; - if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && - !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { + if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && + Ext == ISD::NON_EXTLOAD && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, + AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = RegVT.getVectorNumElements(); if (NumElems < 2) return SDValue(); @@ -24012,8 +26099,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. - assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) - && "WideVecVT should be legal"); + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, DAG.getUNDEF(WideVecVT), &ShuffleVec[0]); } @@ -24026,8 +26113,8 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) ShuffleVec[i] = i * SizeRatio; - for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i) - ShuffleVec[i] = NumElems*SizeRatio; + for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i) + ShuffleVec[i] = NumElems * SizeRatio; NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask, DAG.getConstant(0, dl, WideVecVT), &ShuffleVec[0]); @@ -24055,7 +26142,6 @@ static SDValue PerformMLOADCombine(SDNode *N, SelectionDAG &DAG, ISD::NON_EXTLOAD); SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd); return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); - } /// PerformMSTORECombine - Resolve truncating stores static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, @@ -24073,6 +26159,15 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. 
+ if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + // From, To sizes and ElemCount must be pow of two assert (isPowerOf2_32(NumElems * FromSz * ToSz) && "Unexpected size for truncating masked store"); @@ -24096,12 +26191,12 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, ShuffleVec[i] = i * SizeRatio; // Can't shuffle using an illegal type. - assert (DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) - && "WideVecVT should be legal"); + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec, - DAG.getUNDEF(WideVecVT), - &ShuffleVec[0]); + DAG.getUNDEF(WideVecVT), + &ShuffleVec[0]); SDValue NewMask; SDValue Mask = Mst->getMask(); @@ -24133,8 +26228,9 @@ static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG, NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } - return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, Mst->getBasePtr(), - NewMask, StVT, Mst->getMemOperand(), false); + return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal, + Mst->getBasePtr(), NewMask, StVT, + Mst->getMemOperand(), false); } /// PerformSTORECombine - Do target-specific dag combines on STORE nodes. static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, @@ -24148,10 +26244,12 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // If we are saving a concatenation of two XMM registers and 32-byte stores // are slow, such as on Sandy Bridge, perform two 16-byte stores. + bool Fast; + unsigned AddressSpace = St->getAddressSpace(); unsigned Alignment = St->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; - if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() && - StVT == VT && !IsAligned) { + if (VT.is256BitVector() && StVT == VT && + TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, + AddressSpace, Alignment, &Fast) && !Fast) { unsigned NumElems = VT.getVectorNumElements(); if (NumElems < 2) return SDValue(); @@ -24178,12 +26276,29 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // First, pack all of the elements in one place. Next, store to memory // in fewer chunks. if (St->isTruncatingStore() && VT.isVector()) { + // Check if we can detect an AVG pattern from the truncation. If yes, + // replace the trunc store by a normal store with the result of X86ISD::AVG + // instruction. + SDValue Avg = + detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl); + if (Avg.getNode()) + return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromSz = VT.getVectorElementType().getSizeInBits(); unsigned ToSz = StVT.getVectorElementType().getSizeInBits(); + // The truncating store is legal in some cases. For example + // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw + // are designated for truncate store. + // In this case we don't need any further transformations. + if (TLI.isTruncStoreLegal(VT, StVT)) + return SDValue(); + // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue(); // We are going to use the original vector elt for storing. 
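The truncating-store path above packs the narrow elements to the front of a wider vector with a shuffle and then stores one contiguous chunk, instead of storing each truncated element separately. A scalar model of that rewrite for v8i32 -> v8i8 (little-endian lane layout assumed, as on x86; names invented for illustration):

#include <cassert>
#include <cstdint>
#include <cstring>

void trunc_store_v8i32_to_v8i8(const uint32_t src[8], uint8_t dst[8]) {
  uint8_t bytes[32];
  std::memcpy(bytes, src, 32);        // view the wide vector as 32 bytes
  uint8_t gathered[8];
  for (int i = 0; i < 8; ++i)
    gathered[i] = bytes[i * 4];       // the shuffle: take byte i * SizeRatio
  std::memcpy(dst, gathered, 8);      // one narrow store of the packed result
}

int main() {
  uint32_t v[8] = {0x11, 0x2222, 0x333333, 0x44444444, 5, 6, 7, 8};
  uint8_t out[8];
  trunc_store_v8i32_to_v8i8(v, out);
  for (int i = 0; i < 8; ++i)
    assert(out[i] == (uint8_t)v[i]);
  return 0;
}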
@@ -24306,7 +26421,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store // pair instead. if (Subtarget->is64Bit() || F64IsLegal) { - EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; + MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), @@ -24539,8 +26654,234 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS. +static SDValue +combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG, + SmallVector<SDValue, 8> &Regs) { + assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 || + Regs[0].getValueType() == MVT::v2i64)); + EVT OutVT = N->getValueType(0); + EVT OutSVT = OutVT.getVectorElementType(); + EVT InVT = Regs[0].getValueType(); + EVT InSVT = InVT.getVectorElementType(); + SDLoc DL(N); + + // First, use mask to unset all bits that won't appear in the result. + assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) && + "OutSVT can only be either i8 or i16."); + SDValue MaskVal = + DAG.getConstant(OutSVT == MVT::i8 ? 0xFF : 0xFFFF, DL, InSVT); + SDValue MaskVec = DAG.getNode( + ISD::BUILD_VECTOR, DL, InVT, + SmallVector<SDValue, 8>(InVT.getVectorNumElements(), MaskVal)); + for (auto &Reg : Regs) + Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVec, Reg); + + MVT UnpackedVT, PackedVT; + if (OutSVT == MVT::i8) { + UnpackedVT = MVT::v8i16; + PackedVT = MVT::v16i8; + } else { + UnpackedVT = MVT::v4i32; + PackedVT = MVT::v8i16; + } + + // In each iteration, truncate the type by a half size. + auto RegNum = Regs.size(); + for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits(); + j < e; j *= 2, RegNum /= 2) { + for (unsigned i = 0; i < RegNum; i++) + Regs[i] = DAG.getNode(ISD::BITCAST, DL, UnpackedVT, Regs[i]); + for (unsigned i = 0; i < RegNum / 2; i++) + Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2], + Regs[i * 2 + 1]); + } + + // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and + // then extract a subvector as the result since v8i8 is not a legal type. + if (OutVT == MVT::v8i8) { + Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]); + Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0], + DAG.getIntPtrConstant(0, DL)); + return Regs[0]; + } else if (RegNum > 1) { + Regs.resize(RegNum); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); + } else + return Regs[0]; +} + +/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. +static SDValue +combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG, + SmallVector<SDValue, 8> &Regs) { + assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32); + EVT OutVT = N->getValueType(0); + SDLoc DL(N); + + // Shift left by 16 bits, then arithmetic-shift right by 16 bits. 
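combineVectorTruncationWithPACKSS shifts each i32 lane left and then arithmetically right by 16 before packing. The reason is that packssdw saturates to the signed i16 range; pre-sign-extending the low 16 bits guarantees the value is in range, so the saturating pack degenerates into a plain truncation. A scalar check (helper names invented; assumes two's-complement wrapping, as on x86):

#include <cassert>
#include <cstdint>

int16_t packss_lane(int32_t x) {                 // one lane of packssdw
  if (x > INT16_MAX) return INT16_MAX;
  if (x < INT16_MIN) return INT16_MIN;
  return (int16_t)x;
}

int16_t truncate_via_packss(int32_t x) {
  int32_t sext16 = (int32_t)((uint32_t)x << 16) >> 16;  // vpslld 16 ; vpsrad 16
  return packss_lane(sext16);
}

int main() {
  for (int32_t v : {0, 1, -1, 32767, -32768, 65535, 0x12348765, -123456})
    assert(truncate_via_packss(v) == (int16_t)v);
  return 0;
}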
+ SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32); + for (auto &Reg : Regs) { + Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG); + Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG); + } + + for (unsigned i = 0, e = Regs.size() / 2; i < e; i++) + Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2], + Regs[i * 2 + 1]); + + if (Regs.size() > 2) { + Regs.resize(Regs.size() / 2); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs); + } else + return Regs[0]; +} + +/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into +/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type +/// legalization the truncation will be translated into a BUILD_VECTOR with each +/// element that is extracted from a vector and then truncated, and it is +/// diffcult to do this optimization based on them. +static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT OutVT = N->getValueType(0); + if (!OutVT.isVector()) + return SDValue(); + + SDValue In = N->getOperand(0); + if (!In.getValueType().isSimple()) + return SDValue(); + + EVT InVT = In.getValueType(); + unsigned NumElems = OutVT.getVectorNumElements(); + + // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on + // SSE2, and we need to take care of it specially. + // AVX512 provides vpmovdb. + if (!Subtarget->hasSSE2() || Subtarget->hasAVX2()) + return SDValue(); + + EVT OutSVT = OutVT.getVectorElementType(); + EVT InSVT = InVT.getVectorElementType(); + if (!((InSVT == MVT::i32 || InSVT == MVT::i64) && + (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) && + NumElems >= 8)) + return SDValue(); + + // SSSE3's pshufb results in less instructions in the cases below. + if (Subtarget->hasSSSE3() && NumElems == 8 && + ((OutSVT == MVT::i8 && InSVT != MVT::i64) || + (InSVT == MVT::i32 && OutSVT == MVT::i16))) + return SDValue(); + + SDLoc DL(N); + + // Split a long vector into vectors of legal type. + unsigned RegNum = InVT.getSizeInBits() / 128; + SmallVector<SDValue, 8> SubVec(RegNum); + if (InSVT == MVT::i32) { + for (unsigned i = 0; i < RegNum; i++) + SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, + DAG.getIntPtrConstant(i * 4, DL)); + } else { + for (unsigned i = 0; i < RegNum; i++) + SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, + DAG.getIntPtrConstant(i * 2, DL)); + } + + // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PAKCUS + // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to + // truncate 2 x v4i32 to v8i16. + if (Subtarget->hasSSE41() || OutSVT == MVT::i8) + return combineVectorTruncationWithPACKUS(N, DAG, SubVec); + else if (InSVT == MVT::i32) + return combineVectorTruncationWithPACKSS(N, DAG, SubVec); + else + return SDValue(); +} + +static SDValue PerformTRUNCATECombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // Try to detect AVG pattern first. + SDValue Avg = detectAVGPattern(N->getOperand(0), N->getValueType(0), DAG, + Subtarget, SDLoc(N)); + if (Avg.getNode()) + return Avg; + + return combineVectorTruncation(N, DAG, Subtarget); +} + +/// Do target-specific dag combines on floating point negations. 
+static SDValue PerformFNEGCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); + SDValue Arg = N->getOperand(0); + SDLoc DL(N); + + // Let legalize expand this if it isn't a legal type yet. + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + + // If we're negating a FMUL node on a target with FMA, then we can avoid the + // use of a constant by performing (-0 - A*B) instead. + // FIXME: Check rounding control flags as well once it becomes available. + if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && + Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) { + SDValue Zero = DAG.getConstantFP(0.0, DL, VT); + return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Zero); + } + + // If we're negating a FMA node, then we can adjust the + // instruction to include the extra negation. + if (Arg.hasOneUse()) { + switch (Arg.getOpcode()) { + case X86ISD::FMADD: + return DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FMSUB: + return DAG.getNode(X86ISD::FNMADD, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FNMADD: + return DAG.getNode(X86ISD::FMSUB, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + case X86ISD::FNMSUB: + return DAG.getNode(X86ISD::FMADD, DL, VT, Arg.getOperand(0), + Arg.getOperand(1), Arg.getOperand(2)); + } + } + return SDValue(); +} + +static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (VT.is512BitVector() && !Subtarget->hasDQI()) { + // VXORPS, VORPS, VANDPS, VANDNPS are supported only under DQ extention. + // These logic operations may be executed in the integer domain. + SDLoc dl(N); + MVT IntScalar = MVT::getIntegerVT(VT.getScalarSizeInBits()); + MVT IntVT = MVT::getVectorVT(IntScalar, VT.getVectorNumElements()); + + SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(0)); + SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, IntVT, N->getOperand(1)); + unsigned IntOpcode = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected FP logic op"); + case X86ISD::FOR: IntOpcode = ISD::OR; break; + case X86ISD::FXOR: IntOpcode = ISD::XOR; break; + case X86ISD::FAND: IntOpcode = ISD::AND; break; + case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; + } + SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); + return DAG.getNode(ISD::BITCAST, dl, VT, IntOp); + } + return SDValue(); +} /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. -static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); // F[X]OR(0.0, x) -> x @@ -24552,7 +26893,8 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) if (C->getValueAPF().isPosZero()) return N->getOperand(0); - return SDValue(); + + return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. 
@@ -24576,8 +26918,65 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { N->getOperand(0), N->getOperand(1)); } +static SDValue performFMinNumFMaxNumCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + if (Subtarget->useSoftFloat()) + return SDValue(); + + // TODO: Check for global or instruction-level "nnan". In that case, we + // should be able to lower to FMAX/FMIN alone. + // TODO: If an operand is already known to be a NaN or not a NaN, this + // should be an optional swap and FMAX/FMIN. + + EVT VT = N->getValueType(0); + if (!((Subtarget->hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) || + (Subtarget->hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) || + (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64)))) + return SDValue(); + + // This takes at least 3 instructions, so favor a library call when operating + // on a scalar and minimizing code size. + if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize()) + return SDValue(); + + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDLoc DL(N); + EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType( + DAG.getDataLayout(), *DAG.getContext(), VT); + + // There are 4 possibilities involving NaN inputs, and these are the required + // outputs: + // Op1 + // Num NaN + // ---------------- + // Num | Max | Op0 | + // Op0 ---------------- + // NaN | Op1 | NaN | + // ---------------- + // + // The SSE FP max/min instructions were not designed for this case, but rather + // to implement: + // Min = Op1 < Op0 ? Op1 : Op0 + // Max = Op1 > Op0 ? Op1 : Op0 + // + // So they always return Op0 if either input is a NaN. However, we can still + // use those instructions for fmaxnum by selecting away a NaN input. + + // If either operand is NaN, the 2nd source operand (Op0) is passed through. + auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; + SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); + SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO); + + // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands + // are NaN, the NaN value of Op1 is the result. + auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; + return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); +} + /// Do target-specific dag combines on X86ISD::FAND nodes. 
-static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FAND(0.0, x) -> 0.0 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -24588,11 +26987,12 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } /// Do target-specific dag combines on X86ISD::FANDN nodes -static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { // FANDN(0.0, x) -> x if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) if (C->getValueAPF().isPosZero()) @@ -24603,7 +27003,7 @@ static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) { if (C->getValueAPF().isPosZero()) return N->getOperand(1); - return SDValue(); + return lowerX86FPLogicOp(N, DAG, Subtarget); } static SDValue PerformBTCombine(SDNode *N, @@ -24673,6 +27073,57 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +/// sext(add_nsw(x, C)) --> add(sext(x), C_sext) +/// Promoting a sign extension ahead of an 'add nsw' exposes opportunities +/// to combine math ops, use an LEA, or use a complex addressing mode. This can +/// eliminate extend, add, and shift instructions. +static SDValue promoteSextBeforeAddNSW(SDNode *Sext, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + // TODO: This should be valid for other integer types. + EVT VT = Sext->getValueType(0); + if (VT != MVT::i64) + return SDValue(); + + // We need an 'add nsw' feeding into the 'sext'. + SDValue Add = Sext->getOperand(0); + if (Add.getOpcode() != ISD::ADD || !Add->getFlags()->hasNoSignedWrap()) + return SDValue(); + + // Having a constant operand to the 'add' ensures that we are not increasing + // the instruction count because the constant is extended for free below. + // A constant operand can also become the displacement field of an LEA. + auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1)); + if (!AddOp1) + return SDValue(); + + // Don't make the 'add' bigger if there's no hope of combining it with some + // other 'add' or 'shl' instruction. + // TODO: It may be profitable to generate simpler LEA instructions in place + // of single 'add' instructions, but the cost model for selecting an LEA + // currently has a high threshold. + bool HasLEAPotential = false; + for (auto *User : Sext->uses()) { + if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { + HasLEAPotential = true; + break; + } + } + if (!HasLEAPotential) + return SDValue(); + + // Everything looks good, so pull the 'sext' ahead of the 'add'. + int64_t AddConstant = AddOp1->getSExtValue(); + SDValue AddOp0 = Add.getOperand(0); + SDValue NewSext = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Sext), VT, AddOp0); + SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); + + // The wider add is guaranteed to not wrap because both operands are + // sign-extended. 
+ SDNodeFlags Flags; + Flags.setNoSignedWrap(true); + return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewSext, NewConstant, &Flags); +} + static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget *Subtarget) { @@ -24763,13 +27214,13 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG, } } - if (!Subtarget->hasFp256()) - return SDValue(); - - if (VT.isVector() && VT.getSizeInBits() == 256) + if (Subtarget->hasAVX() && VT.is256BitVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; + if (SDValue NewAdd = promoteSextBeforeAddNSW(N, DAG, Subtarget)) + return NewAdd; + return SDValue(); } @@ -24783,9 +27234,7 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, return SDValue(); EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || - (!Subtarget->hasFMA() && !Subtarget->hasFMA4() && - !Subtarget->hasAVX512())) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasAnyFMA()) return SDValue(); SDValue A = N->getOperand(0); @@ -24830,8 +27279,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG, N0.getOperand(0).hasOneUse()) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == X86ISD::SETCC_CARRY) { - ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); - if (!C || C->getZExtValue() != 1) + if (!isOneConstant(N0.getOperand(1))) return SDValue(); return DAG.getNode(ISD::AND, dl, VT, DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, @@ -24884,21 +27332,19 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB) - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0))) - if (C->getAPIntValue() == 0 && LHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, - LHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS, + LHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB) - if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0))) - if (C->getAPIntValue() == 0 && RHS.hasOneUse()) { - SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, - RHS.getOperand(1)); - return DAG.getSetCC(DL, N->getValueType(0), addV, - DAG.getConstant(0, DL, addV.getValueType()), CC); - } + if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) { + SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS, + RHS.getOperand(1)); + return DAG.getSetCC(DL, N->getValueType(0), addV, + DAG.getConstant(0, DL, addV.getValueType()), CC); + } if (VT.getScalarType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { @@ -24936,52 +27382,6 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, - SelectionDAG &DAG) { - SDLoc dl(Load); - MVT VT = Load->getSimpleValueType(0); - MVT EVT = VT.getVectorElementType(); - SDValue Addr = Load->getOperand(1); - SDValue NewAddr = DAG.getNode( - ISD::ADD, dl, Addr.getSimpleValueType(), Addr, - DAG.getConstant(Index * EVT.getStoreSize(), dl, - 
Addr.getSimpleValueType())); - - SDValue NewLoad = - DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, - DAG.getMachineFunction().getMachineMemOperand( - Load->getMemOperand(), 0, EVT.getStoreSize())); - return NewLoad; -} - -static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, - const X86Subtarget *Subtarget) { - SDLoc dl(N); - MVT VT = N->getOperand(1)->getSimpleValueType(0); - assert((VT == MVT::v4f32 || VT == MVT::v4i32) && - "X86insertps is only defined for v4x32"); - - SDValue Ld = N->getOperand(1); - if (MayFoldLoad(Ld)) { - // Extract the countS bits from the immediate so we can get the proper - // address when narrowing the vector load to a specific element. - // When the second source op is a memory address, insertps doesn't use - // countS and just gets an f32 from that address. - unsigned DestIndex = - cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6; - - Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG); - - // Create this as a scalar to vector to match the instruction pattern. - SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld); - // countS bits are ignored when loading from memory on insertps, which - // means we don't need to explicitly set them to 0. - return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0), - LoadScalarToVector, N->getOperand(2)); - } - return SDValue(); -} - static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) { SDValue V0 = N->getOperand(0); SDValue V1 = N->getOperand(1); @@ -25008,6 +27408,20 @@ static SDValue PerformBLENDICombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformGatherScatterCombine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + // Gather and Scatter instructions use k-registers for masks. The type of + // the masks is v*i1. So the mask will be truncated anyway. + // The SIGN_EXTEND_INREG my be dropped. + SDValue Mask = N->getOperand(2); + if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) { + SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); + NewOps[2] = Mask.getOperand(0); + DAG.UpdateNodeOperands(N, NewOps); + } + return SDValue(); +} + // Helper function of PerformSETCCCombine. It is to materialize "setb reg" // as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. @@ -25182,7 +27596,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. - if (Op0.getOpcode() == ISD::LOAD) { + if (!Subtarget->useSoftFloat() && Op0.getOpcode() == ISD::LOAD) { LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); EVT LdVT = Ld->getValueType(0); @@ -25357,15 +27771,14 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, } // Check if we can bypass extracting and re-inserting an element of an input - // vector. Essentialy: + // vector. 
Essentially: // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) if (V.getOpcode() == ISD::SCALAR_TO_VECTOR && V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) { SDValue ExtractedV = V.getOperand(0); SDValue OrigV = ExtractedV.getOperand(0); - if (auto *ExtractIdx = dyn_cast<ConstantSDNode>(ExtractedV.getOperand(1))) - if (ExtractIdx->getZExtValue() == 0) { + if (isNullConstant(ExtractedV.getOperand(1))) { MVT OrigVT = OrigV.getSimpleValueType(); // Extract a subvector if necessary... if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) { @@ -25394,7 +27807,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SELECT: case X86ISD::SHRUNKBLEND: return PerformSELECTCombine(N, DAG, DCI, Subtarget); - case ISD::BITCAST: return PerformBITCASTCombine(N, DAG); + case ISD::BITCAST: return PerformBITCASTCombine(N, DAG, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); @@ -25414,12 +27827,17 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, Subtarget); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); + case ISD::FNEG: return PerformFNEGCombine(N, DAG, Subtarget); + case ISD::TRUNCATE: return PerformTRUNCATECombine(N, DAG, Subtarget); case X86ISD::FXOR: - case X86ISD::FOR: return PerformFORCombine(N, DAG); + case X86ISD::FOR: return PerformFORCombine(N, DAG, Subtarget); case X86ISD::FMIN: case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); - case X86ISD::FAND: return PerformFANDCombine(N, DAG); - case X86ISD::FANDN: return PerformFANDNCombine(N, DAG); + case ISD::FMINNUM: + case ISD::FMAXNUM: return performFMinNumFMaxNumCombine(N, DAG, + Subtarget); + case X86ISD::FAND: return PerformFANDCombine(N, DAG, Subtarget); + case X86ISD::FANDN: return PerformFANDNCombine(N, DAG, Subtarget); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ANY_EXTEND: @@ -25447,14 +27865,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERM2X128: case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget); case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); - case ISD::INTRINSIC_WO_CHAIN: - return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); - case X86ISD::INSERTPS: { - if (getTargetMachine().getOptLevel() > CodeGenOpt::None) - return PerformINSERTPSCombine(N, DAG, Subtarget); - break; - } case X86ISD::BLENDI: return PerformBLENDICombine(N, DAG); + case ISD::MGATHER: + case ISD::MSCATTER: return PerformGatherScatterCombine(N, DAG); } return SDValue(); @@ -26084,6 +28497,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case MVT::f64: case MVT::i64: return std::make_pair(0U, &X86::FR64RegClass); + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. // Vector types. case MVT::v16i8: case MVT::v8i16: @@ -26168,17 +28582,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Class == &X86::GR8RegClass || Class == &X86::GR16RegClass || Class == &X86::GR32RegClass || Class == &X86::GR64RegClass) { unsigned Size = VT.getSizeInBits(); - MVT::SimpleValueType SimpleTy = Size == 1 || Size == 8 ? MVT::i8 - : Size == 16 ? 
MVT::i16 - : Size == 32 ? MVT::i32 - : Size == 64 ? MVT::i64 - : MVT::Other; - unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, SimpleTy); + if (Size == 1) Size = 8; + unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); if (DestReg > 0) { Res.first = DestReg; - Res.second = SimpleTy == MVT::i8 ? &X86::GR8RegClass - : SimpleTy == MVT::i16 ? &X86::GR16RegClass - : SimpleTy == MVT::i32 ? &X86::GR32RegClass + Res.second = Size == 8 ? &X86::GR8RegClass + : Size == 16 ? &X86::GR16RegClass + : Size == 32 ? &X86::GR32RegClass : &X86::GR64RegClass; assert(Res.second->contains(Res.first) && "Register in register class"); } else { @@ -26196,6 +28606,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, // target independent register mapper will just pick the first match it can // find, ignoring the required type. + // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. if (VT == MVT::f32 || VT == MVT::i32) Res.second = &X86::FR32RegClass; else if (VT == MVT::f64 || VT == MVT::i64) @@ -26244,6 +28655,15 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL, return -1; } -bool X86TargetLowering::isTargetFTOL() const { - return Subtarget->isTargetKnownWindowsMSVC() && !Subtarget->is64Bit(); +bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { + // Integer division on x86 is expensive. However, when aggressively optimizing + // for code size, we prefer to use a div instruction, as it is usually smaller + // than the alternative sequence. + // The exception to this is vector division. Since x86 doesn't have vector + // integer division, leaving the division as-is is a loss even in terms of + // size, because it will have to be scalarized, while the alternative code + // sequence can be performed in vector form. + bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex, + Attribute::MinSize); + return OptSize && !VT.isVector(); } diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.h b/contrib/llvm/lib/Target/X86/X86ISelLowering.h index 723d530..a29dc9a 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.h +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.h @@ -126,6 +126,9 @@ namespace llvm { /// 1 is the number of bytes of stack to pop. RET_FLAG, + /// Return from interrupt. Operand 0 is the number of bytes to pop. + IRET, + /// Repeat fill, corresponds to X86::REP_STOSx. REP_STOS, @@ -182,6 +185,8 @@ namespace llvm { /// Compute Sum of Absolute Differences. PSADBW, + /// Compute Double Block Packed Sum-Absolute-Differences + DBPSADBW, /// Bitwise Logical AND NOT of Packed FP values. ANDNP, @@ -211,6 +216,8 @@ namespace llvm { // FP vector get exponent FGETEXP_RND, + // Extract Normalized Mantissas + VGETMANT, // FP Scale SCALEF, // Integer add/sub with unsigned saturation. @@ -236,6 +243,9 @@ namespace llvm { // Integer absolute value ABS, + // Detect Conflicts Within a Vector + CONFLICT, + /// Floating point max and min. FMAX, FMIN, @@ -282,9 +292,8 @@ namespace llvm { // Vector integer truncate. VTRUNC, - - // Vector integer truncate with mask. - VTRUNCM, + // Vector integer truncate with unsigned/signed saturation. + VTRUNCUS, VTRUNCS, // Vector FP extend. VFPEXT, @@ -295,6 +304,9 @@ namespace llvm { // Vector signed/unsigned integer to double. CVTDQ2PD, CVTUDQ2PD, + // Convert a vector to mask, set bits base on MSB. 
+ CVT2MASK, + // 128-bit vector logical left / right shift VSHLDQ, VSRLDQ, @@ -349,6 +361,7 @@ namespace llvm { // OR/AND test for masks KORTEST, + KTEST, // Several flavors of instructions with vector shuffle behaviors. PACKSS, @@ -382,12 +395,24 @@ namespace llvm { VPERMIV3, VPERMI, VPERM2X128, - //Fix Up Special Packed Float32/64 values + // Bitwise ternary logic + VPTERNLOG, + // Fix Up Special Packed Float32/64 values VFIXUPIMM, - //Range Restriction Calculation For Packed Pairs of Float32/64 values + // Range Restriction Calculation For Packed Pairs of Float32/64 values VRANGE, + // Reduce - Perform Reduction Transformation on scalar\packed FP + VREDUCE, + // RndScale - Round FP Values To Include A Given Number Of Fraction Bits + VRNDSCALE, + // VFPCLASS - Tests Types Of a FP Values for packed types. + VFPCLASS, + // VFPCLASSS - Tests Types Of a FP Values for scalar types. + VFPCLASSS, // Broadcast scalar to vector VBROADCAST, + // Broadcast mask to vector + VBROADCASTM, // Broadcast subvector to vector SUBV_BROADCAST, // Insert/Extract vector element @@ -397,13 +422,21 @@ namespace llvm { /// SSE4A Extraction and Insertion. EXTRQI, INSERTQI, + // XOP variable/immediate rotations + VPROT, VPROTI, + // XOP arithmetic/logical shifts + VPSHA, VPSHL, + // XOP signed/unsigned integer comparisons + VPCOM, VPCOMU, + // Vector multiply packed unsigned doubleword integers PMULUDQ, // Vector multiply packed signed doubleword integers PMULDQ, // Vector Multiply Packed UnsignedIntegers with Round and Scale MULHRS, - + // Multiply and Add Packed Integers + VPMADDUBSW, VPMADDWD, // FMA nodes FMADD, FNMADD, @@ -418,7 +451,6 @@ namespace llvm { FNMSUB_RND, FMADDSUB_RND, FMSUBADD_RND, - RNDSCALE, // Compress and expand COMPRESS, @@ -443,9 +475,6 @@ namespace llvm { // falls back to heap allocation if not. SEG_ALLOCA, - // Windows's _ftol2 runtime routine to do fptoui. - WIN_FTOL, - // Memory barrier MEMBARRIER, MFENCE, @@ -580,15 +609,6 @@ namespace llvm { bool isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool TailCallOpt); - /// AVX512 static rounding constants. These need to match the values in - /// avx512fintrin.h. - enum STATIC_ROUNDING { - TO_NEAREST_INT = 0, - TO_NEG_INF = 1, - TO_POS_INF = 2, - TO_ZERO = 3, - CUR_DIRECTION = 4 - }; } //===--------------------------------------------------------------------===// @@ -850,16 +870,7 @@ namespace llvm { /// register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 - (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 - } - - /// Return true if the target uses the MSVC _ftol2 routine for fptoui. - bool isTargetFTOL() const; - - /// Return true if the MSVC _ftol2 routine should be used for fptoui to the - /// given type. - bool isIntegerTypeFTOL(EVT VT) const { - return isTargetFTOL() && VT == MVT::i64; + (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 } /// \brief Returns true if it is beneficial to convert a load of a constant @@ -879,6 +890,16 @@ namespace llvm { unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override; + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. 
+ unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + /// This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, @@ -890,6 +911,11 @@ namespace llvm { bool getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const override; + /// Return true if the target stores SafeStack pointer at a fixed offset in + /// some non-standard address space, and populates the address space and + /// offset as appropriate. + Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; + SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot, SelectionDAG &DAG) const; @@ -899,6 +925,8 @@ namespace llvm { /// \brief Customize the preferred legalization strategy for certain types. LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; + bool isIntDivCheap(EVT VT, AttributeSet Attr) const override; + protected: std::pair<const TargetRegisterClass *, uint8_t> findRepresentativeClass(const TargetRegisterInfo *TRI, @@ -908,7 +936,6 @@ namespace llvm { /// Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; - const DataLayout *TD; /// Select between SSE or x87 floating point ops. /// When SSE is available, use it for f32 operations. @@ -955,7 +982,6 @@ namespace llvm { const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG& DAG) const; - bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const; SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, SDLoc dl) const; @@ -969,7 +995,6 @@ namespace llvm { SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const; @@ -994,9 +1019,9 @@ namespace llvm { SDValue LowerToBT(SDValue And, ISD::CondCode CC, SDLoc dl, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; @@ -1042,27 +1067,16 @@ namespace llvm { const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; - bool shouldExpandAtomicLoadInIR(LoadInst *SI) const override; + TargetLoweringBase::AtomicExpansionKind + shouldExpandAtomicLoadInIR(LoadInst *SI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; - TargetLoweringBase::AtomicRMWExpansionKind + TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; - bool needsCmpXchgNb(const Type *MemType) const; - - /// Utility function to emit atomic-load-arith operations (and, or, xor, - /// nand, max, 
min, umax, umin). It takes the corresponding instruction to - /// expand, the associated machine basic block, and the associated X86 - /// opcodes for reg/reg. - MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI, - MachineBasicBlock *MBB) const; - - /// Utility function to emit atomic-load-arith operations (and, or, xor, - /// nand, add, sub, swap) for 64-bit operands on 32-bit target. - MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI, - MachineBasicBlock *MBB) const; + bool needsCmpXchgNb(Type *MemType) const; // Utility function to emit the low-level va_arg code for X86-64. MachineBasicBlock *EmitVAARG64WithCustomInserter( @@ -1077,18 +1091,24 @@ namespace llvm { MachineBasicBlock *EmitLoweredSelect(MachineInstr *I, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr *I, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredWinAlloca(MachineInstr *MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredCatchRet(MachineInstr *MI, + MachineBasicBlock *BB) const; + + MachineBasicBlock *EmitLoweredCatchPad(MachineInstr *MI, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI, MachineBasicBlock *BB) const; - MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI, - MachineBasicBlock *BB) const; - MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI, MachineBasicBlock *MBB) const; @@ -1121,7 +1141,7 @@ namespace llvm { unsigned &RefinementSteps) const override; /// Reassociate floating point divisions into multiply by reciprocal. - bool combineRepeatedFPDivisors(unsigned NumUsers) const override; + unsigned combineRepeatedFPDivisors() const override; }; namespace X86 { diff --git a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td index faa9150..8bf2925 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/contrib/llvm/lib/Target/X86/X86InstrAVX512.td @@ -79,7 +79,7 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc, !if (!eq (TypeVariantName, "i"), !if (!eq (Size, 128), "v2i64", !if (!eq (Size, 256), "v4i64", - !if (!eq (Size, 512), + !if (!eq (Size, 512), !if (!eq (EltSize, 64), "v8i64", "v16i32"), VTName))), VTName)); @@ -145,6 +145,8 @@ def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; // We map scalar types to the smallest (128-bit) vector type // with the appropriate element type. This allows to use the same masking logic. +def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">; +def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">; def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">; def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">; @@ -274,6 +276,22 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _, OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, (vselect _.KRCWM:$mask, RHS, _.RC:$src1)>; +// Similar to AVX512_maskable_3rc but in this case the input VT for the tied +// operand differs from the output VT. This requires a bitconvert on +// the preserved vector going into the vselect. 
+multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT, + X86VectorVTInfo InVT, + dag Outs, dag NonTiedIns, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS> : + AVX512_maskable_common<O, F, OutVT, Outs, + !con((ins InVT.RC:$src1), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + !con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns), + OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, + (vselect InVT.KRCWM:$mask, RHS, + (bitconvert InVT.RC:$src1))>; + multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag NonTiedIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, @@ -471,84 +489,123 @@ def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>; //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // - -multiclass vinsert_for_size_no_alt<int Opcode, - X86VectorVTInfo From, X86VectorVTInfo To, - PatFrag vinsert_insert, - SDNodeXForm INSERT_get_vinsert_imm> { +multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To, + PatFrag vinsert_insert> { let hasSideEffects = 0, ExeDomain = To.ExeDomain in { - def rr : AVX512AIi8<Opcode, MRMSrcReg, (outs VR512:$dst), - (ins VR512:$src1, From.RC:$src2, u8imm:$src3), - "vinsert" # From.EltTypeName # "x" # From.NumElts # - "\t{$src3, $src2, $src1, $dst|" - "$dst, $src1, $src2, $src3}", - [(set To.RC:$dst, (vinsert_insert:$src3 (To.VT VR512:$src1), - (From.VT From.RC:$src2), - (iPTR imm)))]>, - EVEX_4V, EVEX_V512; + defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V; - let mayLoad = 1 in - def rm : AVX512AIi8<Opcode, MRMSrcMem, (outs VR512:$dst), - (ins VR512:$src1, From.MemOp:$src2, u8imm:$src3), - "vinsert" # From.EltTypeName # "x" # From.NumElts # - "\t{$src3, $src2, $src1, $dst|" - "$dst, $src1, $src2, $src3}", - []>, - EVEX_4V, EVEX_V512, EVEX_CD8<From.EltSize, From.CD8TupleForm>; - } -} - -multiclass vinsert_for_size<int Opcode, - X86VectorVTInfo From, X86VectorVTInfo To, - X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo, - PatFrag vinsert_insert, - SDNodeXForm INSERT_get_vinsert_imm> : - vinsert_for_size_no_alt<Opcode, From, To, - vinsert_insert, INSERT_get_vinsert_imm> { - // Codegen pattern with the alternative types, e.g. v2i64 -> v8i64 for - // vinserti32x4. Only add this if 64x2 and friends are not supported - // natively via AVX512DQ. 
- let Predicates = [NoDQI] in + let mayLoad = 1 in + defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst), + (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3), + "vinsert" # From.EltTypeName # "x" # From.NumElts, + "$src3, $src2, $src1", "$src1, $src2, $src3", + (vinsert_insert:$src3 (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm))>, AVX512AIi8Base, EVEX_4V, + EVEX_CD8<From.EltSize, From.CD8TupleForm>; + } +} + +multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vinsert_insert, + SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> { + let Predicates = p in { def : Pat<(vinsert_insert:$ins - (AltTo.VT VR512:$src1), (AltFrom.VT From.RC:$src2), (iPTR imm)), - (AltTo.VT (!cast<Instruction>(NAME # From.EltSize # "x4rr") - VR512:$src1, From.RC:$src2, - (INSERT_get_vinsert_imm VR512:$ins)))>; + (To.VT To.RC:$src1), (From.VT From.RC:$src2), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + To.RC:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + + def : Pat<(vinsert_insert:$ins + (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rm") + To.RC:$src1, addr:$src2, + (INSERT_get_vinsert_imm To.RC:$ins)))>; + } } multiclass vinsert_for_type<ValueType EltVT32, int Opcode128, ValueType EltVT64, int Opcode256> { - defm NAME # "32x4" : vinsert_for_size<Opcode128, + + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT32, VR128X>, + X86VectorVTInfo< 8, EltVT32, VR256X>, + vinsert128_insert>, EVEX_V256; + + defm NAME # "32x4Z" : vinsert_for_size<Opcode128, X86VectorVTInfo< 4, EltVT32, VR128X>, X86VectorVTInfo<16, EltVT32, VR512>, - X86VectorVTInfo< 2, EltVT64, VR128X>, + vinsert128_insert>, EVEX_V512; + + defm NAME # "64x4Z" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 4, EltVT64, VR256X>, X86VectorVTInfo< 8, EltVT64, VR512>, - vinsert128_insert, - INSERT_get_vinsert128_imm>; - let Predicates = [HasDQI] in - defm NAME # "64x2" : vinsert_for_size_no_alt<Opcode128, + vinsert256_insert>, VEX_W, EVEX_V512; + + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vinsert_for_size<Opcode128, + X86VectorVTInfo< 2, EltVT64, VR128X>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vinsert128_insert>, VEX_W, EVEX_V256; + + let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : vinsert_for_size<Opcode128, X86VectorVTInfo< 2, EltVT64, VR128X>, X86VectorVTInfo< 8, EltVT64, VR512>, - vinsert128_insert, - INSERT_get_vinsert128_imm>, VEX_W; - defm NAME # "64x4" : vinsert_for_size<Opcode256, - X86VectorVTInfo< 4, EltVT64, VR256X>, - X86VectorVTInfo< 8, EltVT64, VR512>, - X86VectorVTInfo< 8, EltVT32, VR256>, - X86VectorVTInfo<16, EltVT32, VR512>, - vinsert256_insert, - INSERT_get_vinsert256_imm>, VEX_W; - let Predicates = [HasDQI] in - defm NAME # "32x8" : vinsert_for_size_no_alt<Opcode256, - X86VectorVTInfo< 8, EltVT32, VR256X>, - X86VectorVTInfo<16, EltVT32, VR512>, - vinsert256_insert, - INSERT_get_vinsert256_imm>; + vinsert128_insert>, VEX_W, EVEX_V512; + + defm NAME # "32x8Z" : vinsert_for_size<Opcode256, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo<16, EltVT32, VR512>, + vinsert256_insert>, EVEX_V512; + } } defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>; defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>; +// Codegen pattern with the alternative types, +// Only add this if 64x2 and its friends are not supported natively 
via AVX512DQ. +defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>; + +defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>; + +defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>; +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>; + +// Codegen pattern with the alternative types insert VEC128 into VEC256 +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; +// Codegen pattern with the alternative types insert VEC128 into VEC512 +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; +// Codegen pattern with the alternative types insert VEC256 into VEC512 +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; + // vinsertps - insert f32 to XMM def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), @@ -566,90 +623,158 @@ def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), // AVX-512 VECTOR EXTRACT //--- +multiclass vextract_for_size_first_position_lowering<X86VectorVTInfo From, + X86VectorVTInfo To> { + // A subvector extract from the first vector position is + // a subregister copy that needs no instruction. + def NAME # To.NumElts: + Pat<(To.VT (extract_subvector (From.VT From.RC:$src),(iPTR 0))), + (To.VT (EXTRACT_SUBREG (From.VT From.RC:$src), To.SubRegIdx))>; +} + multiclass vextract_for_size<int Opcode, - X86VectorVTInfo From, X86VectorVTInfo To, - X86VectorVTInfo AltFrom, X86VectorVTInfo AltTo, - PatFrag vextract_extract, - SDNodeXForm EXTRACT_get_vextract_imm> { + X86VectorVTInfo From, X86VectorVTInfo To, + PatFrag vextract_extract> : + vextract_for_size_first_position_lowering<From, To> { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + // use AVX512_maskable_in_asm (AVX512_maskable can't be used due to + // vextract_extract), we interesting only in patterns without mask, + // intrinsics pattern match generated bellow. 
defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst), - (ins VR512:$src1, u8imm:$idx), - "vextract" # To.EltTypeName # "x4", + (ins From.RC:$src1, i32u8imm:$idx), + "vextract" # To.EltTypeName # "x" # To.NumElts, "$idx, $src1", "$src1, $idx", - [(set To.RC:$dst, (vextract_extract:$idx (From.VT VR512:$src1), + [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)))]>, - AVX512AIi8Base, EVEX, EVEX_V512; - let mayStore = 1 in - def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), - (ins To.MemOp:$dst, VR512:$src1, u8imm:$src2), - "vextract" # To.EltTypeName # "x4\t{$src2, $src1, $dst|" - "$dst, $src1, $src2}", - []>, EVEX, EVEX_V512, EVEX_CD8<To.EltSize, CD8VT4>; - } - - // Codegen pattern with the alternative types, e.g. v8i64 -> v2i64 for - // vextracti32x4 - def : Pat<(vextract_extract:$ext (AltFrom.VT VR512:$src1), (iPTR imm)), - (AltTo.VT (!cast<Instruction>(NAME # To.EltSize # "x4rr") - VR512:$src1, - (EXTRACT_get_vextract_imm To.RC:$ext)))>; - - // A 128/256-bit subvector extract from the first 512-bit vector position is - // a subregister copy that needs no instruction. - def : Pat<(To.VT (extract_subvector (From.VT VR512:$src), (iPTR 0))), - (To.VT - (EXTRACT_SUBREG (From.VT VR512:$src), To.SubRegIdx))>; - - // And for the alternative types. - def : Pat<(AltTo.VT (extract_subvector (AltFrom.VT VR512:$src), (iPTR 0))), - (AltTo.VT - (EXTRACT_SUBREG (AltFrom.VT VR512:$src), AltTo.SubRegIdx))>; + AVX512AIi8Base, EVEX; + let mayStore = 1 in { + def rm : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$src2), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, EVEX; + + def rmk : AVX512AIi8<Opcode, MRMDestMem, (outs), + (ins To.MemOp:$dst, To.KRCWM:$mask, + From.RC:$src1, i32u8imm:$src2), + "vextract" # To.EltTypeName # "x" # To.NumElts # + "\t{$src2, $src1, $dst {${mask}}|" + "$dst {${mask}}, $src1, $src2}", + []>, EVEX_K, EVEX; + }//mayStore = 1 + } // Intrinsic call with masking. def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x4_512") - VR512:$src1, (iPTR imm:$idx), To.RC:$src0, GR8:$mask), - (!cast<Instruction>(NAME # To.EltSize # "x4rrk") To.RC:$src0, - (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), - VR512:$src1, imm:$idx)>; + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.RC:$src0, To.MRC:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rrk") + To.RC:$src0, + (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), + From.RC:$src1, imm:$idx)>; // Intrinsic call with zero-masking. def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x4_512") - VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, GR8:$mask), - (!cast<Instruction>(NAME # To.EltSize # "x4rrkz") - (v4i1 (COPY_TO_REGCLASS GR8:$mask, VK4WM)), - VR512:$src1, imm:$idx)>; + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, To.MRC:$mask), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rrkz") + (COPY_TO_REGCLASS To.MRC:$mask, To.KRCWM), + From.RC:$src1, imm:$idx)>; // Intrinsic call without masking. 
def : Pat<(!cast<Intrinsic>("int_x86_avx512_mask_vextract" # To.EltTypeName # - "x4_512") - VR512:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), - (!cast<Instruction>(NAME # To.EltSize # "x4rr") - VR512:$src1, imm:$idx)>; + "x" # To.NumElts # "_" # From.Size) + From.RC:$src1, (iPTR imm:$idx), To.ImmAllZerosV, (i8 -1)), + (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # + From.ZSuffix # "rr") + From.RC:$src1, imm:$idx)>; +} + +// Codegen pattern for the alternative types +multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, PatFrag vextract_extract, + SDNodeXForm EXTRACT_get_vextract_imm, list<Predicate> p> : + vextract_for_size_first_position_lowering<From, To> { + + let Predicates = p in + def : Pat<(vextract_extract:$ext (From.VT From.RC:$src1), (iPTR imm)), + (To.VT (!cast<Instruction>(InstrStr#"rr") + From.RC:$src1, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; } -multiclass vextract_for_type<ValueType EltVT32, int Opcode32, - ValueType EltVT64, int Opcode64> { - defm NAME # "32x4" : vextract_for_size<Opcode32, +multiclass vextract_for_type<ValueType EltVT32, int Opcode128, + ValueType EltVT64, int Opcode256> { + defm NAME # "32x4Z" : vextract_for_size<Opcode128, X86VectorVTInfo<16, EltVT32, VR512>, X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; + defm NAME # "64x4Z" : vextract_for_size<Opcode256, X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vextract256_extract>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; + let Predicates = [HasVLX] in + defm NAME # "32x4Z256" : vextract_for_size<Opcode128, + X86VectorVTInfo< 8, EltVT32, VR256X>, + X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract>, + EVEX_V256, EVEX_CD8<32, CD8VT4>; + let Predicates = [HasVLX, HasDQI] in + defm NAME # "64x2Z256" : vextract_for_size<Opcode128, + X86VectorVTInfo< 4, EltVT64, VR256X>, X86VectorVTInfo< 2, EltVT64, VR128X>, - vextract128_extract, - EXTRACT_get_vextract128_imm>; - defm NAME # "64x4" : vextract_for_size<Opcode64, + vextract128_extract>, + VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>; + let Predicates = [HasDQI] in { + defm NAME # "64x2Z" : vextract_for_size<Opcode128, X86VectorVTInfo< 8, EltVT64, VR512>, - X86VectorVTInfo< 4, EltVT64, VR256X>, + X86VectorVTInfo< 2, EltVT64, VR128X>, + vextract128_extract>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; + defm NAME # "32x8Z" : vextract_for_size<Opcode256, X86VectorVTInfo<16, EltVT32, VR512>, - X86VectorVTInfo< 8, EltVT32, VR256>, - vextract256_extract, - EXTRACT_get_vextract256_imm>, VEX_W; + X86VectorVTInfo< 8, EltVT32, VR256X>, + vextract256_extract>, + EVEX_V512, EVEX_CD8<32, CD8VT8>; + } } defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>; defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>; +// extract_subvector codegen patterns with the alternative types. +// Only add this if 64x2 and its friends are not supported natively via AVX512DQ. 
+defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; + +defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; + +defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; + +// Codegen pattern with the alternative types extract VEC128 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +// Codegen pattern with the alternative types extract VEC256 from VEC512 +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; + // A 128-bit subvector insert to the first 512-bit vector position // is a subregister copy that needs no instruction. def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)), @@ -677,6 +802,10 @@ def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)), (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)), (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; +def : Pat<(insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0)), + (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; // vextractps - extract 32 bits from XMM def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), @@ -694,50 +823,49 @@ def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs), //===---------------------------------------------------------------------===// // AVX-512 BROADCAST //--- -multiclass avx512_fp_broadcast<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, - ValueType svt, X86VectorVTInfo _> { - defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins SrcRC:$src), "vbroadcast"## !subst("p", "s", _.Suffix), - "$src", "$src", (_.VT (OpNode (svt SrcRC:$src)))>, - T8PD, EVEX; - let mayLoad = 1 in { - defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.ScalarMemOp:$src), - "vbroadcast"##!subst("p", "s", _.Suffix), "$src", "$src", - (_.VT (OpNode (_.ScalarLdFrag addr:$src)))>, - T8PD, EVEX; - } +multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> { + + defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src", + (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>, + T8PD, 
EVEX; + let mayLoad = 1 in + defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (DestInfo.VT (X86VBroadcast + (SrcInfo.ScalarLdFrag addr:$src)))>, + T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>; } -multiclass avx512_fp_broadcast_vl<bits<8> opc, SDNode OpNode, - AVX512VLVectorVTInfo _> { - defm Z : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info512>, +multiclass avx512_fp_broadcast_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, EVEX_V512; let Predicates = [HasVLX] in { - defm Z256 : avx512_fp_broadcast<opc, OpNode, VR128X, _.info128.VT, _.info256>, - EVEX_V256; + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + EVEX_V256; } } let ExeDomain = SSEPackedSingle in { - defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, X86VBroadcast, - avx512vl_f32_info>, EVEX_CD8<32, CD8VT1>; + defm VBROADCASTSS : avx512_fp_broadcast_vl<0x18, "vbroadcastss", + avx512vl_f32_info>; let Predicates = [HasVLX] in { - defm VBROADCASTSSZ128 : avx512_fp_broadcast<0x18, X86VBroadcast, VR128X, - v4f32, v4f32x_info>, EVEX_V128, - EVEX_CD8<32, CD8VT1>; + defm VBROADCASTSSZ128 : avx512_broadcast_rm<0x18, "vbroadcastss", + v4f32x_info, v4f32x_info>, EVEX_V128; } } let ExeDomain = SSEPackedDouble in { - defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, X86VBroadcast, - avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VBROADCASTSD : avx512_fp_broadcast_vl<0x19, "vbroadcastsd", + avx512vl_f64_info>, VEX_W; } // avx512_broadcast_pat introduces patterns for broadcast with a scalar argument. -// Later, we can canonize broadcast instructions before ISel phase and +// Later, we can canonize broadcast instructions before ISel phase and // eliminate additional patterns on ISel. 
// SrcRC_v and SrcRC_s are RegisterClasses for vector and scalar // representations of source @@ -834,70 +962,50 @@ def : Pat<(v8i64 (int_x86_avx512_mask_pbroadcast_q_gpr_512 (i64 GR64:$src), (bc_v8i64 (v16i32 immAllZerosV)), (i8 GR8:$mask))), (VPBROADCASTQrZrkz (COPY_TO_REGCLASS GR8:$mask, VK8WM), GR64:$src)>; -multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr, - X86MemOperand x86memop, PatFrag ld_frag, - RegisterClass DstRC, ValueType OpVT, ValueType SrcVT, - RegisterClass KRC> { - def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX; - def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, - VR128X:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} |${dst} {${mask}}, $src}"), - []>, EVEX, EVEX_K; - def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask, - VR128X:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, - (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX; - def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, - x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}} , $src}"), - []>, EVEX, EVEX_K; - def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask, - x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - [(set DstRC:$dst, (OpVT (vselect KRC:$mask, - (X86VBroadcast (ld_frag addr:$src)), - (OpVT (bitconvert (v16i32 immAllZerosV))))))]>, EVEX, EVEX_KZ; - } -} - -defm VPBROADCASTDZ : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem, - loadi32, VR512, v16i32, v4i32, VK16WM>, - EVEX_V512, EVEX_CD8<32, CD8VT1>; -defm VPBROADCASTQZ : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem, - loadi64, VR512, v8i64, v2i64, VK8WM>, EVEX_V512, VEX_W, - EVEX_CD8<64, CD8VT1>; +// Provide aliases for broadcast from the same register class that +// automatically does the extract. +multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo> { + def : Pat<(DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))), + (!cast<Instruction>(NAME#DestInfo.ZSuffix#"r") + (EXTRACT_SUBREG (SrcInfo.VT SrcInfo.RC:$src), sub_xmm))>; +} + +multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, + avx512_int_broadcast_rm_lowering<_.info512, _.info256>, + EVEX_V512; + // Defined separately to avoid redefinition. 
+ defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>; + } + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + avx512_int_broadcast_rm_lowering<_.info256, _.info256>, + EVEX_V256; + defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>, + EVEX_V128; + } +} + +defm VPBROADCASTB : avx512_int_broadcast_rm_vl<0x78, "vpbroadcastb", + avx512vl_i8_info, HasBWI>; +defm VPBROADCASTW : avx512_int_broadcast_rm_vl<0x79, "vpbroadcastw", + avx512vl_i16_info, HasBWI>; +defm VPBROADCASTD : avx512_int_broadcast_rm_vl<0x58, "vpbroadcastd", + avx512vl_i32_info, HasAVX512>; +defm VPBROADCASTQ : avx512_int_broadcast_rm_vl<0x59, "vpbroadcastq", + avx512vl_i64_info, HasAVX512>, VEX_W; multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { - let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Src.MemOp:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set _Dst.RC:$dst, - (_Dst.VT (X86SubVBroadcast - (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))))]>, EVEX; - def rmk : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Dst.KRCWM:$mask, - _Src.MemOp:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), - []>, EVEX, EVEX_K; - def rmkz : AVX5128I<opc, MRMSrcMem, (outs _Dst.RC:$dst), (ins _Dst.KRCWM:$mask, - _Src.MemOp:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - } + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (X86SubVBroadcast + (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, + AVX5128IBase, EVEX; } defm VBROADCASTI32X4 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4", @@ -944,10 +1052,45 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8", EVEX_V512, EVEX_CD8<32, CD8VT8>; } -def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))), - (VPBROADCASTDZrr VR128X:$src)>; -def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))), - (VPBROADCASTQZrr VR128X:$src)>; +multiclass avx512_broadcast_32x2<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _Dst, X86VectorVTInfo _Src, + SDNode OpNode = X86SubVBroadcast> { + + defm r : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + (ins _Src.RC:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src)))>, + T8PD, EVEX; + let mayLoad = 1 in + defm m : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (_Dst.VT (OpNode + (_Src.VT (scalar_to_vector(loadi64 addr:$src)))))>, + T8PD, EVEX, EVEX_CD8<_Src.EltSize, CD8VT2>; +} + +multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> { + let Predicates = [HasDQI] in + defm Z : avx512_broadcast_32x2<opc, OpcodeStr, _.info512, _.info128>, + EVEX_V512; + let Predicates = [HasDQI, HasVLX] in + defm Z256 : avx512_broadcast_32x2<opc, OpcodeStr, _.info256, _.info128>, + EVEX_V256; +} + +multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo _> : + avx512_common_broadcast_32x2<opc, OpcodeStr, _> { + + let Predicates = [HasDQI, HasVLX] in + defm Z128 : avx512_broadcast_32x2<opc, OpcodeStr, _.info128, _.info128, + X86SubV32x2Broadcast>, EVEX_V128; +} + +defm VPBROADCASTI32X2 : 
avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", + avx512vl_i32_info>; +defm VPBROADCASTF32X2 : avx512_common_broadcast_32x2<0x19, "vbroadcastf32x2", + avx512vl_f32_info>; def : Pat<(v16f32 (X86VBroadcast (v16f32 VR512:$src))), (VBROADCASTSSZr (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; @@ -959,21 +1102,6 @@ def : Pat<(v8f64 (X86VBroadcast (v8f64 VR512:$src))), def : Pat<(v8f64 (X86VBroadcast (v4f64 VR256X:$src))), (VBROADCASTSDZr (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm))>; -def : Pat<(v16i32 (X86VBroadcast (v16i32 VR512:$src))), - (VPBROADCASTDZrr (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; -def : Pat<(v16i32 (X86VBroadcast (v8i32 VR256X:$src))), - (VPBROADCASTDZrr (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm))>; - -def : Pat<(v8i64 (X86VBroadcast (v8i64 VR512:$src))), - (VPBROADCASTQZrr (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; -def : Pat<(v8i64 (X86VBroadcast (v4i64 VR256X:$src))), - (VPBROADCASTQZrr (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm))>; - -def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))), - (VBROADCASTSSZr VR128X:$src)>; -def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))), - (VBROADCASTSDZr VR128X:$src)>; - // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. def : Pat<(v16f32 (X86VBroadcast FR32X:$src)), @@ -985,170 +1113,178 @@ def : Pat<(v8f64 (X86VBroadcast FR64X:$src)), //===----------------------------------------------------------------------===// // AVX-512 BROADCAST MASK TO VECTOR REGISTER //--- - -multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, - RegisterClass KRC> { -let Predicates = [HasCDI] in -def Zrr : AVX512XS8I<opc, MRMSrcReg, (outs VR512:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX, EVEX_V512; - -let Predicates = [HasCDI, HasVLX] in { -def Z128rr : AVX512XS8I<opc, MRMSrcReg, (outs VR128:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX, EVEX_V128; -def Z256rr : AVX512XS8I<opc, MRMSrcReg, (outs VR256:$dst), (ins KRC:$src), +multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass KRC> { + def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX, EVEX_V256; + [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX; } + +multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, RegisterClass KRC> { + let Predicates = [HasCDI] in + defm Z : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info512, KRC>, EVEX_V512; + let Predicates = [HasCDI, HasVLX] in { + defm Z256 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info256, KRC>, EVEX_V256; + defm Z128 : avx512_mask_broadcastm<opc, OpcodeStr, VTInfo.info128, KRC>, EVEX_V128; + } } -let Predicates = [HasCDI] in { defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", - VK16>; + avx512vl_i32_info, VK16>; defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", - VK8>, VEX_W; -} + avx512vl_i64_info, VK8>, VEX_W; //===----------------------------------------------------------------------===// -// AVX-512 - VPERM -// -// -- immediate form -- -multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { - def ri : AVX512AIi8<opc, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src1, u8imm:$src2), - 
!strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>, - EVEX; - def mi : AVX512AIi8<opc, MRMSrcMem, (outs _.RC:$dst), - (ins _.MemOp:$src1, u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (OpNode (_.LdFrag addr:$src1), - (i8 imm:$src2))))]>, - EVEX, EVEX_CD8<_.EltSize, CD8VF>; +// -- VPERMI2 - 3 source operands form -- +multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { +let Constraints = "$src1 = $dst" in { + defm rr: AVX512_maskable_3src_cast<opc, MRMSrcReg, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V, + AVX5128IBase; + + let mayLoad = 1 in + defm rm: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3), + OpcodeStr, "$src3, $src2", "$src2, $src3", + (_.VT (X86VPermi2X IdxVT.RC:$src1, _.RC:$src2, + (_.VT (bitconvert (_.LdFrag addr:$src3)))))>, + EVEX_4V, AVX5128IBase; + } } +multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { + let mayLoad = 1, Constraints = "$src1 = $dst" in + defm rmb: AVX512_maskable_3src_cast<opc, MRMSrcMem, _, IdxVT, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3), + OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), + !strconcat("$src2, ${src3}", _.BroadcastStr ), + (_.VT (X86VPermi2X IdxVT.RC:$src1, + _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, + AVX5128IBase, EVEX_4V, EVEX_B; } -multiclass avx512_permil<bits<8> OpcImm, bits<8> OpcVar, X86VectorVTInfo _, - X86VectorVTInfo Ctrl> : - avx512_perm_imm<OpcImm, "vpermil" # _.Suffix, X86VPermilpi, _> { - let ExeDomain = _.ExeDomain in { - def rr : AVX5128I<OpcVar, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2), - !strconcat("vpermil" # _.Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (X86VPermilpv _.RC:$src1, - (Ctrl.VT Ctrl.RC:$src2))))]>, - EVEX_4V; - def rm : AVX5128I<OpcVar, MRMSrcMem, (outs _.RC:$dst), - (ins _.RC:$src1, Ctrl.MemOp:$src2), - !strconcat("vpermil" # _.Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, - (_.VT (X86VPermilpv _.RC:$src1, - (Ctrl.VT (Ctrl.LdFrag addr:$src2)))))]>, - EVEX_4V; - } -} -defm VPERMILPSZ : avx512_permil<0x04, 0x0C, v16f32_info, v16i32_info>, - EVEX_V512; -defm VPERMILPDZ : avx512_permil<0x05, 0x0D, v8f64_info, v8i64_info>, - EVEX_V512, VEX_W; - -def : Pat<(v16i32 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), - (VPERMILPSZri VR512:$src1, imm:$imm)>; -def : Pat<(v8i64 (X86VPermilpi VR512:$src1, (i8 imm:$imm))), - (VPERMILPDZri VR512:$src1, imm:$imm)>; - -// -- VPERM2I - 3 source operands form -- -multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _> { +multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, 
VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; + } +} + +multiclass avx512_perm_i_sizes_w<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx> { + let Predicates = [HasBWI] in + defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512, + Idx.info512>, EVEX_V512; + let Predicates = [HasBWI, HasVLX] in { + defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMI2W : avx512_perm_i_sizes_w<0x75, "vpermi2w", + avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; + +// VPERMT2 +multiclass avx512_perm_t<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { let Constraints = "$src1 = $dst" in { defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.RC:$src3), + (ins IdxVT.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, EVEX_4V, + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3))>, EVEX_4V, AVX5128IBase; let mayLoad = 1 in defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.MemOp:$src3), + (ins IdxVT.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, - (_.VT (bitconvert (_.LdFrag addr:$src3)))))>, + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, + (bitconvert (_.LdFrag addr:$src3))))>, EVEX_4V, AVX5128IBase; } } -multiclass avx512_perm_3src_mb<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _> { +multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, X86VectorVTInfo IdxVT> { let mayLoad = 1, Constraints = "$src1 = $dst" in defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src2, _.ScalarMemOp:$src3), + (ins IdxVT.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), - (_.VT (OpNode _.RC:$src1, - _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, + (_.VT (X86VPermt2 _.RC:$src1, + IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))))>, AVX5128IBase, EVEX_4V, EVEX_B; } -multiclass avx512_perm_3src_sizes<bits<8> opc, string OpcodeStr, - SDNode OpNode, AVX512VLVectorVTInfo VTInfo> { - let Predicates = [HasAVX512] in - defm NAME: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info512>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; +multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo ShuffleMask> { + defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info512, + ShuffleMask.info512>, EVEX_V512; let Predicates = [HasVLX] in { - defm NAME#128: avx512_perm_3src<opc, OpcodeStr, OpNode, 
VTInfo.info128>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info128>, - EVEX_V128; - defm NAME#256: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info256>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info256>, - EVEX_V256; + defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info128, + ShuffleMask.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, + avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info256, + ShuffleMask.info256>, EVEX_V256; } } -multiclass avx512_perm_3src_sizes_w<bits<8> opc, string OpcodeStr, - SDNode OpNode, AVX512VLVectorVTInfo VTInfo> { + +multiclass avx512_perm_t_sizes_w<bits<8> opc, string OpcodeStr, + AVX512VLVectorVTInfo VTInfo, + AVX512VLVectorVTInfo Idx> { let Predicates = [HasBWI] in - defm NAME: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info512>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info512>, - EVEX_V512; + defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + Idx.info512>, EVEX_V512; let Predicates = [HasBWI, HasVLX] in { - defm NAME#128: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info128>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info128>, - EVEX_V128; - defm NAME#256: avx512_perm_3src<opc, OpcodeStr, OpNode, VTInfo.info256>, - avx512_perm_3src_mb<opc, OpcodeStr, OpNode, VTInfo.info256>, - EVEX_V256; - } -} -defm VPERMI2D : avx512_perm_3src_sizes<0x76, "vpermi2d", X86VPermiv3, - avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMI2Q : avx512_perm_3src_sizes<0x76, "vpermi2q", X86VPermiv3, - avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMI2PS : avx512_perm_3src_sizes<0x77, "vpermi2ps", X86VPermiv3, - avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMI2PD : avx512_perm_3src_sizes<0x77, "vpermi2pd", X86VPermiv3, - avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPERMT2D : avx512_perm_3src_sizes<0x7E, "vpermt2d", X86VPermv3, - avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMT2Q : avx512_perm_3src_sizes<0x7E, "vpermt2q", X86VPermv3, - avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMT2PS : avx512_perm_3src_sizes<0x7F, "vpermt2ps", X86VPermv3, - avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMT2PD : avx512_perm_3src_sizes<0x7F, "vpermt2pd", X86VPermv3, - avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>; - -defm VPERMT2W : avx512_perm_3src_sizes_w<0x7D, "vpermt2w", X86VPermv3, - avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; -defm VPERMI2W : avx512_perm_3src_sizes_w<0x75, "vpermi2w", X86VPermiv3, - avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; + defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + Idx.info128>, EVEX_V128; + defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + Idx.info256>, EVEX_V256; + } +} + +defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", + avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", + avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +defm VPERMT2W : avx512_perm_t_sizes_w<0x7D, "vpermt2w", + avx512vl_i16_info, avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; +defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", + avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; +defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", + avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask 
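
(Illustrative note, not part of the patch.) The permute hunk that ends here replaces the single avx512_perm_3src multiclass, which was parameterized on the X86VPermiv3/X86VPermv3 nodes, with separate avx512_perm_i and avx512_perm_t multiclasses that each take an explicit index-vector VTInfo; that is what lets VPERMI2PS/VPERMI2PD pair float data with i32/i64 indices, and it makes the tied operand explicit: VPERMI2* ties the index operand ($src1) to $dst, VPERMT2* ties the first table operand. As a rough sketch of what these two-source permutes compute, the following C++ uses the Intel intrinsic that I believe lowers to VPERMI2D/VPERMT2D (_mm512_permutex2var_epi32); the file name and build line are assumptions, and it needs an AVX-512F machine to run.

// permute2var_demo.cpp (hypothetical name) -- e.g. clang++ -O2 -mavx512f
#include <immintrin.h>
#include <cstdio>

int main() {
  alignas(64) int a[16], b[16], idx[16], out[16];
  for (int i = 0; i < 16; ++i) { a[i] = i; b[i] = 100 + i; }
  // For 16 x i32 lanes, index bit 4 selects the source (0 = a, 1 = b) and
  // bits 3:0 select the lane, so this index vector interleaves
  // a[0], b[0], a[1], b[1], ...
  for (int i = 0; i < 16; ++i) idx[i] = (i & 1) ? 16 + i / 2 : i / 2;

  __m512i va   = _mm512_loadu_si512((const void *)a);
  __m512i vb   = _mm512_loadu_si512((const void *)b);
  __m512i vidx = _mm512_loadu_si512((const void *)idx);
  // Compiles to vpermi2d or vpermt2d depending on which source register the
  // compiler decides to clobber; both encodings compute the same permute.
  __m512i r = _mm512_permutex2var_epi32(va, vidx, vb);
  _mm512_storeu_si512((void *)out, r);

  for (int i = 0; i < 16; ++i) printf("%d ", out[i]);  // 0 100 1 101 ...
  printf("\n");
  return 0;
}
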
@@ -1265,41 +1401,85 @@ def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), //===----------------------------------------------------------------------===// // avx512_cmp_scalar - AVX512 CMPSS and CMPSD -multiclass avx512_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, - SDNode OpNode, ValueType VT, - PatFrag ld_frag, string Suffix> { - def rr : AVX512Ii8<0xC2, MRMSrcReg, - (outs VK1:$dst), (ins RC:$src1, RC:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", Suffix, + +multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>{ + + defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc)>, EVEX_4V; + let mayLoad = 1 in + defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), + imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + + defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), + "vcmp${cc}"#_.Suffix, + "{sae}, $src2, $src1", "$src1, $src2,{sae}", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + imm:$cc, + (i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B; + // Accept explicit immediate argument form instead of comparison code. + let isAsmParserOnly = 1, hasSideEffects = 0 in { + defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs VK1:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V; + defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc, $src2, $src1", "$src1, $src2, $cc">, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + + defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, + (outs _.KRC:$dst), + (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), + "vcmp"#_.Suffix, + "$cc,{sae}, $src2, $src1","$src1, $src2,{sae}, $cc">, + EVEX_4V, EVEX_B; + }// let isAsmParserOnly = 1, hasSideEffects = 0 + + let isCodeGenOnly = 1 in { + def rr : AVX512Ii8<0xC2, MRMSrcReg, + (outs _.KRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VK1:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))], + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + _.FRC:$src2, + imm:$cc))], IIC_SSE_ALU_F32S_RR>, EVEX_4V; - def rm : AVX512Ii8<0xC2, MRMSrcMem, - (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, AVXCC:$cc), - !strconcat("vcmp${cc}", Suffix, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VK1:$dst, (OpNode (VT RC:$src1), - (ld_frag addr:$src2), imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V; - let isAsmParserOnly = 1, hasSideEffects = 0 in { - def rri_alt : AVX512Ii8<0xC2, MRMSrcReg, - (outs VK1:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), - !strconcat("vcmp", Suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32S_RR>, EVEX_4V; let mayLoad = 1 in - def rmi_alt : AVX512Ii8<0xC2, MRMSrcMem, - (outs VK1:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), - !strconcat("vcmp", Suffix, - "\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + def rm : AVX512Ii8<0xC2, MRMSrcMem, + (outs _.KRC:$dst), + (ins _.FRC:$src1, 
_.ScalarMemOp:$src2, AVXCC:$cc), + !strconcat("vcmp${cc}", _.Suffix, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.KRC:$dst, (OpNode _.FRC:$src1, + (_.ScalarLdFrag addr:$src2), + imm:$cc))], + IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; } } let Predicates = [HasAVX512] in { -defm VCMPSSZ : avx512_cmp_scalar<FR32X, f32mem, X86cmpms, f32, loadf32, "ss">, - XS; -defm VCMPSDZ : avx512_cmp_scalar<FR64X, f64mem, X86cmpms, f64, loadf64, "sd">, - XD, VEX_W; + defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>, + AVX512XSIi8Base; + defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>, + AVX512XDIi8Base, VEX_W; } multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -1700,6 +1880,128 @@ def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)), (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; +// ---------------------------------------------------------------- +// FPClass +//handle fpclass instruction mask = op(reg_scalar,imm) +// op(mem_scalar,imm) +multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, Predicate prd> { + let Predicates = [prd] in { + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),//_.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))], NoItinerary>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + let mayLoad = 1, AddedComplexity = 20 in { + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst, + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix## + "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + } + } +} + +//handle fpclass instruction mask = fpclass(reg_vec, reg_vec, imm) +// fpclass(reg_vec, mem_vec, imm) +// fpclass(reg_vec, broadcast(eltVt), imm) +multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, string mem, string broadcast>{ + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), + (i32 imm:$src2)))], NoItinerary>; + def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix# + "\t{$src2, $src1, $dst {${mask}}| $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + let mayLoad = 1 in { + def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst | $dst, $src1, $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), 
+ (i32 imm:$src2)))], NoItinerary>; + def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##mem# + "\t{$src2, $src1, $dst {${mask}} | $dst {${mask}}, $src1, $src2}", + [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode + (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, EVEX_K; + def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + _.BroadcastStr##", $dst | $dst, ${src1}" + ##_.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2)))], NoItinerary>,EVEX_B; + def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), + (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## + _.BroadcastStr##", $dst {${mask}} | $dst {${mask}}, ${src1}"## + _.BroadcastStr##", $src2}", + [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode + (_.VT (X86VBroadcast + (_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2))))], NoItinerary>, + EVEX_B, EVEX_K; + } +} + +multiclass avx512_vector_fpclass_all<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd, + string broadcast>{ + let Predicates = [prd] in { + defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}", + broadcast>, EVEX_V512; + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info128, "{x}", + broadcast>, EVEX_V128; + defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info256, "{y}", + broadcast>, EVEX_V256; + } +} + +multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec, + bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{ + defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec, + VecOpNode, prd, "{l}">, EVEX_CD8<32, CD8VF>; + defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec, + VecOpNode, prd, "{q}">,EVEX_CD8<64, CD8VF> , VEX_W; + defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + f32x_info, prd>, EVEX_CD8<32, CD8VT1>; + defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, + f64x_info, prd>, EVEX_CD8<64, CD8VT1>, VEX_W; +} + +defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass, + X86Vfpclasss, HasDQI>, AVX512AIi8Base,EVEX; + //----------------------------------------------------------------- // Mask register copy, including // - copy between mask registers @@ -1786,6 +2088,11 @@ let Predicates = [HasDQI] in { (KMOVBmk addr:$dst, VK8:$src)>; def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))), (KMOVBkm addr:$src)>; + + def : Pat<(store VK4:$src, addr:$dst), + (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK4:$src, VK8))>; + def : Pat<(store VK2:$src, addr:$dst), + (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK2:$src, VK8))>; } let Predicates = [HasAVX512, NoDQI] in { def : Pat<(store (i8 (bitconvert (v8i1 VK8:$src))), addr:$dst), @@ -1837,10 +2144,15 @@ let Predicates = [HasAVX512] in { (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>; def : Pat<(i32 (anyext VK1:$src)), (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16))>; + def : Pat<(i8 (zext VK1:$src)), (EXTRACT_SUBREG (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>; + def : Pat<(i8 (anyext VK1:$src)), + (EXTRACT_SUBREG + (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_8bit)>; + def : Pat<(i64 (zext VK1:$src)), (AND64ri8 
(SUBREG_TO_REG (i64 0), (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>; @@ -1848,17 +2160,19 @@ let Predicates = [HasAVX512] in { (EXTRACT_SUBREG (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_16bit)>; - def : Pat<(v16i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK16)>; - def : Pat<(v8i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK8)>; -} -let Predicates = [HasBWI] in { - def : Pat<(v32i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK32)>; - def : Pat<(v64i1 (scalar_to_vector VK1:$src)), - (COPY_TO_REGCLASS VK1:$src, VK64)>; } +def : Pat<(v16i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK16)>; +def : Pat<(v8i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK8)>; +def : Pat<(v4i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK4)>; +def : Pat<(v2i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK2)>; +def : Pat<(v32i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK32)>; +def : Pat<(v64i1 (scalar_to_vector VK1:$src)), + (COPY_TO_REGCLASS VK1:$src, VK64)>; // With AVX-512 only, 8-bit mask is promoted to 16-bit mask. @@ -1955,11 +2269,12 @@ multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, } multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, bit IsCommutable> { + SDPatternOperator OpNode, bit IsCommutable, + Predicate prdW = HasAVX512> { defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD; defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, - HasAVX512, IsCommutable>, VEX_4V, VEX_L, PS; + prdW, IsCommutable>, VEX_4V, VEX_L, PS; defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD; defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, @@ -1974,6 +2289,7 @@ defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>; defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", xnor, 1>; defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>; defm KANDN : avx512_mask_binop_all<0x42, "kandn", andn, 0>; +defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>; multiclass avx512_mask_binop_int<string IntName, string InstName> { let Predicates = [HasAVX512] in @@ -2047,59 +2363,48 @@ def : Pat<(xor (xor VK1:$src1, VK1:$src2), (i1 1)), (COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>; // Mask unpacking -multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr, - RegisterClass KRC> { - let Predicates = [HasAVX512] in - def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>; -} +multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT, + RegisterClass KRCSrc, Predicate prd> { + let Predicates = [prd] in { + def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst), + (ins KRC:$src1, KRC:$src2), + "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + VEX_4V, VEX_L; -multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> { - defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16>, - VEX_4V, VEX_L, PD; + def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)), + (!cast<Instruction>(NAME##rr) + (COPY_TO_REGCLASS KRCSrc:$src2, KRC), + (COPY_TO_REGCLASS KRCSrc:$src1, KRC))>; + } } -defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">; -def : Pat<(v16i1 (concat_vectors (v8i1 VK8:$src1), (v8i1 VK8:$src2))), - 
(KUNPCKBWrr (COPY_TO_REGCLASS VK8:$src2, VK16), - (COPY_TO_REGCLASS VK8:$src1, VK16))>; - - -multiclass avx512_mask_unpck_int<string IntName, string InstName> { - let Predicates = [HasAVX512] in - def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_bw") - (i16 GR16:$src1), (i16 GR16:$src2)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"BWrr") - (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)), - (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>; -} -defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">; +defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD; +defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS; +defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W; // Mask bit testing multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, - SDNode OpNode> { - let Predicates = [HasAVX512], Defs = [EFLAGS] in + SDNode OpNode, Predicate prd> { + let Predicates = [prd], Defs = [EFLAGS] in def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>; } -multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> { - defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>, - VEX, PS; - let Predicates = [HasDQI] in - defm B : avx512_mask_testop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode>, - VEX, PD; - let Predicates = [HasBWI] in { - defm Q : avx512_mask_testop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode>, - VEX, PS, VEX_W; - defm D : avx512_mask_testop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode>, - VEX, PD, VEX_W; - } +multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode, + Predicate prdW = HasAVX512> { + defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, HasDQI>, + VEX, PD; + defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, prdW>, + VEX, PS; + defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, HasBWI>, + VEX, PS, VEX_W; + defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, HasBWI>, + VEX, PD, VEX_W; } defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; +defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>; // Mask shift multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, @@ -2124,7 +2429,7 @@ multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, let Predicates = [HasDQI] in defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>, VEX, TAPD; - } + } } defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>; @@ -2167,24 +2472,52 @@ def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))), def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 0))), + (v16i1 (COPY_TO_REGCLASS VK32:$src, VK16))>; + +def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))), + (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>; + def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 0))), (v32i1 (COPY_TO_REGCLASS VK64:$src, VK32))>; def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))), (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>; -let Predicates = [HasVLX] in { - def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), - (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; - def : 
Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), - (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; - def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), - (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>; - def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), - (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; - def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), - (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; -} +def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK8:$src, VK4))>; + +def : Pat<(v2i1 (extract_subvector (v8i1 VK8:$src), (iPTR 0))), + (v2i1 (COPY_TO_REGCLASS VK8:$src, VK2))>; + +def : Pat<(v4i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v4i1 (COPY_TO_REGCLASS VK2:$src, VK4))>; + +def : Pat<(v8i1 (insert_subvector undef, (v4i1 VK4:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK4:$src, VK8))>; +def : Pat<(v8i1 (insert_subvector undef, (v2i1 VK2:$src), (iPTR 0))), + (v8i1 (COPY_TO_REGCLASS VK2:$src, VK8))>; + +def : Pat<(v32i1 (insert_subvector undef, VK2:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK2:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK4:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK4:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK8:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK8:$src, VK32))>; +def : Pat<(v32i1 (insert_subvector undef, VK16:$src, (iPTR 0))), + (v32i1 (COPY_TO_REGCLASS VK16:$src, VK32))>; + +def : Pat<(v64i1 (insert_subvector undef, VK2:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK2:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK4:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK4:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK8:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK8:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK16:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK16:$src, VK64))>; +def : Pat<(v64i1 (insert_subvector undef, VK32:$src, (iPTR 0))), + (v64i1 (COPY_TO_REGCLASS VK32:$src, VK64))>; + def : Pat<(v8i1 (X86vshli VK8:$src, (i8 imm:$imm))), (v8i1 (COPY_TO_REGCLASS @@ -2304,23 +2637,21 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore> { - let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { - def rr_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), - OpcodeStr # "\t{$src, $dst|$dst, $src}", [], - _.ExeDomain>, EVEX; - let Constraints = "$src1 = $dst" in - def rrk_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), - (ins _.RC:$src1, _.KRCWM:$mask, _.RC:$src2), - OpcodeStr # - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}", + + def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), + OpcodeStr # ".s\t{$src, $dst|$dst, $src}", + [], _.ExeDomain>, EVEX; + def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + (ins _.KRCWM:$mask, _.RC:$src), + OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"# + "${dst} {${mask}}, $src}", [], _.ExeDomain>, EVEX, EVEX_K; - def rrkz_alt : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), + def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src), - OpcodeStr # - "\t{$src, ${dst} {${mask}} {z}|" # + OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" # "${dst} {${mask}} {z}, $src}", [], _.ExeDomain>, EVEX, EVEX_KZ; - } + let mayStore = 1 in { def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ 
-2425,22 +2756,6 @@ def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src), (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src)>; -let Predicates = [HasAVX512, NoVLX] in { -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src)), - (VMOVUPSZmrk addr:$ptr, - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; - -def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmkz - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; - -def: Pat<(v8f32 (masked_load addr:$ptr, VK8WM:$mask, (v8f32 VR256:$src0))), - (v8f32 (EXTRACT_SUBREG (v16f32 (VMOVUPSZrmk - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256:$src0, sub_ymm), - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -} - defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, @@ -2502,17 +2817,6 @@ def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 immAllZerosV), (v16i32 VR512:$src))), (VMOVDQU32Zrrkz (KNOTWrr VK16WM:$mask), VR512:$src)>; } -// NoVLX patterns -let Predicates = [HasAVX512, NoVLX] in { -def: Pat<(masked_store addr:$ptr, VK8WM:$mask, (v8i32 VR256:$src)), - (VMOVDQU32Zmrk addr:$ptr, - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), - (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256:$src, sub_ymm))>; - -def: Pat<(v8i32 (masked_load addr:$ptr, VK8WM:$mask, undef)), - (v8i32 (EXTRACT_SUBREG (v16i32 (VMOVDQU32Zrmkz - (v16i1 (COPY_TO_REGCLASS VK8WM:$mask, VK16WM)), addr:$ptr)), sub_ymm))>; -} // Move Int Doubleword to Packed Double Int // @@ -2520,32 +2824,37 @@ def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src "vmovd\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, - EVEX, VEX_LIG; + EVEX; def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v2i64 (scalar_to_vector GR64:$src)))], - IIC_SSE_MOVDQ>, EVEX, VEX_W, VEX_LIG; + IIC_SSE_MOVDQ>, EVEX, VEX_W; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in +def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), + (ins i64mem:$src), + "vmovq\t{$src, $dst|$dst, $src}", []>, + EVEX, VEX_W, EVEX_CD8<64, CD8VT1>; let isCodeGenOnly = 1 in { -def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src), +def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(set FR64:$dst, (bitconvert GR64:$src))], + [(set FR64X:$dst, (bitconvert GR64:$src))], IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; -def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src), +def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (bitconvert FR64:$src))], + [(set GR64:$dst, (bitconvert FR64X:$src))], IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; -} -def VMOVSDto64Zmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src), +def VMOVSDto64Zmr 
: AVX512BI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64X:$src), "vmovq\t{$src, $dst|$dst, $src}", - [(store (i64 (bitconvert FR64:$src)), addr:$dst)], + [(store (i64 (bitconvert FR64X:$src)), addr:$dst)], IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>, EVEX_CD8<64, CD8VT1>; +} // Move Int Doubleword to Single Scalar // @@ -2553,27 +2862,27 @@ let isCodeGenOnly = 1 in { def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert GR32:$src))], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG; + IIC_SSE_MOVDQ>, EVEX; def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; } // Move doubleword from xmm register to r/m32 // def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), "vmovd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (vector_extract (v4i32 VR128X:$src), + [(set GR32:$dst, (extractelt (v4i32 VR128X:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, - EVEX, VEX_LIG; + EVEX; def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128X:$src), "vmovd\t{$src, $dst|$dst, $src}", - [(store (i32 (vector_extract (v4i32 VR128X:$src), + [(store (i32 (extractelt (v4i32 VR128X:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, - EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + EVEX, EVEX_CD8<32, CD8VT1>; // Move quadword from xmm1 register to r/m64 // @@ -2581,16 +2890,28 @@ def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (extractelt (v2i64 VR128X:$src), (iPTR 0)))], - IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_LIG, VEX_W, + IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, Requires<[HasAVX512, In64BitMode]>; -def VMOVPQIto64Zmr : I<0xD6, MRMDestMem, (outs), - (ins i64mem:$dst, VR128X:$src), - "vmovq\t{$src, $dst|$dst, $src}", - [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), - addr:$dst)], IIC_SSE_MOVDQ>, - EVEX, PD, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>, - Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>; +let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in +def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, + Requires<[HasAVX512, In64BitMode]>; + +def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs), + (ins i64mem:$dst, VR128X:$src), + "vmovq\t{$src, $dst|$dst, $src}", + [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), + addr:$dst)], IIC_SSE_MOVDQ>, + EVEX, PD, VEX_W, EVEX_CD8<64, CD8VT1>, + Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>; + +let hasSideEffects = 0 in +def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src), + "vmovq.s\t{$src, $dst|$dst, $src}",[]>, + EVEX, VEX_W; // Move Scalar Single to Double Int // @@ -2599,92 +2920,95 @@ def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32X:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32X:$src))], - IIC_SSE_MOVD_ToGP>, EVEX, VEX_LIG; + IIC_SSE_MOVD_ToGP>, EVEX; def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32X:$src), "vmovd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32X:$src)), addr:$dst)], - IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; } // Move 
Quadword Int to Packed Quadword Int // -def VMOVQI2PQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), +def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, - EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; + EVEX, VEX_W, EVEX_CD8<8, CD8VT8>; //===----------------------------------------------------------------------===// // AVX-512 MOVSS, MOVSD //===----------------------------------------------------------------------===// -multiclass avx512_move_scalar <string asm, RegisterClass RC, - SDNode OpNode, ValueType vt, - X86MemOperand x86memop, PatFrag mem_pat> { - let hasSideEffects = 0 in { - def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), - !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, (vt (OpNode VR128X:$src1, - (scalar_to_vector RC:$src2))))], - IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG; - let Constraints = "$src1 = $dst" in - def rrk : SI<0x10, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VK1WM:$mask, RC:$src2, RC:$src3), - !strconcat(asm, - "\t{$src3, $src2, $dst {${mask}}|$dst {${mask}}, $src2, $src3}"), - [], IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG, EVEX_K; - def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>, - EVEX, VEX_LIG; +multiclass avx512_move_scalar <string asm, SDNode OpNode, + X86VectorVTInfo _> { + defm rr_Int : AVX512_maskable_scalar<0x10, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), + asm, "$src2, $src1","$src1, $src2", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2))), + IIC_SSE_MOV_S_RR>, EVEX_4V; + let Constraints = "$src1 = $dst" , mayLoad = 1 in + defm rm_Int : AVX512_maskable_3src_scalar<0x10, MRMSrcMem, _, + (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), + asm,"$src","$src", + (_.VT (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, EVEX; + let isCodeGenOnly = 1 in { + def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, _.FRC:$src2), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, + (scalar_to_vector _.FRC:$src2))))], + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V; + let mayLoad = 1 in + def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], + _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX; + } let mayStore = 1 in { - def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>, - EVEX, VEX_LIG; - def mrk: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, VK1WM:$mask, RC:$src), - !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), - [], IIC_SSE_MOV_S_MR>, - EVEX, VEX_LIG, EVEX_K; + def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>, + EVEX; + def mrk: AVX512PI<0x11, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src), + !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), + [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K; } // mayStore - } //hasSideEffects = 0 } -let ExeDomain = SSEPackedSingle in -defm VMOVSSZ : 
avx512_move_scalar<"movss", FR32X, X86Movss, v4f32, f32mem, - loadf32>, XS, EVEX_CD8<32, CD8VT1>; +defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>, + VEX_LIG, XS, EVEX_CD8<32, CD8VT1>; -let ExeDomain = SSEPackedDouble in -defm VMOVSDZ : avx512_move_scalar<"movsd", FR64X, X86Movsd, v2f64, f64mem, - loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>, + VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; def : Pat<(f32 (X86select VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), - (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X), - VK1WM:$mask, (f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>; + (COPY_TO_REGCLASS (VMOVSSZrr_Intk (COPY_TO_REGCLASS FR32X:$src2, VR128X), + VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; def : Pat<(f64 (X86select VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), - (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), - VK1WM:$mask, (f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; + (COPY_TO_REGCLASS (VMOVSDZrr_Intk (COPY_TO_REGCLASS FR64X:$src2, VR128X), + VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>; def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; -// For the disassembler -let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { - def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), - (ins VR128X:$src1, FR32X:$src2), - "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], - IIC_SSE_MOV_S_RR>, - XS, EVEX_4V, VEX_LIG; - def VMOVSDZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst), - (ins VR128X:$src1, FR64X:$src2), - "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], - IIC_SSE_MOV_S_RR>, - XD, EVEX_4V, VEX_LIG, VEX_W; -} +defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), + "vmovss.s", "$src2, $src1", "$src1, $src2", []>, + XS, EVEX_4V, VEX_LIG; + +defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info, + (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), + "vmovsd.s", "$src2, $src1", "$src1, $src2", []>, + XD, EVEX_4V, VEX_LIG, VEX_W; let Predicates = [HasAVX512] in { let AddedComplexity = 15 in { @@ -2768,10 +3092,10 @@ let Predicates = [HasAVX512] in { (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>; // Extract and store. 
- def : Pat<(store (f32 (vector_extract (v4f32 VR128X:$src), (iPTR 0))), + def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), addr:$dst), (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>; - def : Pat<(store (f64 (vector_extract (v2f64 VR128X:$src), (iPTR 0))), + def : Pat<(store (f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))), addr:$dst), (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>; @@ -2835,7 +3159,7 @@ def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), (v2i64 VR128X:$src))))], IIC_SSE_MOVQ_RR>, EVEX, VEX_W; -let AddedComplexity = 20 in +let AddedComplexity = 20 , isCodeGenOnly = 1 in def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), (ins i128mem:$src), "vmovq\t{$src, $dst|$dst, $src}", @@ -2964,7 +3288,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, OpndItins itins, bit IsCommutable = 0> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2)), itins.rr, IsCommutable>, @@ -2972,7 +3296,7 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))), @@ -2986,7 +3310,7 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, avx512_binop_rm<opc, OpcodeStr, OpNode, _, itins, IsCommutable> { let mayLoad = 1 in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, @@ -3058,20 +3382,20 @@ multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, SDNode OpNode, OpndItins itins, Predicate prd, bit IsCommutable = 0> { - defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr, OpNode, itins, prd, + defm Q : avx512_binop_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, itins, prd, IsCommutable>; - defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr, OpNode, itins, prd, + defm D : avx512_binop_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, itins, prd, IsCommutable>; } multiclass avx512_binop_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, SDNode OpNode, OpndItins itins, Predicate prd, bit IsCommutable = 0> { - defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr, OpNode, itins, prd, + defm W : avx512_binop_rm_vl_w<opc_w, OpcodeStr#"w", OpNode, itins, prd, IsCommutable>; - defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr, OpNode, itins, prd, + defm B : avx512_binop_rm_vl_b<opc_b, OpcodeStr#"b", OpNode, itins, prd, IsCommutable>; } @@ -3086,15 +3410,15 @@ multiclass avx512_binop_rm_vl_all<bits<8> opc_b, bits<8> opc_w, } multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, - SDNode OpNode,X86VectorVTInfo _Src, + SDNode OpNode,X86VectorVTInfo _Src, X86VectorVTInfo _Dst, bit IsCommutable = 0> { - defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, 
_Src.RC:$src2), OpcodeStr, - "$src2, $src1","$src1, $src2", - (_Dst.VT (OpNode - (_Src.VT _Src.RC:$src1), + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2))), - itins.rr, IsCommutable>, + itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V; let mayLoad = 1 in { defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), @@ -3106,12 +3430,12 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, AVX512BIBase, EVEX_4V; defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), - (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2), + (ins _Src.RC:$src1, _Dst.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_Dst.BroadcastStr##", $src1", "$src1, ${src2}"##_Dst.BroadcastStr, - (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Dst.VT (X86VBroadcast + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Dst.VT (X86VBroadcast (_Dst.ScalarLdFrag addr:$src2)))))), itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B; @@ -3127,24 +3451,24 @@ defm VPADDS : avx512_binop_rm_vl_bw<0xEC, 0xED, "vpadds", X86adds, defm VPSUBS : avx512_binop_rm_vl_bw<0xE8, 0xE9, "vpsubs", X86subs, SSE_INTALU_ITINS_P, HasBWI, 0>; defm VPADDUS : avx512_binop_rm_vl_bw<0xDC, 0xDD, "vpaddus", X86addus, - SSE_INTALU_ITINS_P, HasBWI, 1>; + SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus, - SSE_INTALU_ITINS_P, HasBWI, 0>; -defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmull", mul, - SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmull", mul, - SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmull", mul, - SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; -defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulh", mulhs, SSE_INTALU_ITINS_P, - HasBWI, 1>; -defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhu", mulhu, SSE_INTMUL_ITINS_P, + SSE_INTALU_ITINS_P, HasBWI, 0>; +defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul, + SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; +defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul, + SSE_INTALU_ITINS_P, HasBWI, 1>; +defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul, + SSE_INTALU_ITINS_P, HasDQI, 1>, T8PD; +defm VPMULHW : avx512_binop_rm_vl_w<0xE5, "vpmulhw", mulhs, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrs", X86mulhrs, SSE_INTMUL_ITINS_P, - HasBWI, 1>, T8PD; +defm VPMULHUW : avx512_binop_rm_vl_w<0xE4, "vpmulhuw", mulhu, SSE_INTMUL_ITINS_P, + HasBWI, 1>; +defm VPMULHRSW : avx512_binop_rm_vl_w<0x0B, "vpmulhrsw", X86mulhrs, SSE_INTMUL_ITINS_P, + HasBWI, 1>, T8PD; defm VPAVG : avx512_binop_rm_vl_bw<0xE0, 0xE3, "vpavg", X86avg, - SSE_INTALU_ITINS_P, HasBWI, 1>; - + SSE_INTALU_ITINS_P, HasBWI, 1>; + multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins, SDNode OpNode, bit IsCommutable = 0> { @@ -3159,7 +3483,7 @@ multiclass avx512_binop_all<bits<8> opc, string OpcodeStr, OpndItins itins, v4i32x_info, v2i64x_info, IsCommutable>, EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_W; } -} +} defm VPMULDQ : avx512_binop_all<0x28, "vpmuldq", SSE_INTALU_ITINS_P, X86pmuldq, 1>,T8PD; @@ -3170,25 +3494,25 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _Src, X86VectorVTInfo _Dst> { let mayLoad = 1 in { defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), - (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), + (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_Src.BroadcastStr##", $src1", "$src1, 
${src2}"##_Src.BroadcastStr, - (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert - (_Src.VT (X86VBroadcast + (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert + (_Src.VT (X86VBroadcast (_Src.ScalarLdFrag addr:$src2))))))>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>; } } -multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, - SDNode OpNode,X86VectorVTInfo _Src, +multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, + SDNode OpNode,X86VectorVTInfo _Src, X86VectorVTInfo _Dst> { - defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), + defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, - "$src2, $src1","$src1, $src2", - (_Dst.VT (OpNode - (_Src.VT _Src.RC:$src1), + "$src2, $src1","$src1, $src2", + (_Dst.VT (OpNode + (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2)))>, EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V; let mayLoad = 1 in { @@ -3229,102 +3553,59 @@ multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr, v16i8x_info>, EVEX_V128; } } + +multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr, + SDNode OpNode, AVX512VLVectorVTInfo _Src, + AVX512VLVectorVTInfo _Dst> { + defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512, + _Dst.info512>, EVEX_V512; + let Predicates = [HasVLX] in { + defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256, + _Dst.info256>, EVEX_V256; + defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128, + _Dst.info128>, EVEX_V128; + } +} + let Predicates = [HasBWI] in { defm VPACKSSDW : avx512_packs_all_i32_i16<0x6B, "vpackssdw", X86Packss>, PD; defm VPACKUSDW : avx512_packs_all_i32_i16<0x2b, "vpackusdw", X86Packus>, T8PD; defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512BIBase, VEX_W; defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase, VEX_W; + + defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw, + avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD; + defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd, + avx512vl_i16_info, avx512vl_i32_info>, AVX512BIBase; } -defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxs", smax, +defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxs", smax, +defm VPMAXSW : avx512_binop_rm_vl_w<0xEE, "vpmaxsw", smax, SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPMAXS : avx512_binop_rm_vl_dq<0x3D, 0x3D, "vpmaxs", smax, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxu", umax, +defm VPMAXUB : avx512_binop_rm_vl_b<0xDE, "vpmaxub", umax, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxu", umax, +defm VPMAXUW : avx512_binop_rm_vl_w<0x3E, "vpmaxuw", umax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; defm VPMAXU : avx512_binop_rm_vl_dq<0x3F, 0x3F, "vpmaxu", umax, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpmins", smin, +defm VPMINSB : avx512_binop_rm_vl_b<0x38, "vpminsb", smin, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; -defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpmins", smin, +defm VPMINSW : avx512_binop_rm_vl_w<0xEA, "vpminsw", smin, SSE_INTALU_ITINS_P, HasBWI, 1>; defm VPMINS : avx512_binop_rm_vl_dq<0x39, 0x39, "vpmins", smin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; -defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminu", umin, +defm VPMINUB : avx512_binop_rm_vl_b<0xDA, "vpminub", umin, SSE_INTALU_ITINS_P, HasBWI, 1>; -defm 
VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminu", umin, +defm VPMINUW : avx512_binop_rm_vl_w<0x3A, "vpminuw", umin, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; defm VPMINU : avx512_binop_rm_vl_dq<0x3B, 0x3B, "vpminu", umin, SSE_INTALU_ITINS_P, HasAVX512, 1>, T8PD; - -//===----------------------------------------------------------------------===// -// AVX-512 - Unpack Instructions -//===----------------------------------------------------------------------===// - -multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt, - PatFrag mem_frag, RegisterClass RC, - X86MemOperand x86memop, string asm, - Domain d> { - def rr : AVX512PI<opc, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, RC:$src2), - asm, [(set RC:$dst, - (vt (OpNode RC:$src1, RC:$src2)))], - d>, EVEX_4V; - def rm : AVX512PI<opc, MRMSrcMem, - (outs RC:$dst), (ins RC:$src1, x86memop:$src2), - asm, [(set RC:$dst, - (vt (OpNode RC:$src1, - (bitconvert (mem_frag addr:$src2)))))], - d>, EVEX_4V; -} - -defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, loadv8f64, - VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, loadv8f64, - VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; -defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, loadv8f64, - VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, loadv8f64, - VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType OpVT, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop> { - def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], - IIC_SSE_UNPCK>, EVEX_4V; - def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), - (bitconvert (memop_frag addr:$src2)))))], - IIC_SSE_UNPCK>, EVEX_4V; -} -defm VPUNPCKLDQZ : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32, - VR512, loadv16i32, i512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64, - VR512, loadv8i64, i512mem>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; -defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32, - VR512, loadv16i32, i512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64, - VR512, loadv8i64, i512mem>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// @@ -3362,12 +3643,12 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, let isCodeGenOnly = 1, isCommutable = IsCommutable, Predicates = [HasAVX512] in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), - (ins _.FRC:$src1, _.FRC:$src2), + (ins _.FRC:$src1, _.FRC:$src2), 
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], itins.rr>; def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), - (ins _.FRC:$src1, _.ScalarMemOp:$src2), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2)))], itins.rr>; @@ -3375,7 +3656,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, } multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, - SDNode VecNode, OpndItins itins, bit IsCommutable> { + SDNode VecNode, OpndItins itins, bit IsCommutable = 0> { defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, @@ -3470,7 +3751,7 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, EVEX_4V, EVEX_B; } -multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, bit IsCommutable = 0> { defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info, IsCommutable>, EVEX_V512, PS, @@ -3514,7 +3795,7 @@ defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, 1>, avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>; defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, 1>, avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>; -defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, +defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub>, avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>; defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv>, avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>; @@ -3550,13 +3831,34 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, }//let mayLoad = 1 } -multiclass avx512_fp_scalef_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { - defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>, +multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>; + let mayLoad = 1 in { + defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, + "$src2, $src1", "$src1, $src2", + (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>; + }//let mayLoad = 1 +} + +multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode> { + defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>, avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>, + defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>, avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; + defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f32x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNode, SSE_ALU_ITINS_S.s>, + EVEX_4V,EVEX_CD8<32, CD8VT1>; + defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNode, f64x_info>, + avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNode, SSE_ALU_ITINS_S.d>, + EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; + // Define only if AVX512VL feature is present. 
let Predicates = [HasVLX] in { defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>, @@ -3569,7 +3871,7 @@ multiclass avx512_fp_scalef_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; } } -defm VSCALEF : avx512_fp_scalef_all<0x2C, "vscalef", X86scalef>, T8PD; +defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef>, T8PD; //===----------------------------------------------------------------------===// // AVX-512 VPTESTM instructions @@ -3586,7 +3888,7 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode, defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), + (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))))>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; @@ -3748,12 +4050,12 @@ multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, VTInfo.info256>, EVEX_V256; defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, VTInfo.info128>, - avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, VTInfo.info128>, EVEX_V128; } } -multiclass avx512_shift_rmi_w<bits<8> opcw, +multiclass avx512_shift_rmi_w<bits<8> opcw, Format ImmFormR, Format ImmFormM, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in @@ -3846,6 +4148,27 @@ multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr, avx512vl_i64_info>, VEX_W; } +// Use 512bit version to implement 128/256 bit in case NoVLX. +multiclass avx512_var_shift_w_lowering<AVX512VLVectorVTInfo _, SDNode OpNode> { + let Predicates = [HasBWI, NoVLX] in { + def : Pat<(_.info256.VT (OpNode (_.info256.VT _.info256.RC:$src1), + (_.info256.VT _.info256.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME#"WZrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + sub_ymm)>; + + def : Pat<(_.info128.VT (OpNode (_.info128.VT _.info128.RC:$src1), + (_.info128.VT _.info128.RC:$src2))), + (EXTRACT_SUBREG + (!cast<Instruction>(NAME#"WZrr") + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + (INSERT_SUBREG (_.info512.VT (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), + sub_xmm)>; + } +} + multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in @@ -3861,11 +4184,14 @@ multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, } defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>, - avx512_var_shift_w<0x12, "vpsllvw", shl>; + avx512_var_shift_w<0x12, "vpsllvw", shl>, + avx512_var_shift_w_lowering<avx512vl_i16_info, shl>; defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>, - avx512_var_shift_w<0x11, "vpsravw", sra>; + avx512_var_shift_w<0x11, "vpsravw", sra>, + avx512_var_shift_w_lowering<avx512vl_i16_info, sra>; defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, - avx512_var_shift_w<0x10, "vpsrlvw", srl>; + avx512_var_shift_w<0x10, "vpsrlvw", srl>, + avx512_var_shift_w_lowering<avx512vl_i16_info, srl>; defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>; defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>; @@ -3916,19 +4242,77 @@ defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq", defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd", X86VPermi, avx512vl_f64_info>, EVEX, AVX512AIi8Base, EVEX_CD8<64, 
CD8VF>, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - VPERMIL +//===----------------------------------------------------------------------===// +multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, X86VectorVTInfo Ctrl> { + defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode _.RC:$src1, + (Ctrl.VT Ctrl.RC:$src2)))>, + T8PD, EVEX_4V; + let mayLoad = 1 in { + defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + "${src2}"##_.BroadcastStr##", $src1", + "$src1, ${src2}"##_.BroadcastStr, + (_.VT (OpNode + _.RC:$src1, + (Ctrl.VT (X86VBroadcast + (Ctrl.ScalarLdFrag addr:$src2)))))>, + T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + }//let mayLoad = 1 +} + +multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar, + AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + let Predicates = [HasAVX512] in { + defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info512, + Ctrl.info512>, EVEX_V512; + } + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info128, + Ctrl.info128>, EVEX_V128; + defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info256, + Ctrl.info256>, EVEX_V256; + } +} + +multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar, + AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + + defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, _, Ctrl>; + defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr, + X86VPermilpi, _>, + EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>; +} + +defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info, + avx512vl_i32_info>; +defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info, + avx512vl_i64_info>, VEX_W; //===----------------------------------------------------------------------===// // AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW //===----------------------------------------------------------------------===// defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd", - X86PShufd, avx512vl_i32_info>, + X86PShufd, avx512vl_i32_info>, EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>; defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw", - X86PShufhw>, EVEX, AVX512XSIi8Base, VEX_W; + X86PShufhw>, EVEX, AVX512XSIi8Base; defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw", - X86PShuflw>, EVEX, AVX512XDIi8Base, VEX_W; - + X86PShuflw>, EVEX, AVX512XDIi8Base; + multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512; @@ -3942,55 +4326,6 @@ multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>; //===----------------------------------------------------------------------===// -// AVX-512 - MOVDDUP -//===----------------------------------------------------------------------===// - 
-multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT,
-                          X86MemOperand x86memop, PatFrag memop_frag> {
-def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
-                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                   [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
-def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
-                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                   [(set RC:$dst,
-                     (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
-}
-
-defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, loadv8f64>,
-                 VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
-          (VMOVDDUPZrm addr:$src)>;
-
-//===---------------------------------------------------------------------===//
-// Replicate Single FP - MOVSHDUP and MOVSLDUP
-//===---------------------------------------------------------------------===//
-multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
-                       ValueType vt, RegisterClass RC, PatFrag mem_frag,
-                       X86MemOperand x86memop> {
-  def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
-                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                      [(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX;
-  let mayLoad = 1 in
-  def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
-                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX;
-}
-
-defm VMOVSHDUPZ : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
-                                       v16f32, VR512, loadv16f32, f512mem>, EVEX_V512,
-                                       EVEX_CD8<32, CD8VF>;
-defm VMOVSLDUPZ : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
-                                       v16f32, VR512, loadv16f32, f512mem>, EVEX_V512,
-                                       EVEX_CD8<32, CD8VF>;
-
-def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>;
-def : Pat<(v16i32 (X86Movshdup (loadv16i32 addr:$src))),
-          (VMOVSHDUPZrm addr:$src)>;
-def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>;
-def : Pat<(v16i32 (X86Movsldup (loadv16i32 addr:$src))),
-          (VMOVSLDUPZrm addr:$src)>;
-
-//===----------------------------------------------------------------------===//
 // Move Low to High and High to Low packed FP Instructions
 //===----------------------------------------------------------------------===//
 def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
@@ -4017,6 +4352,115 @@ let Predicates = [HasAVX512] in {
 }
 //===----------------------------------------------------------------------===//
+// VMOVHPS/PD VMOVLPS Instructions
+// All patterns were taken from the SSE implementation.
+//===----------------------------------------------------------------------===// +multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayLoad = 1 in + def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, f64mem:$src2), + !strconcat(OpcodeStr, + "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst, + (OpNode _.RC:$src1, + (_.VT (bitconvert + (v2f64 (scalar_to_vector (loadf64 addr:$src2)))))))], + IIC_SSE_MOV_LH>, EVEX_4V; +} + +defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Movlhpd, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; +defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps, + v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; +defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movlpd, + v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; + +let Predicates = [HasAVX512] in { + // VMOVHPS patterns + def : Pat<(X86Movlhps VR128X:$src1, + (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), + (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(X86Movlhps VR128X:$src1, + (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; + // VMOVHPD patterns + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, + (scalar_to_vector (loadf64 addr:$src2)))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Unpckl VR128X:$src1, + (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), + (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; + // VMOVLPS patterns + def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))), + (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v4i32 (X86Movlps VR128X:$src1, (load addr:$src2))), + (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; + // VMOVLPD patterns + def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2i64 (X86Movlpd VR128X:$src1, (load addr:$src2))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; + def : Pat<(v2f64 (X86Movsd VR128X:$src1, + (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), + (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; +} + +let mayStore = 1 in { +def VMOVHPSZ128mr : AVX512PSI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (X86Unpckh (bc_v2f64 (v4f32 VR128X:$src)), + (bc_v2f64 (v4f32 VR128X:$src))), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<32, CD8VT2>; +def VMOVHPDZ128mr : AVX512PDI<0x17, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovhpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract + (v2f64 (X86Unpckh VR128X:$src, VR128X:$src)), + (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +def VMOVLPSZ128mr : AVX512PSI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlps\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128X:$src)), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<32, CD8VT2>; +def VMOVLPDZ128mr : AVX512PDI<0x13, MRMDestMem, (outs), + (ins f64mem:$dst, VR128X:$src), + "vmovlpd\t{$src, $dst|$dst, $src}", + [(store (f64 (vector_extract (v2f64 VR128X:$src), + (iPTR 0))), addr:$dst)], + IIC_SSE_MOV_LH>, + EVEX, EVEX_CD8<64, CD8VT1>, VEX_W; +} +let Predicates = [HasAVX512] in { + // VMOVHPD patterns + def : Pat<(store (f64 (vector_extract + (v2f64 (X86VPermilpi VR128X:$src, (i8 
1))), + (iPTR 0))), addr:$dst), + (VMOVHPDZ128mr addr:$dst, VR128X:$src)>; + // VMOVLPS patterns + def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; + def : Pat<(store (v4i32 (X86Movlps + (bc_v4i32 (loadv2i64 addr:$src1)), VR128X:$src2)), addr:$src1), + (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; + // VMOVLPD patterns + def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; + def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128X:$src2)), + addr:$src1), + (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; +} +//===----------------------------------------------------------------------===// // FMA - Fused Multiply Operations // @@ -4034,7 +4478,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>, - AVX512FMA3Base; + AVX512FMA3Base; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), @@ -4435,50 +4879,55 @@ def : Pat<(f64 (uint_to_fp GR64:$src)), //===----------------------------------------------------------------------===// // AVX-512 Scalar convert from float/double to integer //===----------------------------------------------------------------------===// -multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - Intrinsic Int, Operand memop, ComplexPattern mem_cpat, - string asm> { -let hasSideEffects = 0 in { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG, - Requires<[HasAVX512]>; - let mayLoad = 1 in - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), - !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG, - Requires<[HasAVX512]>; -} // hasSideEffects = 0 +multiclass avx512_cvt_s_int_round<bits<8> opc, RegisterClass SrcRC, + RegisterClass DstRC, Intrinsic Int, + Operand memop, ComplexPattern mem_cpat, string asm> { + let hasSideEffects = 0, Predicates = [HasAVX512] in { + def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG; + def rb : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src, AVX512RC:$rc), + !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), []>, + EVEX, VEX_LIG, EVEX_B, EVEX_RC; + let mayLoad = 1 in + def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG; + } // hasSideEffects = 0, Predicates = [HasAVX512] } -let Predicates = [HasAVX512] in { + // Convert float/double to signed/unsigned int 32/64 -defm VCVTSS2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse_cvtss2si, +defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse_cvtss2si, ssmem, sse_load_f32, "cvtss2si">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse_cvtss2si64, +defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, + int_x86_sse_cvtss2si64, ssmem, sse_load_f32, "cvtss2si">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtss2usi, +defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, + int_x86_avx512_cvtss2usi, ssmem, sse_load_f32, 
"cvtss2usi">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTSS2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64, +defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64, int_x86_avx512_cvtss2usi64, ssmem, sse_load_f32, "cvtss2usi">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; -defm VCVTSD2SIZ: avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si, +defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2SI64Z: avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse2_cvtsd2si64, +defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, VR128X, GR64, + int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2USIZ: avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtsd2usi, +defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, VR128X, GR32, + int_x86_avx512_cvtsd2usi, sdmem, sse_load_f64, "cvtsd2usi">, XD, EVEX_CD8<64, CD8VT1>; -defm VCVTSD2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64, +defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, VR128X, GR64, int_x86_avx512_cvtsd2usi64, sdmem, sse_load_f64, "cvtsd2usi">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; -let isCodeGenOnly = 1 in { +let isCodeGenOnly = 1 , Predicates = [HasAVX512] in { defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", SSE_CVT_Scalar, 0>, XS, EVEX_4V; @@ -4495,121 +4944,170 @@ let isCodeGenOnly = 1 in { defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X, int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}", SSE_CVT_Scalar, 0>, XD, EVEX_4V; -} // isCodeGenOnly = 1 +} // isCodeGenOnly = 1, Predicates = [HasAVX512] // Convert float/double to signed/unsigned int 32/64 with truncation -let isCodeGenOnly = 1 in { - defm Int_VCVTTSS2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse_cvttss2si, - ssmem, sse_load_f32, "cvttss2si">, - XS, EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSS2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64, - int_x86_sse_cvttss2si64, ssmem, sse_load_f32, - "cvttss2si">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSD2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse2_cvttsd2si, - sdmem, sse_load_f64, "cvttsd2si">, XD, - EVEX_CD8<64, CD8VT1>; - defm Int_VCVTTSD2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64, - int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, - "cvttsd2si">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; - defm Int_VCVTTSS2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32, - int_x86_avx512_cvttss2usi, ssmem, sse_load_f32, - "cvttss2usi">, XS, EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSS2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64, - int_x86_avx512_cvttss2usi64, ssmem, - sse_load_f32, "cvttss2usi">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; - defm Int_VCVTTSD2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32, - int_x86_avx512_cvttsd2usi, - sdmem, sse_load_f64, "cvttsd2usi">, XD, - EVEX_CD8<64, CD8VT1>; - defm Int_VCVTTSD2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64, - int_x86_avx512_cvttsd2usi64, sdmem, - sse_load_f64, "cvttsd2usi">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; -} // isCodeGenOnly = 1 - -multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag, - string asm> { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), +multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, + X86VectorVTInfo _DstRC, SDNode OpNode, + SDNode OpNodeRnd>{ +let Predicates = [HasAVX512] in { + def rr : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins 
_SrcRC.FRC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX; - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), + [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX; + def rb : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + []>, EVEX, EVEX_B; + def rm : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.MemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX; -} - -defm VCVTTSS2SIZ : avx512_cvt_s<0x2C, FR32X, GR32, fp_to_sint, f32mem, - loadf32, "cvttss2si">, XS, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2USIZ : avx512_cvt_s<0x78, FR32X, GR32, fp_to_uint, f32mem, - loadf32, "cvttss2usi">, XS, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2SI64Z : avx512_cvt_s<0x2C, FR32X, GR64, fp_to_sint, f32mem, - loadf32, "cvttss2si">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSS2USI64Z : avx512_cvt_s<0x78, FR32X, GR64, fp_to_uint, f32mem, - loadf32, "cvttss2usi">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; -defm VCVTTSD2SIZ : avx512_cvt_s<0x2C, FR64X, GR32, fp_to_sint, f64mem, - loadf64, "cvttsd2si">, XD, - EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2USIZ : avx512_cvt_s<0x78, FR64X, GR32, fp_to_uint, f64mem, - loadf64, "cvttsd2usi">, XD, - EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2SI64Z : avx512_cvt_s<0x2C, FR64X, GR64, fp_to_sint, f64mem, - loadf64, "cvttsd2si">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; -defm VCVTTSD2USI64Z : avx512_cvt_s<0x78, FR64X, GR64, fp_to_uint, f64mem, - loadf64, "cvttsd2usi">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; + [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, + EVEX; + + let isCodeGenOnly = 1,hasSideEffects = 0 in { + def rr_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, + (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG; + def rb_Int : SI<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), + [(set _DstRC.RC:$dst, (OpNodeRnd _SrcRC.RC:$src, + (i32 FROUND_NO_EXC)))]>, + EVEX,VEX_LIG , EVEX_B; + let mayLoad = 1 in + def rm_Int : SI<opc, MRMSrcMem, (outs _DstRC.RC:$dst), + (ins _SrcRC.MemOp:$src), + !strconcat(asm,"\t{$src, $dst|$dst, $src}"), + []>, EVEX, VEX_LIG; + + } // isCodeGenOnly = 1, hasSideEffects = 0 +} //HasAVX512 +} + + +defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i32x_info, + fp_to_sint,X86cvttss2IntRnd>, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "cvttss2si", f32x_info, i64x_info, + fp_to_sint,X86cvttss2IntRnd>, + VEX_W, XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i32x_info, + fp_to_sint,X86cvttsd2IntRnd>, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "cvttsd2si", f64x_info, i64x_info, + fp_to_sint,X86cvttsd2IntRnd>, + VEX_W, XD, EVEX_CD8<64, CD8VT1>; + +defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i32x_info, + fp_to_uint,X86cvttss2UIntRnd>, + XS, EVEX_CD8<32, CD8VT1>; +defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "cvttss2usi", f32x_info, i64x_info, + fp_to_uint,X86cvttss2UIntRnd>, + XS,VEX_W, EVEX_CD8<32, CD8VT1>; +defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i32x_info, + fp_to_uint,X86cvttsd2UIntRnd>, + XD, EVEX_CD8<64, CD8VT1>; +defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "cvttsd2usi", f64x_info, i64x_info, + 
fp_to_uint,X86cvttsd2UIntRnd>, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; +let Predicates = [HasAVX512] in { + def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))), + (VCVTTSS2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))), + (VCVTTSS2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))), + (VCVTTSD2SIZrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>; + def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))), + (VCVTTSD2SI64Zrr_Int (COPY_TO_REGCLASS VR128X:$src, FR64X))>; + } // HasAVX512 //===----------------------------------------------------------------------===// // AVX-512 Convert form float to double and back //===----------------------------------------------------------------------===// -let hasSideEffects = 0 in { -def VCVTSS2SDZrr : AVX512XSI<0x5A, MRMSrcReg, (outs FR64X:$dst), - (ins FR32X:$src1, FR32X:$src2), - "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; -let mayLoad = 1 in -def VCVTSS2SDZrm : AVX512XSI<0x5A, MRMSrcMem, (outs FR64X:$dst), - (ins FR32X:$src1, f32mem:$src2), - "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, - EVEX_CD8<32, CD8VT1>; - -// Convert scalar double to scalar single -def VCVTSD2SSZrr : AVX512XDI<0x5A, MRMSrcReg, (outs FR32X:$dst), - (ins FR64X:$src1, FR64X:$src2), - "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX_4V, VEX_LIG, VEX_W, Sched<[WriteCvtF2F]>; -let mayLoad = 1 in -def VCVTSD2SSZrm : AVX512XDI<0x5A, MRMSrcMem, (outs FR32X:$dst), - (ins FR64X:$src1, f64mem:$src2), - "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX_4V, VEX_LIG, VEX_W, - Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_CD8<64, CD8VT1>; -} - -def : Pat<(f64 (fextend FR32X:$src)), (VCVTSS2SDZrr FR32X:$src, FR32X:$src)>, - Requires<[HasAVX512]>; -def : Pat<(fextend (loadf32 addr:$src)), - (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512]>; - -def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, +multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNode> { + defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2)))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; + defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (_.VT (OpNode (_Src.VT _Src.RC:$src1), + (_Src.VT (scalar_to_vector + (_Src.ScalarLdFrag addr:$src2)))))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>; +} + +// Scalar Coversion with SAE - suppress all exceptions +multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode OpNodeRnd> { + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, + "{sae}, $src2, $src1", "$src1, $src2, {sae}", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2), + (i32 FROUND_NO_EXC)))>, + EVEX_4V, VEX_LIG, EVEX_B; +} + +// Scalar Conversion with rounding control (RC) +multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, + X86VectorVTInfo _Src, SDNode 
OpNodeRnd> { + defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _Src.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src1), + (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>, + EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, + EVEX_B, EVEX_RC; +} +multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, + X86VectorVTInfo _dst> { + let Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src, + OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, + EVEX_V512, XD; + } +} + +multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, X86VectorVTInfo _src, + X86VectorVTInfo _dst> { + let Predicates = [HasAVX512] in { + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNode>, + avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>, + EVEX_CD8<32, CD8VT1>, XS, EVEX_V512; + } +} +defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", X86fround, + X86froundRnd, f64x_info, f32x_info>; +defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpext, + X86fpextRnd,f32x_info, f64x_info >; + +def : Pat<(f64 (fextend FR32X:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X), + (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>, + Requires<[HasAVX512]>; +def : Pat<(f64 (fextend (loadf32 addr:$src))), + (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, + Requires<[HasAVX512]>; + +def : Pat<(f64 (extloadf32 addr:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, Requires<[HasAVX512, OptForSize]>; -def : Pat<(extloadf32 addr:$src), - (VCVTSS2SDZrr (f32 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>, - Requires<[HasAVX512, OptForSpeed]>; +def : Pat<(f64 (extloadf32 addr:$src)), + (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>, + Requires<[HasAVX512, OptForSpeed]>; -def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>, +def : Pat<(f32 (fround FR64X:$src)), + (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X), + (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>, Requires<[HasAVX512]>; - //===----------------------------------------------------------------------===// // AVX-512 Vector convert from signed/unsigned integer to float/double // and from float/double to signed/unsigned integer @@ -4992,7 +5490,7 @@ defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUlongToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>; -let Predicates = [NoVLX] in { +let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>; @@ -5024,40 +5522,102 @@ let Predicates = [HasAVX512] in { //===----------------------------------------------------------------------===// // Half precision conversion instructions //===----------------------------------------------------------------------===// -multiclass avx512_cvtph2ps<RegisterClass destRC, RegisterClass srcRC, - X86MemOperand x86memop> { - def rr : AVX5128I<0x13, MRMSrcReg, (outs destRC:$dst), (ins srcRC:$src), - "vcvtph2ps\t{$src, $dst|$dst, 
$src}", - []>, EVEX; - let hasSideEffects = 0, mayLoad = 1 in - def rm : AVX5128I<0x13, MRMSrcMem, (outs destRC:$dst), (ins x86memop:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, EVEX; -} - -multiclass avx512_cvtps2ph<RegisterClass destRC, RegisterClass srcRC, - X86MemOperand x86memop> { - def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst), - (ins srcRC:$src1, i32u8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, EVEX; - let hasSideEffects = 0, mayStore = 1 in - def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), - (ins x86memop:$dst, srcRC:$src1, i32u8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX; +multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop, PatFrag ld_frag> { + defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), + "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT _src.RC:$src), + (i32 FROUND_CURRENT))>, T8PD; + let hasSideEffects = 0, mayLoad = 1 in { + defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), + "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))), + (i32 FROUND_CURRENT))>, T8PD; + } } -defm VCVTPH2PSZ : avx512_cvtph2ps<VR512, VR256X, f256mem>, EVEX_V512, - EVEX_CD8<32, CD8VH>; -defm VCVTPS2PHZ : avx512_cvtps2ph<VR256X, VR512, f256mem>, EVEX_V512, - EVEX_CD8<32, CD8VH>; +multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> { + defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), + "vcvtph2ps", "{sae}, $src", "$src, {sae}", + (X86cvtph2ps (_src.VT _src.RC:$src), + (i32 FROUND_NO_EXC))>, T8PD, EVEX_B; -def : Pat<(v16i16 (int_x86_avx512_mask_vcvtps2ph_512 (v16f32 VR512:$src), - imm:$rc, (bc_v16i16(v8i32 immAllZerosV)), (i16 -1))), - (VCVTPS2PHZrr VR512:$src, imm:$rc)>; +} -def : Pat<(v16f32 (int_x86_avx512_mask_vcvtph2ps_512 (v16i16 VR256X:$src), - (bc_v16f32(v16i32 immAllZerosV)), (i16 -1), (i32 FROUND_CURRENT))), - (VCVTPH2PSZrr VR256X:$src)>; +let Predicates = [HasAVX512] in { + defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>, + avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + let Predicates = [HasVLX] in { + defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem, + loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem, + loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; + } +} + +multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop> { + defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph", "$src2, $src1", "$src1, $src2", + (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>, AVX512AIi8Base; + let hasSideEffects = 0, mayStore = 1 in { + def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), (i32 FROUND_CURRENT) )), + addr:$dst)]>; + def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + []>, EVEX_K; + } +} +multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, 
X86VectorVTInfo _src> { + defm rb : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph", "$src2, {sae}, $src1", "$src1, $src2, {sae}", + (X86cvtps2ph (_src.VT _src.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B, AVX512AIi8Base; +} +let Predicates = [HasAVX512] in { + defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>, + avx512_cvtps2ph_sae<v16i16x_info, v16f32_info>, + EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + let Predicates = [HasVLX] in { + defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>, + EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; + defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>, + EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; + } +} + +// Unordered/Ordered scalar fp compare with Sea and set EFLAGS +multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, SDNode OpNode, + string OpcodeStr> { + def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), + [(set EFLAGS, (OpNode (_.VT _.RC:$src1), _.RC:$src2, + (i32 FROUND_NO_EXC)))], + IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128, + Sched<[WriteFAdd]>; +} + +let Defs = [EFLAGS], Predicates = [HasAVX512] in { + defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, X86ucomiSae, "vucomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, X86ucomiSae, "vucomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; + defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, X86comiSae, "vcomiss">, + AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; + defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, X86comiSae, "vcomisd">, + AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; +} let Defs = [EFLAGS], Predicates = [HasAVX512] in { defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, @@ -5067,10 +5627,10 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { "ucomisd">, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; let Pattern = []<dag> in { - defm VCOMISSZ : sse12_ord_cmp<0x2F, VR128X, undef, v4f32, f128mem, load, + defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32, "comiss">, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; - defm VCOMISDZ : sse12_ord_cmp<0x2F, VR128X, undef, v2f64, f128mem, load, + defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64, "comisd">, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } @@ -5092,50 +5652,31 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { } /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd -multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop> { - let hasSideEffects = 0 in { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; +multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let hasSideEffects = 0, AddedComplexity = 20 , Predicates = [HasAVX512] in { + defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V; let mayLoad = 1 in { - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, EVEX_4V; + defm rm : AVX512_maskable_scalar<opc, 
MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNode (_.VT _.RC:$src1), + (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V; } } } -defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; -defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", FR32X, f32mem>, - EVEX_CD8<32, CD8VT1>; -defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", FR64X, f64mem>, - VEX_W, EVEX_CD8<64, CD8VT1>; - -def : Pat <(v4f32 (int_x86_avx512_rcp14_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRCP14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; - -def : Pat <(v2f64 (int_x86_avx512_rcp14_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRCP14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; - -def : Pat <(v4f32 (int_x86_avx512_rsqrt14_ss (v4f32 VR128X:$src1), - (v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRSQRT14SSrr (COPY_TO_REGCLASS VR128X:$src1, FR32X), - (COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>; - -def : Pat <(v2f64 (int_x86_avx512_rsqrt14_sd (v2f64 VR128X:$src1), - (v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1))), - (COPY_TO_REGCLASS (VRSQRT14SDrr (COPY_TO_REGCLASS VR128X:$src1, FR64X), - (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>; +defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD; +defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; +defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD; +defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -5183,20 +5724,6 @@ multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>; defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; -def : Pat <(v16f32 (int_x86_avx512_rsqrt14_ps_512 (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VRSQRT14PSZr VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rsqrt14_pd_512 (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VRSQRT14PDZr VR512:$src)>; - -def : Pat <(v16f32 (int_x86_avx512_rcp14_ps_512 (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VRCP14PSZr VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VRCP14PDZr VR512:$src)>; - /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode> { @@ -5232,6 +5759,8 @@ let hasSideEffects = 0, Predicates = [HasERI] in { defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V; defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V; } + +defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V; /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd multiclass 
avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, @@ -5322,67 +5851,6 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, } } -multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, - Intrinsic F32Int, Intrinsic F64Int, - OpndItins itins_s, OpndItins itins_d> { - def SSZr : SI<opc, MRMSrcReg, (outs FR32X:$dst), - (ins FR32X:$src1, FR32X:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [], itins_s.rr>, XS, EVEX_4V; - let isCodeGenOnly = 1 in - def SSZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F32Int VR128X:$src1, VR128X:$src2))], - itins_s.rr>, XS, EVEX_4V; - let mayLoad = 1 in { - def SSZm : SI<opc, MRMSrcMem, (outs FR32X:$dst), - (ins FR32X:$src1, f32mem:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; - let isCodeGenOnly = 1 in - def SSZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, ssmem:$src2), - !strconcat(OpcodeStr, - "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F32Int VR128X:$src1, sse_load_f32:$src2))], - itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>; - } - def SDZr : SI<opc, MRMSrcReg, (outs FR64X:$dst), - (ins FR64X:$src1, FR64X:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - XD, EVEX_4V, VEX_W; - let isCodeGenOnly = 1 in - def SDZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst), - (ins VR128X:$src1, VR128X:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, VR128X:$src2))], - itins_s.rr>, XD, EVEX_4V, VEX_W; - let mayLoad = 1 in { - def SDZm : SI<opc, MRMSrcMem, (outs FR64X:$dst), - (ins FR64X:$src1, f64mem:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; - let isCodeGenOnly = 1 in - def SDZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst), - (ins VR128X:$src1, sdmem:$src2), - !strconcat(OpcodeStr, - "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128X:$dst, - (F64Int VR128X:$src1, sse_load_f64:$src2))]>, - XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>; - } -} - multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, @@ -5416,93 +5884,77 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr, v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; } +multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, + string SUFF, SDNode OpNode, SDNode OpNodeRnd> { + + defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in + defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, + "$src2, $src1", "$src1, $src2", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src2))), + (i32 FROUND_CURRENT))>; + + defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, + "$rc, $src2, $src1", "$src1, $src2, $rc", + (OpNodeRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 
imm:$rc))>, + EVEX_B, EVEX_RC; + + let isCodeGenOnly = 1 in { + def r : I<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>; + + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>; + } + + def : Pat<(_.EltVT (OpNode _.FRC:$src)), + (!cast<Instruction>(NAME#SUFF#Zr) + (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>; + + def : Pat<(_.EltVT (OpNode (load addr:$src))), + (!cast<Instruction>(NAME#SUFF#Zm) + (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[OptForSize]>; +} + +multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> { + defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt, + X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS; + defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt, + X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W; +} + defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>, avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>; -defm VSQRT : avx512_sqrt_scalar<0x51, "sqrt", - int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, - SSE_SQRTSS, SSE_SQRTSD>; +defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG; let Predicates = [HasAVX512] in { - def : Pat<(f32 (fsqrt FR32X:$src)), - (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>; - def : Pat<(f32 (fsqrt (load addr:$src))), - (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>, - Requires<[OptForSize]>; - def : Pat<(f64 (fsqrt FR64X:$src)), - (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>; - def : Pat<(f64 (fsqrt (load addr:$src))), - (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>, - Requires<[OptForSize]>; - def : Pat<(f32 (X86frsqrt FR32X:$src)), - (VRSQRT14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>; + (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>; def : Pat<(f32 (X86frsqrt (load addr:$src))), - (VRSQRT14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>, + (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, Requires<[OptForSize]>; - def : Pat<(f32 (X86frcp FR32X:$src)), - (VRCP14SSrr (f32 (IMPLICIT_DEF)), FR32X:$src)>; + (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>; def : Pat<(f32 (X86frcp (load addr:$src))), - (VRCP14SSrm (f32 (IMPLICIT_DEF)), addr:$src)>, + (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, Requires<[OptForSize]>; - - def : Pat<(int_x86_sse_sqrt_ss VR128X:$src), - (COPY_TO_REGCLASS (VSQRTSSZr (f32 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128X:$src, FR32)), - VR128X)>; - def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src), - (VSQRTSSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>; - - def : Pat<(int_x86_sse2_sqrt_sd VR128X:$src), - (COPY_TO_REGCLASS (VSQRTSDZr (f64 (IMPLICIT_DEF)), - (COPY_TO_REGCLASS VR128X:$src, FR64)), - VR128X)>; - def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src), - (VSQRTSDZm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>; -} - - -multiclass avx512_rndscale<bits<8> opc, string OpcodeStr, - X86MemOperand x86memop, RegisterClass RC, - PatFrag mem_frag, Domain d> { -let ExeDomain = d in { - // Intrinsic operation, reg. 
- // Vector intrinsic operation, reg - def r : AVX512AIi8<opc, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX; - - // Vector intrinsic operation, mem - def m : AVX512AIi8<opc, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, EVEX; -} // ExeDomain } -defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512, - loadv16f32, SSEPackedSingle>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - -def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1), - imm:$src2, (v16f32 VR512:$src1), (i16 -1), - FROUND_CURRENT)), - (VRNDSCALEPSZr VR512:$src1, imm:$src2)>; - - -defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512, - loadv8f64, SSEPackedDouble>, EVEX_V512, - VEX_W, EVEX_CD8<64, CD8VF>; - -def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1), - imm:$src2, (v8f64 VR512:$src1), (i8 -1), - FROUND_CURRENT)), - (VRNDSCALEPDZr VR512:$src1, imm:$src2)>; - multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { @@ -5510,20 +5962,20 @@ avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), (i32 FROUND_CURRENT)))>; defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, - "{sae}, $src3, $src2, $src1", "$src1, $src2, $src3, {sae}", - (_.VT (X86RndScale (_.VT _.RC:$src1), (_.VT _.RC:$src2), + "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3", + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B; let mayLoad = 1 in defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (_.VT (X86RndScale (_.VT _.RC:$src1), + (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), (i32 imm:$src3), (i32 FROUND_CURRENT)))>; } @@ -5568,109 +6020,238 @@ defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>, defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>; -let Predicates = [HasAVX512] in { -def : Pat<(v16f32 (ffloor VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x1))>; -def : Pat<(v16f32 (fnearbyint VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0xC))>; -def : Pat<(v16f32 (fceil VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x2))>; -def : Pat<(v16f32 (frint VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x4))>; -def : Pat<(v16f32 (ftrunc VR512:$src)), - (VRNDSCALEPSZr VR512:$src, (i32 0x3))>; - -def : Pat<(v8f64 (ffloor VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x1))>; -def : Pat<(v8f64 (fnearbyint VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0xC))>; -def : Pat<(v8f64 (fceil VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x2))>; -def : Pat<(v8f64 (frint VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x4))>; -def : Pat<(v8f64 (ftrunc VR512:$src)), - (VRNDSCALEPDZr VR512:$src, (i32 0x3))>; -} 
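Reference sketch, not part of the diff: the scalar square-root and round-scale forms defined above are the operations programs reach through the AVX-512F scalar intrinsics. The snippet below is a minimal C++ illustration of that mapping; it assumes <immintrin.h> and a compiler targeting AVX-512F (for example, built with -mavx512f), and the helper function names are invented for the example.

#include <immintrin.h>

// vsqrtss with an embedded rounding override ({rn-sae}): square root of the
// low float of b, rounded to nearest, with the upper lanes copied from a.
static __m128 sqrt_ss_rn(__m128 a, __m128 b) {
  return _mm_sqrt_round_ss(a, b, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

// vrndscaless with imm8 = 0x01: round the low float of b toward negative
// infinity (a scalar floor), again passing the upper lanes through from a.
static __m128 floor_ss(__m128 a, __m128 b) {
  return _mm_roundscale_ss(a, b, 0x01);
}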
//------------------------------------------------- // Integer truncate and extend operations //------------------------------------------------- -multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, - RegisterClass dstRC, RegisterClass srcRC, - RegisterClass KRC, X86MemOperand x86memop> { - def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), - (ins srcRC:$src), - !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), +multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo, + X86MemOperand x86memop> { + + defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>, + EVEX, T8XS; + + // for intrinsic patter match + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + undef)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.ImmAllZerosV)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + DestInfo.RC:$src0)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0, + DestInfo.KRCWM:$mask , + SrcInfo.RC:$src1)>; + + let mayStore = 1 in { + def mr : AVX512XS8I<opc, MRMDestMem, (outs), + (ins x86memop:$dst, SrcInfo.RC:$src), + OpcodeStr # "\t{$src, $dst |$dst, $src}", []>, EVEX; - def rrk : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), - (ins KRC:$mask, srcRC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), + def mrk : AVX512XS8I<opc, MRMDestMem, (outs), + (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src), + OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", []>, EVEX, EVEX_K; + }//mayStore = 1 +} - def rrkz : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst), - (ins KRC:$mask, srcRC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; +multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo, + X86VectorVTInfo DestInfo, + PatFrag truncFrag, PatFrag mtruncFrag > { - def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - []>, EVEX; + def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) + addr:$dst, SrcInfo.RC:$src)>; - def mrk : AVX512XS8I<opc, MRMDestMem, (outs), - (ins x86memop:$dst, KRC:$mask, srcRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst {${mask}}|${dst} {${mask}}, $src}"), - []>, EVEX, EVEX_K; + def : Pat<(mtruncFrag addr:$dst, SrcInfo.KRCWM:$mask, + (SrcInfo.VT SrcInfo.RC:$src)), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) + addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>; +} + +multiclass avx512_trunc_sat_mr_lowering<X86VectorVTInfo SrcInfo, + X86VectorVTInfo DestInfo, string sat > { + + def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), SrcInfo.MRC:$mask), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mrk) addr:$ptr, + (COPY_TO_REGCLASS SrcInfo.MRC:$mask, SrcInfo.KRCWM), + (SrcInfo.VT SrcInfo.RC:$src))>; + 
def: Pat<(!cast<Intrinsic>("int_x86_avx512_mask_pmov"#sat#"_"#SrcInfo.Suffix# + DestInfo.Suffix#"_mem_"#SrcInfo.Size) + addr:$ptr, (SrcInfo.VT SrcInfo.RC:$src), -1), + (!cast<Instruction>(NAME#SrcInfo.ZSuffix##mr) addr:$ptr, + (SrcInfo.VT SrcInfo.RC:$src))>; } -defm VPMOVQB : avx512_trunc_sat<0x32, "vpmovqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVSQB : avx512_trunc_sat<0x22, "vpmovsqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVUSQB : avx512_trunc_sat<0x12, "vpmovusqb", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>; -defm VPMOVQW : avx512_trunc_sat<0x34, "vpmovqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVSQW : avx512_trunc_sat<0x24, "vpmovsqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVUSQW : avx512_trunc_sat<0x14, "vpmovusqw", VR128X, VR512, VK8WM, - i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>; -defm VPMOVQD : avx512_trunc_sat<0x35, "vpmovqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVSQD : avx512_trunc_sat<0x25, "vpmovsqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVUSQD : avx512_trunc_sat<0x15, "vpmovusqd", VR256X, VR512, VK8WM, - i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>; -defm VPMOVDW : avx512_trunc_sat<0x33, "vpmovdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVSDW : avx512_trunc_sat<0x23, "vpmovsdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVUSDW : avx512_trunc_sat<0x13, "vpmovusdw", VR256X, VR512, VK16WM, - i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>; -defm VPMOVDB : avx512_trunc_sat<0x31, "vpmovdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVSDB : avx512_trunc_sat<0x21, "vpmovsdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; -defm VPMOVUSDB : avx512_trunc_sat<0x11, "vpmovusdb", VR128X, VR512, VK16WM, - i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>; - -def : Pat<(v16i8 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQBrr VR512:$src)>; -def : Pat<(v8i16 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQWrr VR512:$src)>; -def : Pat<(v16i16 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDWrr VR512:$src)>; -def : Pat<(v16i8 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr VR512:$src)>; -def : Pat<(v8i32 (X86vtrunc (v8i64 VR512:$src))), (VPMOVQDrr VR512:$src)>; - -def : Pat<(v16i8 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDBrrkz VK16WM:$mask, VR512:$src)>; -def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))), - (VPMOVDWrrkz VK16WM:$mask, VR512:$src)>; -def : Pat<(v8i16 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQWrrkz VK8WM:$mask, VR512:$src)>; -def : Pat<(v8i32 (X86vtruncm VK8WM:$mask, (v8i64 VR512:$src))), - (VPMOVQDrrkz VK8WM:$mask, VR512:$src)>; +multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag, + Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, + DestInfoZ128, x86memopZ128>, + avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128, + truncFrag, mtruncFrag>, EVEX_V128; + + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256, + DestInfoZ256, x86memopZ256>, 
+ avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256, + truncFrag, mtruncFrag>, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, + DestInfoZ, x86memopZ>, + avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ, + truncFrag, mtruncFrag>, EVEX_V512; +} + +multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, + X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, + X86MemOperand x86memopZ, string sat, Predicate prd = HasAVX512>{ + + let Predicates = [HasVLX, prd] in { + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, + DestInfoZ128, x86memopZ128>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info128, DestInfoZ128, + sat>, EVEX_V128; + + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256, + DestInfoZ256, x86memopZ256>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info256, DestInfoZ256, + sat>, EVEX_V256; + } + let Predicates = [prd] in + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, + DestInfoZ, x86memopZ>, + avx512_trunc_sat_mr_lowering<VTSrcInfo.info512, DestInfoZ, + sat>, EVEX_V512; +} + +multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, + truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VO>; +} +multiclass avx512_trunc_sat_qb<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qb", OpNode, avx512vl_i64_info, + v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, + sat>, EVEX_CD8<8, CD8VO>; +} + +multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, + truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VQ>; +} +multiclass avx512_trunc_sat_qw<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qw", OpNode, avx512vl_i64_info, + v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, + sat>, EVEX_CD8<16, CD8VQ>; +} + +multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, + truncstorevi32, masked_truncstorevi32>, EVEX_CD8<32, CD8VH>; +} +multiclass avx512_trunc_sat_qd<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"qd", OpNode, avx512vl_i64_info, + v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, + sat>, EVEX_CD8<32, CD8VH>; +} + +multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, + v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, + truncstorevi8, masked_truncstorevi8>, EVEX_CD8<8, CD8VQ>; +} +multiclass avx512_trunc_sat_db<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"db", OpNode, avx512vl_i32_info, + v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, + sat>, EVEX_CD8<8, CD8VQ>; +} + +multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, 
+ v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, + truncstorevi16, masked_truncstorevi16>, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_dw<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"dw", OpNode, avx512vl_i32_info, + v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, + sat>, EVEX_CD8<16, CD8VH>; +} + +multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info, + v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, + truncstorevi8, masked_truncstorevi8,HasBWI>, EVEX_CD8<16, CD8VH>; +} +multiclass avx512_trunc_sat_wb<bits<8> opc, string sat, SDNode OpNode> { + defm NAME: avx512_trunc_sat<opc, "vpmov"##sat##"wb", OpNode, avx512vl_i16_info, + v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, + sat, HasBWI>, EVEX_CD8<16, CD8VH>; +} + +defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc>; +defm VPMOVSQB : avx512_trunc_sat_qb<0x22, "s", X86vtruncs>; +defm VPMOVUSQB : avx512_trunc_sat_qb<0x12, "us", X86vtruncus>; + +defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc>; +defm VPMOVSQW : avx512_trunc_sat_qw<0x24, "s", X86vtruncs>; +defm VPMOVUSQW : avx512_trunc_sat_qw<0x14, "us", X86vtruncus>; + +defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc>; +defm VPMOVSQD : avx512_trunc_sat_qd<0x25, "s", X86vtruncs>; +defm VPMOVUSQD : avx512_trunc_sat_qd<0x15, "us", X86vtruncus>; + +defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc>; +defm VPMOVSDB : avx512_trunc_sat_db<0x21, "s", X86vtruncs>; +defm VPMOVUSDB : avx512_trunc_sat_db<0x11, "us", X86vtruncus>; + +defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc>; +defm VPMOVSDW : avx512_trunc_sat_dw<0x23, "s", X86vtruncs>; +defm VPMOVUSDW : avx512_trunc_sat_dw<0x13, "us", X86vtruncus>; + +defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc>; +defm VPMOVSWB : avx512_trunc_sat_wb<0x20, "s", X86vtruncs>; +defm VPMOVUSWB : avx512_trunc_sat_wb<0x10, "us", X86vtruncus>; + +let Predicates = [HasAVX512, NoVLX] in { +def: Pat<(v8i16 (X86vtrunc (v8i32 VR256X:$src))), + (v8i16 (EXTRACT_SUBREG + (v16i16 (VPMOVDWZrr (v16i32 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm)))), sub_xmm))>; +def: Pat<(v4i32 (X86vtrunc (v4i64 VR256X:$src))), + (v4i32 (EXTRACT_SUBREG + (v8i32 (VPMOVQDZrr (v8i64 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm)))), sub_xmm))>; +} + +let Predicates = [HasBWI, NoVLX] in { +def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))), + (v16i8 (EXTRACT_SUBREG (VPMOVWBZrr (v32i16 (SUBREG_TO_REG (i32 0), + VR256X:$src, sub_ymm))), sub_xmm))>; +} multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, @@ -5985,163 +6566,11 @@ defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -//===----------------------------------------------------------------------===// -// VSHUFPS - VSHUFPD Operations - -multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop, - ValueType vt, string OpcodeStr, PatFrag mem_frag, - Domain d> { - def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, x86memop:$src2, u8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), - (i8 imm:$src3))))], d, 
IIC_SSE_SHUFP>, - EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>; - def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, u8imm:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, - (i8 imm:$src3))))], d, IIC_SSE_SHUFP>, - EVEX_4V, Sched<[WriteShuffle]>; -} - -defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", loadv16f32, - SSEPackedSingle>, PS, EVEX_V512, EVEX_CD8<32, CD8VF>; -defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", loadv8f64, - SSEPackedDouble>, PD, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>; - -def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>; -def : Pat<(v16i32 (X86Shufp VR512:$src1, - (loadv16i32 addr:$src2), (i8 imm:$imm))), - (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>; - -def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))), - (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>; -def : Pat<(v8i64 (X86Shufp VR512:$src1, - (loadv8i64 addr:$src2), (i8 imm:$imm))), - (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>; // Helper fragments to match sext vXi1 to vXiY. def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>; -multiclass avx512_conflict<bits<8> opc, string OpcodeStr, - RegisterClass RC, RegisterClass KRC, - X86MemOperand x86memop, - X86MemOperand x86scalar_mop, string BrdcstStr> { - let hasSideEffects = 0 in { - def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst} |${dst}, $src}"), - []>, EVEX; - let mayLoad = 1 in - def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"), - []>, EVEX; - let mayLoad = 1 in - def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins x86scalar_mop:$src), - !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, - ", ${dst}|${dst}, ${src}", BrdcstStr, "}"), - []>, EVEX, EVEX_B; - def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins KRC:$mask, RC:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - let mayLoad = 1 in - def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, x86memop:$src), - !strconcat(OpcodeStr, - "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"), - []>, EVEX, EVEX_KZ; - let mayLoad = 1 in - def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins KRC:$mask, x86scalar_mop:$src), - !strconcat(OpcodeStr, "\t{${src}", BrdcstStr, - ", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}", - BrdcstStr, "}"), - []>, EVEX, EVEX_KZ, EVEX_B; - - let Constraints = "$src1 = $dst" in { - def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, RC:$src2), - !strconcat(OpcodeStr, - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - []>, EVEX, EVEX_K; - let mayLoad = 1 in - def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, x86memop:$src2), - !strconcat(OpcodeStr, - "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - []>, EVEX, EVEX_K; - let mayLoad = 1 in - def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2), - !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, - ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"), - []>, EVEX, EVEX_K, EVEX_B; - } - } -} - -let Predicates = [HasCDI] in { 
-defm VPCONFLICTD : avx512_conflict<0xC4, "vpconflictd", VR512, VK16WM, - i512mem, i32mem, "{1to16}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; - - -defm VPCONFLICTQ : avx512_conflict<0xC4, "vpconflictq", VR512, VK8WM, - i512mem, i64mem, "{1to8}">, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -} - -def : Pat<(int_x86_avx512_mask_conflict_d_512 VR512:$src2, VR512:$src1, - GR16:$mask), - (VPCONFLICTDrrk VR512:$src1, - (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>; - -def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1, - GR8:$mask), - (VPCONFLICTQrrk VR512:$src1, - (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; - -let Predicates = [HasCDI] in { -defm VPLZCNTD : avx512_conflict<0x44, "vplzcntd", VR512, VK16WM, - i512mem, i32mem, "{1to16}">, - EVEX_V512, EVEX_CD8<32, CD8VF>; - - -defm VPLZCNTQ : avx512_conflict<0x44, "vplzcntq", VR512, VK8WM, - i512mem, i64mem, "{1to8}">, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - -} - -def : Pat<(int_x86_avx512_mask_lzcnt_d_512 VR512:$src2, VR512:$src1, - GR16:$mask), - (VPLZCNTDrrk VR512:$src1, - (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), VR512:$src2)>; - -def : Pat<(int_x86_avx512_mask_lzcnt_q_512 VR512:$src2, VR512:$src1, - GR8:$mask), - (VPLZCNTQrrk VR512:$src1, - (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>; - -def : Pat<(v16i32 (ctlz (loadv16i32 addr:$src))), - (VPLZCNTDrm addr:$src)>; -def : Pat<(v16i32 (ctlz (v16i32 VR512:$src))), - (VPLZCNTDrr VR512:$src)>; -def : Pat<(v8i64 (ctlz (loadv8i64 addr:$src))), - (VPLZCNTQrm addr:$src)>; -def : Pat<(v8i64 (ctlz (v8i64 VR512:$src))), - (VPLZCNTQrr VR512:$src)>; - def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>; def : Pat<(store (i1 0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>; @@ -6197,7 +6626,7 @@ defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">; multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > { def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set _.KRC:$dst, (trunc (_.VT _.RC:$src)))]>, EVEX; + [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX; } multiclass avx512_convert_vector_to_mask<bits<8> opc, string OpcodeStr, @@ -6230,7 +6659,7 @@ defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m", multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, string OpcodeStr> { defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst), - (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", (_.VT (X86compress _.RC:$src1))>, AVX5128IBase; let mayStore = 1 in { @@ -6242,7 +6671,7 @@ multiclass compress_by_vec_width<bits<8> opc, X86VectorVTInfo _, def mrk : AVX5128I<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), OpcodeStr # "\t{$src, $dst {${mask}} |$dst {${mask}}, $src}", - [(store (_.VT (vselect _.KRCWM:$mask, + [(store (_.VT (vselect _.KRCWM:$mask, (_.VT (X86compress _.RC:$src)), _.ImmAllZerosV)), addr:$dst)]>, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; @@ -6272,7 +6701,7 @@ defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, string OpcodeStr> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", (_.VT (X86expand _.RC:$src1))>, AVX5128IBase; let mayLoad = 1 in @@ 
-6302,6 +6731,62 @@ defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>, defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, EVEX, VEX_W; +//handle instruction reg_vec1 = op(reg_vec,imm) +// op(mem_vec,imm) +// op(broadcast(eltVt),imm) +//all instruction created with FROUND_CURRENT +multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>{ + defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, $src1", "$src2, $src2", + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>; + let mayLoad = 1 in { + defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.MemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", + (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>; + defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr, + "${src1}"##_.BroadcastStr##", $src2", + (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))), + (i32 imm:$src2), + (i32 FROUND_CURRENT))>, EVEX_B; + } +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} +multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, + SDNode OpNode, X86VectorVTInfo _>{ + defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, i32u8imm:$src2), + OpcodeStr##_.Suffix, "$src2,{sae}, $src1", + "$src1, {sae}, $src2", + (OpNode (_.VT _.RC:$src1), + (i32 imm:$src2), + (i32 FROUND_NO_EXC))>, EVEX_B; +} + +multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr, + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{ + let Predicates = [prd] in { + defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + EVEX_V512; + } + let Predicates = [prd, HasVLX] in { + defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>, + EVEX_V128; + defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>, + EVEX_V256; + } +} + //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) // op(reg_vec2,mem_vec,imm) // op(reg_vec2,broadcast(eltVt),imm) @@ -6309,49 +6794,60 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; let mayLoad = 1 in { defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, 
${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>, EVEX_B; } } //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) // op(reg_vec2,mem_vec,imm) +multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{ + + defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT SrcInfo.RC:$src2), + (i8 imm:$src3)))>; + let mayLoad = 1 in + defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3), + OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), + (SrcInfo.VT (bitconvert + (SrcInfo.LdFrag addr:$src2))), + (i8 imm:$src3)))>; +} + +//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) +// op(reg_vec2,mem_vec,imm) // op(reg_vec2,broadcast(eltVt),imm) multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>{ - defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), - OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - (i8 imm:$src3))>; - let mayLoad = 1 in { - defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), - OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert (_.LdFrag addr:$src2))), - (i8 imm:$src3))>; + X86VectorVTInfo _>: + avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{ + + let mayLoad = 1 in defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", @@ -6359,7 +6855,6 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), (i8 imm:$src3))>, EVEX_B; - } } //handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) @@ -6369,20 +6864,20 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; let mayLoad = 1 in { defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3), + (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_CURRENT))>; let isAsmParserOnly = 1 in { @@ -6398,18 +6893,25 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _>{ defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.RC:$src2, u8imm:$src3), + (ins 
_.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3,{sae}, $src2, $src1", "$src1, $src2,{sae}, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i8 imm:$src3), + (i32 imm:$src3), (i32 FROUND_NO_EXC))>, EVEX_B; } //handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { - defm NAME: avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _>; + defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), + OpcodeStr, "$src3,{sae}, $src2, $src1", + "$src1, $src2,{sae}, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (i32 imm:$src3), + (i32 FROUND_NO_EXC))>, EVEX_B; } multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, @@ -6428,6 +6930,20 @@ multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, } } +multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr, + AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo>{ + let Predicates = [HasBWI] in { + defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info512, + SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V; + } + let Predicates = [HasBWI, HasVLX] in { + defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info128, + SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V; + defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info256, + SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V; + } +} + multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode>{ let Predicates = [HasAVX512] in { @@ -6447,6 +6963,14 @@ multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr, } } +multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr, + bits<8> opcPs, bits<8> opcPd, SDNode OpNode, Predicate prd>{ + defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info, + opcPs, OpNode, prd>, EVEX_CD8<32, CD8VF>; + defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info, + opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W; +} + defm VFIXUPIMMPD : avx512_common_fp_sae_packed_imm<"vfixupimmpd", avx512vl_f64_info, 0x54, X86VFixupimm, HasAVX512>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; @@ -6461,6 +6985,14 @@ defm VFIXUPIMMSS: avx512_common_fp_sae_scalar_imm<"vfixupimmss", f32x_info, 0x55, X86VFixupimm, HasAVX512>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; +defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56, + X86VReduce, HasDQI>, AVX512AIi8Base, EVEX; +defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09, + X86VRndScale, HasAVX512>, AVX512AIi8Base, EVEX; +defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26, + X86VGetMant, HasAVX512>, AVX512AIi8Base, EVEX; + + defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, 0x50, X86VRange, HasDQI>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; @@ -6475,6 +7007,19 @@ defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info, 0x51, X86VRange, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; +defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info, + 0x57, X86Reduces, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info, + 0x57, X86Reduces, HasDQI>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, 
CD8VT1>; + +defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, + 0x27, X86GetMants, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; +defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info, + 0x27, X86GetMants, HasAVX512>, + AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode = X86Shuf128>{ @@ -6486,6 +7031,29 @@ multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _, defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; } } +let Predicates = [HasAVX512] in { +def : Pat<(v16f32 (ffloor VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x1))>; +def : Pat<(v16f32 (fnearbyint VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>; +def : Pat<(v16f32 (fceil VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x2))>; +def : Pat<(v16f32 (frint VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>; +def : Pat<(v16f32 (ftrunc VR512:$src)), + (VRNDSCALEPSZrri VR512:$src, (i32 0x3))>; + +def : Pat<(v8f64 (ffloor VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x1))>; +def : Pat<(v8f64 (fnearbyint VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>; +def : Pat<(v8f64 (fceil VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x2))>; +def : Pat<(v8f64 (frint VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>; +def : Pat<(v8f64 (ftrunc VR512:$src)), + (VRNDSCALEPDZrri VR512:$src, (i32 0x3))>; +} defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; @@ -6496,31 +7064,51 @@ defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>, defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; -multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I, - AVX512VLVectorVTInfo VTInfo_FP>{ +multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> { defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>, AVX512AIi8Base, EVEX_4V; - let isCodeGenOnly = 1 in { - defm NAME#_FP: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0x03, X86VAlign>, - AVX512AIi8Base, EVEX_4V; - } } -defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info, avx512vl_f32_info>, +defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info, avx512vl_f64_info>, +defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; +multiclass avx512_vpalign_lowering<X86VectorVTInfo _ , list<Predicate> p>{ + let Predicates = p in + def NAME#_.VTName#rri: + Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))), + (!cast<Instruction>(NAME#_.ZSuffix#rri) + _.RC:$src1, _.RC:$src2, imm:$imm)>; +} + +multiclass avx512_vpalign_lowering_common<AVX512VLVectorVTInfo _>: + avx512_vpalign_lowering<_.info512, [HasBWI]>, + avx512_vpalign_lowering<_.info128, [HasBWI, HasVLX]>, + avx512_vpalign_lowering<_.info256, [HasBWI, HasVLX]>; + +defm VPALIGN: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" , + avx512vl_i8_info, avx512vl_i8_info>, + avx512_vpalign_lowering_common<avx512vl_i16_info>, + avx512_vpalign_lowering_common<avx512vl_i32_info>, + avx512_vpalign_lowering_common<avx512vl_f32_info>, + avx512_vpalign_lowering_common<avx512vl_i64_info>, + avx512_vpalign_lowering_common<avx512vl_f64_info>, + EVEX_CD8<8, 
CD8VF>; + +defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" , + avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>; + multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), - (ins _.RC:$src1), OpcodeStr##_.Suffix, + (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase; let mayLoad = 1 in defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.MemOp:$src1), OpcodeStr##_.Suffix, + (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1", (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>, EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>; @@ -6531,7 +7119,7 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, avx512_unary_rm<opc, OpcodeStr, OpNode, _> { let mayLoad = 1 in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.ScalarMemOp:$src1), OpcodeStr##_.Suffix, + (ins _.ScalarMemOp:$src1), OpcodeStr, "${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr, (_.VT (OpNode (X86VBroadcast @@ -6568,15 +7156,16 @@ multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, SDNode OpNode, Predicate prd> { - defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr, OpNode, avx512vl_i64_info, + defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, avx512vl_i64_info, prd>, VEX_W; - defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr, OpNode, avx512vl_i32_info, prd>; + defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, avx512vl_i32_info, + prd>; } multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, SDNode OpNode, Predicate prd> { - defm W : avx512_unary_rm_vl<opc_w, OpcodeStr, OpNode, avx512vl_i16_info, prd>; - defm B : avx512_unary_rm_vl<opc_b, OpcodeStr, OpNode, avx512vl_i8_info, prd>; + defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, avx512vl_i16_info, prd>; + defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, avx512vl_i8_info, prd>; } multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w, @@ -6598,3 +7187,332 @@ def : Pat<(xor (bc_v8i64 (v8i1sextv8i64)), (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))), (VPABSQZrr VR512:$src)>; + +multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{ + + defm NAME : avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>; +} + +defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>; +defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>; + +//===---------------------------------------------------------------------===// +// Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// +multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{ + defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, avx512vl_f32_info, + HasAVX512>, XS; +} + +defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>; +defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>; + +//===----------------------------------------------------------------------===// +// AVX-512 - MOVDDUP +//===----------------------------------------------------------------------===// + +multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src), OpcodeStr, "$src", "$src", + (_.VT 
(OpNode (_.VT _.RC:$src)))>, EVEX; + let mayLoad = 1 in + defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src", + (_.VT (OpNode (_.VT (scalar_to_vector + (_.ScalarLdFrag addr:$src)))))>, + EVEX, EVEX_CD8<_.EltSize, CD8VH>; +} + +multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode, + AVX512VLVectorVTInfo VTInfo> { + + defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; + + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>, + EVEX_V128; + } +} + +multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{ + defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, + avx512vl_f64_info>, XD, VEX_W; +} + +defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>; + +def : Pat<(X86Movddup (loadv2f64 addr:$src)), + (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; +def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPZ128rm addr:$src)>, Requires<[HasAVX512, HasVLX]>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Unpack Instructions +//===----------------------------------------------------------------------===// +defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh>; +defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl>; + +defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKHBW : avx512_binop_rm_vl_b<0x68, "vpunpckhbw", X86Unpckh, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKLWD : avx512_binop_rm_vl_w<0x61, "vpunpcklwd", X86Unpckl, + SSE_INTALU_ITINS_P, HasBWI>; +defm VPUNPCKHWD : avx512_binop_rm_vl_w<0x69, "vpunpckhwd", X86Unpckh, + SSE_INTALU_ITINS_P, HasBWI>; + +defm VPUNPCKLDQ : avx512_binop_rm_vl_d<0x62, "vpunpckldq", X86Unpckl, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKHDQ : avx512_binop_rm_vl_d<0x6A, "vpunpckhdq", X86Unpckh, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKLQDQ : avx512_binop_rm_vl_q<0x6C, "vpunpcklqdq", X86Unpckl, + SSE_INTALU_ITINS_P, HasAVX512>; +defm VPUNPCKHQDQ : avx512_binop_rm_vl_q<0x6D, "vpunpckhqdq", X86Unpckh, + SSE_INTALU_ITINS_P, HasAVX512>; + +//===----------------------------------------------------------------------===// +// AVX-512 - Extract & Insert Integer Instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _> { + let mayStore = 1 in + def mr : AVX512Ii8<opc, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1), + imm:$src2)))), + addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0x14, MRMDestReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD; + + defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD; + } +} + +multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<0xC5, MRMSrcReg, 
(outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GR32orGR64:$dst, + (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, PD; + + def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, + EVEX, TAPD; + + defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD; + } +} + +multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _, + RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<0x16, MRMDestReg, (outs GRC:$dst), + (ins _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(set GRC:$dst, + (extractelt (_.VT _.RC:$src1), imm:$src2))]>, + EVEX, TAPD; + + let mayStore = 1 in + def mr : AVX512Ii8<0x16, MRMDestMem, (outs), + (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [(store (extractelt (_.VT _.RC:$src1), + imm:$src2),addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD; + } +} + +defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>; +defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>; +defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>; +defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W; + +multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + def rm : AVX512Ii8<opc, MRMSrcMem, (outs _.RC:$dst), + (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; +} + +multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _, PatFrag LdFrag> { + let Predicates = [HasBWI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>; + } +} + +multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _, RegisterClass GRC> { + let Predicates = [HasDQI] in { + def rr : AVX512Ii8<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src1, GRC:$src2, u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set _.RC:$dst, + (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>, + EVEX_4V, TAPD; + + defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _, + _.ScalarLdFrag>, TAPD; + } +} + +defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info, + extloadi8>, TAPD; +defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info, + extloadi16>, PD; +defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>; +defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W; +//===----------------------------------------------------------------------===// +// VSHUFPS - VSHUFPD Operations +//===----------------------------------------------------------------------===// +multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I, + AVX512VLVectorVTInfo VTInfo_FP>{ + defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp>, + 
EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>, + AVX512AIi8Base, EVEX_4V; +} + +defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS; +defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W; +//===----------------------------------------------------------------------===// +// AVX-512 - Byte shift Left/Right +//===----------------------------------------------------------------------===// + +multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, X86VectorVTInfo _>{ + def rr : AVX512<opc, MRMr, + (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>; + let mayLoad = 1 in + def rm : AVX512<opc, MRMm, + (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _.RC:$dst,(_.VT (OpNode + (_.LdFrag addr:$src1), (i8 imm:$src2))))]>; +} + +multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr, + Format MRMm, string OpcodeStr, Predicate prd>{ + let Predicates = [prd] in + defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v8i64_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v4i64x_info>, EVEX_V256; + defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, v2i64x_info>, EVEX_V128; + } +} +defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; +defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", + HasBWI>, AVX512PDIi8Base, EVEX_4V; + + +multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, + string OpcodeStr, X86VectorVTInfo _dst, + X86VectorVTInfo _src>{ + def rr : AVX512BI<opc, MRMSrcReg, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT _src.RC:$src2))))]>; + let mayLoad = 1 in + def rm : AVX512BI<opc, MRMSrcMem, + (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2), + !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), + [(set _dst.RC:$dst,(_dst.VT + (OpNode (_src.VT _src.RC:$src1), + (_src.VT (bitconvert + (_src.LdFrag addr:$src2))))))]>; +} + +multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode, + string OpcodeStr, Predicate prd> { + let Predicates = [prd] in + defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info, + v64i8_info>, EVEX_V512; + let Predicates = [prd, HasVLX] in { + defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info, + v32i8x_info>, EVEX_V256; + defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info, + v16i8x_info>, EVEX_V128; + } +} + +defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", + HasBWI>, EVEX_4V; + +multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, + X86VectorVTInfo _>{ + let Constraints = "$src1 = $dst" in { + defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.RC:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT _.RC:$src3), + (i8 imm:$src4))>, AVX512AIi8Base, EVEX_4V; + let mayLoad = 1 in { + defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, 
(outs _.RC:$dst), + (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src3", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (bitconvert (_.LdFrag addr:$src3))), + (i8 imm:$src4))>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4), + OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2", + "$src2, ${src3}"##_.BroadcastStr##", $src4", + (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), + (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + (i8 imm:$src4))>, EVEX_B, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + } + }// Constraints = "$src1 = $dst" +} + +multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{ + let Predicates = [HasAVX512] in + defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info512>, EVEX_V512; + let Predicates = [HasAVX512, HasVLX] in { + defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info128>, EVEX_V128; + defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info256>, EVEX_V256; + } +} + +defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>; +defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W; + diff --git a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td index 5e19ad4..1a2e786 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/contrib/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -615,14 +615,14 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass, def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">; -def Xi8 : X86TypeInfo<i8 , "b", GR8 , loadi8 , i8mem , - Imm8 , i8imm , imm, i8imm , invalid_node, +def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem, + Imm8, i8imm, imm8_su, i8imm, invalid_node, 0, OpSizeFixed, 0>; def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem, - Imm16, i16imm, imm, i16i8imm, i16immSExt8, + Imm16, i16imm, imm16_su, i16i8imm, i16immSExt8_su, 1, OpSize16, 0>; def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem, - Imm32, i32imm, imm, i32i8imm, i32immSExt8, + Imm32, i32imm, imm32_su, i32i8imm, i32immSExt8_su, 1, OpSize32, 0>; def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem, Imm32S, i64i32imm, i64immSExt32, i64i8imm, i64immSExt8, @@ -928,15 +928,22 @@ class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, let hasSideEffects = 0; } -// BinOpAI_FF - Instructions like "adc %eax, %eax, imm", that implicitly define +// BinOpAI_RFF - Instructions like "adc %eax, %eax, imm", that implicitly define // and use EFLAGS. -class BinOpAI_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - Register areg, string operands> +class BinOpAI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> : BinOpAI<opcode, mnemonic, typeinfo, areg, operands, IIC_BIN_CARRY_NONMEM> { let Uses = [areg, EFLAGS]; } +// BinOpAI_F - Instructions like "cmp %eax, %eax, imm", that imp-def EFLAGS. +class BinOpAI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, + Register areg, string operands> + : BinOpAI<opcode, mnemonic, typeinfo, areg, operands> { + let Defs = [EFLAGS]; +} + /// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is /// defined with "(set GPR:$dst, EFLAGS, (...". 
/// @@ -1092,14 +1099,14 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // Uses = [EFLAGS], Defs = [EFLAGS] - def NAME#8i8 : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; - def NAME#16i16 : BinOpAI_FF<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">; - def NAME#32i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">; - def NAME#64i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|rax, $src}">; + def NAME#8i8 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_RFF<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } /// ArithBinOp_F - This is an arithmetic binary operator where the pattern is @@ -1170,14 +1177,14 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // Defs = [EFLAGS] - def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL, - "{$src, %al|al, $src}">; - def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX, - "{$src, %ax|ax, $src}">; - def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX, - "{$src, %eax|eax, $src}">; - def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX, - "{$src, %rax|rax, $src}">; + def NAME#8i8 : BinOpAI_F<BaseOpc4, mnemonic, Xi8 , AL, + "{$src, %al|al, $src}">; + def NAME#16i16 : BinOpAI_F<BaseOpc4, mnemonic, Xi16, AX, + "{$src, %ax|ax, $src}">; + def NAME#32i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi32, EAX, + "{$src, %eax|eax, $src}">; + def NAME#64i32 : BinOpAI_F<BaseOpc4, mnemonic, Xi64, RAX, + "{$src, %rax|rax, $src}">; } @@ -1246,14 +1253,14 @@ let isCompare = 1 in { "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>; } // Defs = [EFLAGS] - def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL, - "{$src, %al|al, $src}">; - def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX, - "{$src, %ax|ax, $src}">; - def TEST32i32 : BinOpAI<0xA8, "test", Xi32, EAX, - "{$src, %eax|eax, $src}">; - def TEST64i32 : BinOpAI<0xA8, "test", Xi64, RAX, - "{$src, %rax|rax, $src}">; + def TEST8i8 : BinOpAI_F<0xA8, "test", Xi8 , AL, + "{$src, %al|al, $src}">; + def TEST16i16 : BinOpAI_F<0xA8, "test", Xi16, AX, + "{$src, %ax|ax, $src}">; + def TEST32i32 : BinOpAI_F<0xA8, "test", Xi32, EAX, + "{$src, %eax|eax, $src}">; + def TEST64i32 : BinOpAI_F<0xA8, "test", Xi64, RAX, + "{$src, %rax|rax, $src}">; } // isCompare //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h index 2056056..787f15b 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrBuilder.h +++ b/contrib/llvm/lib/Target/X86/X86InstrBuilder.h @@ -156,10 +156,9 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) { Flags |= MachineMemOperand::MOLoad; if (MCID.mayStore()) Flags |= MachineMemOperand::MOStore; - MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI, Offset), - Flags, MFI.getObjectSize(FI), - MFI.getObjectAlignment(FI)); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags, + MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); return addOffset(MIB.addFrameIndex(FI), Offset) .addMemOperand(MMO); } diff --git a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td index 
315f213..c73c950 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -13,7 +13,7 @@ //===----------------------------------------------------------------------===// -// SetCC instructions. +// CMOV instructions. multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> { let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst", isCommutable = 1, SchedRW = [WriteALU] in { diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td index 7f850d6..5d7283f 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -132,26 +132,6 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), Requires<[In64BitMode]>; } -// The MSVC runtime contains an _ftol2 routine for converting floating-point -// to integer values. It has a strange calling convention: the input is -// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is -// used as a temporary register. No other registers (aside from flags) are -// touched. -// Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80 -// variant is unnecessary. - -let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in { - def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src), - "# win32 fptoui", - [(X86WinFTOL RFP32:$src)]>, - Requires<[Not64BitMode]>; - - def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src), - "# win32 fptoui", - [(X86WinFTOL RFP64:$src)]>, - Requires<[Not64BitMode]>; -} - //===----------------------------------------------------------------------===// // EH Pseudo Instructions // @@ -172,6 +152,29 @@ def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), } +let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1, + isCodeGenOnly = 1, isReturn = 1 in { + def CLEANUPRET : I<0, Pseudo, (outs), (ins), "# CLEANUPRET", [(cleanupret)]>; + + // CATCHRET needs a custom inserter for SEH. + let usesCustomInserter = 1 in + def CATCHRET : I<0, Pseudo, (outs), (ins brtarget32:$dst, brtarget32:$from), + "# CATCHRET", + [(catchret bb:$dst, bb:$from)]>; +} + +let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1, + usesCustomInserter = 1 in +def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>; + +// This instruction is responsible for re-establishing stack pointers after an +// exception has been caught and we are rejoining normal control flow in the +// parent function or funclet. It generally sets ESP and EBP, and optionally +// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us +// elsewhere. +let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in +def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>; + let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf), @@ -247,7 +250,7 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), // Alias instruction mapping movr0 to xor. // FIXME: remove when we can teach regalloc that xor reg, reg is ok. 
let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, - isPseudo = 1 in + isPseudo = 1, AddedComplexity = 20 in def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "", [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; @@ -259,6 +262,33 @@ def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { let AddedComplexity = 20; } +let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], + AddedComplexity = 15 in { + // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC, + // which only require 3 bytes compared to MOV32ri which requires 5. + let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in { + def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, 1)]>; + def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", + [(set GR32:$dst, -1)]>; + } + + // MOV16ri is 4 bytes, so the instructions above are smaller. + def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>; + def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>; +} + +let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in { +// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1. +// FIXME: Add itinerary class and Schedule. +def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "", + [(set GR32:$dst, i32immSExt8:$src)]>, + Requires<[OptForMinSize]>; +def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "", + [(set GR64:$dst, i64immSExt8:$src)]>, + Requires<[OptForMinSize, NotWin64WithoutFP]>; +} + // Materialize i64 constant where top 32-bits are zero. This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. @@ -268,9 +298,9 @@ def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src), "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>; // This 64-bit pseudo-move can be used for both a 64-bit constant that is -// actually the zero-extension of a 32-bit constant, and for labels in the +// actually the zero-extension of a 32-bit constant and for labels in the // x86-64 small code model. -def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>; +def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>; let AddedComplexity = 1 in def : Pat<(i64 mov64imm32:$src), @@ -509,6 +539,7 @@ let usesCustomInserter = 1, Uses = [EFLAGS] in { defm _FR32 : CMOVrr_PSEUDO<FR32, f32>; defm _FR64 : CMOVrr_PSEUDO<FR64, f64>; + defm _FR128 : CMOVrr_PSEUDO<FR128, f128>; defm _V4F32 : CMOVrr_PSEUDO<VR128, v4f32>; defm _V2F64 : CMOVrr_PSEUDO<VR128, v2f64>; defm _V2I64 : CMOVrr_PSEUDO<VR128, v2i64>; @@ -752,67 +783,111 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", /* The following multiclass tries to make sure that in code like * x.store (immediate op x.load(acquire), release) + * and + * x.store (register op x.load(acquire), release) * an operation directly on memory is generated instead of wasting a register. 
* It is not automatic as atomic_store/load are only lowered to MOV instructions * extremely late to prevent them from being accidentally reordered in the backend * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions) */ -multiclass RELEASE_BINOP_MI<string op> { +multiclass RELEASE_BINOP_MI<SDNode op> { def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), - "#RELEASE_BINOP PSEUDO!", - [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op) + "#BINOP "#NAME#"8mi PSEUDO!", + [(atomic_store_8 addr:$dst, (op (atomic_load_8 addr:$dst), (i8 imm:$src)))]>; + def NAME#8mr : I<0, Pseudo, (outs), (ins i8mem:$dst, GR8:$src), + "#BINOP "#NAME#"8mr PSEUDO!", + [(atomic_store_8 addr:$dst, (op + (atomic_load_8 addr:$dst), GR8:$src))]>; // NAME#16 is not generated as 16-bit arithmetic instructions are considered // costly and avoided as far as possible by this backend anyway def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), - "#RELEASE_BINOP PSEUDO!", - [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op) + "#BINOP "#NAME#"32mi PSEUDO!", + [(atomic_store_32 addr:$dst, (op (atomic_load_32 addr:$dst), (i32 imm:$src)))]>; + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, (op + (atomic_load_32 addr:$dst), GR32:$src))]>; def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), - "#RELEASE_BINOP PSEUDO!", - [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op) + "#BINOP "#NAME#"64mi32 PSEUDO!", + [(atomic_store_64 addr:$dst, (op (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, (op + (atomic_load_64 addr:$dst), GR64:$src))]>; +} +let Defs = [EFLAGS] in { + defm RELEASE_ADD : RELEASE_BINOP_MI<add>; + defm RELEASE_AND : RELEASE_BINOP_MI<and>; + defm RELEASE_OR : RELEASE_BINOP_MI<or>; + defm RELEASE_XOR : RELEASE_BINOP_MI<xor>; + // Note: we don't deal with sub, because substractions of constants are + // optimized into additions before this code can run. +} + +// Same as above, but for floating-point. +// FIXME: imm version. +// FIXME: Version that doesn't clobber $src, using AVX's VADDSS. +// FIXME: This could also handle SIMD operations with *ps and *pd instructions. +let usesCustomInserter = 1 in { +multiclass RELEASE_FP_BINOP_MI<SDNode op> { + def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src), + "#BINOP "#NAME#"32mr PSEUDO!", + [(atomic_store_32 addr:$dst, + (i32 (bitconvert (op + (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))), + FR32:$src))))]>, Requires<[HasSSE1]>; + def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src), + "#BINOP "#NAME#"64mr PSEUDO!", + [(atomic_store_64 addr:$dst, + (i64 (bitconvert (op + (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))), + FR64:$src))))]>, Requires<[HasSSE2]>; +} +defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>; +// FIXME: Add fsub, fmul, fdiv, ... 
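For context, here is a minimal sketch (not from the patch; counter, accum, bump_by_five and add_sample are invented names) of the source pattern the RELEASE_BINOP_MI and RELEASE_FP_BINOP_MI pseudos above are written to catch, i.e. "x.store(op(x.load(acquire), y), release)" on the same address:

#include <atomic>

std::atomic<int>   counter{0};
std::atomic<float> accum{0.0f};

void bump_by_five() {
  // Integer case: eligible to become a single ALU instruction with a
  // memory operand (e.g. an add-to-memory) instead of load/add/store
  // through a register.
  counter.store(counter.load(std::memory_order_acquire) + 5,
                std::memory_order_release);
}

void add_sample(float s) {
  // FP case: the shape RELEASE_FADD is written to match, i.e. atomic i32
  // load -> bitcast to f32 -> fadd -> bitcast back -> atomic store.
  accum.store(accum.load(std::memory_order_acquire) + s,
              std::memory_order_release);
}

The new *mr variants cover the register-source flavour of the same pattern, and the FP variant currently clobbers its source register, as the FIXME about using AVX's VADDSS points out.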
} -defm RELEASE_ADD : RELEASE_BINOP_MI<"add">; -defm RELEASE_AND : RELEASE_BINOP_MI<"and">; -defm RELEASE_OR : RELEASE_BINOP_MI<"or">; -defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">; -// Note: we don't deal with sub, because substractions of constants are -// optimized into additions before this code can run multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"8m PSEUDO!", [(atomic_store_8 addr:$dst, dag8)]>; def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"16m PSEUDO!", [(atomic_store_16 addr:$dst, dag16)]>; def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"32m PSEUDO!", [(atomic_store_32 addr:$dst, dag32)]>; def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), - "#RELEASE_UNOP PSEUDO!", + "#UNOP "#NAME#"64m PSEUDO!", [(atomic_store_64 addr:$dst, dag64)]>; } -defm RELEASE_INC : RELEASE_UNOP< - (add (atomic_load_8 addr:$dst), (i8 1)), - (add (atomic_load_16 addr:$dst), (i16 1)), - (add (atomic_load_32 addr:$dst), (i32 1)), - (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; -defm RELEASE_DEC : RELEASE_UNOP< - (add (atomic_load_8 addr:$dst), (i8 -1)), - (add (atomic_load_16 addr:$dst), (i16 -1)), - (add (atomic_load_32 addr:$dst), (i32 -1)), - (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; +let Defs = [EFLAGS] in { + defm RELEASE_INC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 1)), + (add (atomic_load_16 addr:$dst), (i16 1)), + (add (atomic_load_32 addr:$dst), (i32 1)), + (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; + defm RELEASE_DEC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 -1)), + (add (atomic_load_16 addr:$dst), (i16 -1)), + (add (atomic_load_32 addr:$dst), (i32 -1)), + (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; +} /* TODO: These don't work because the type inference of TableGen fails. TODO: find a way to fix it. -defm RELEASE_NEG : RELEASE_UNOP< - (ineg (atomic_load_8 addr:$dst)), - (ineg (atomic_load_16 addr:$dst)), - (ineg (atomic_load_32 addr:$dst)), - (ineg (atomic_load_64 addr:$dst))>; +let Defs = [EFLAGS] in { + defm RELEASE_NEG : RELEASE_UNOP< + (ineg (atomic_load_8 addr:$dst)), + (ineg (atomic_load_16 addr:$dst)), + (ineg (atomic_load_32 addr:$dst)), + (ineg (atomic_load_64 addr:$dst))>; +} +// NOT doesn't set flags. 
defm RELEASE_NOT : RELEASE_UNOP< (not (atomic_load_8 addr:$dst)), (not (atomic_load_16 addr:$dst)), @@ -821,42 +896,42 @@ defm RELEASE_NOT : RELEASE_UNOP< */ def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV8mi PSEUDO!", [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV16mi PSEUDO!", [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV32mi PSEUDO!", [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), - "#RELEASE_MOV PSEUDO !", + "#RELEASE_MOV64mi32 PSEUDO!", [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV8mr PSEUDO!", [(atomic_store_8 addr:$dst, GR8 :$src)]>; def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV16mr PSEUDO!", [(atomic_store_16 addr:$dst, GR16:$src)]>; def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV32mr PSEUDO!", [(atomic_store_32 addr:$dst, GR32:$src)]>; def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), - "#RELEASE_MOV PSEUDO!", + "#RELEASE_MOV64mr PSEUDO!", [(atomic_store_64 addr:$dst, GR64:$src)]>; def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV8rm PSEUDO!", [(set GR8:$dst, (atomic_load_8 addr:$src))]>; def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV16rm PSEUDO!", [(set GR16:$dst, (atomic_load_16 addr:$src))]>; def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV32rm PSEUDO!", [(set GR32:$dst, (atomic_load_32 addr:$src))]>; def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), - "#ACQUIRE_MOV PSEUDO!", + "#ACQUIRE_MOV64rm PSEUDO!", [(set GR64:$dst, (atomic_load_64 addr:$src))]>; //===----------------------------------------------------------------------===// @@ -1077,11 +1152,11 @@ defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>; // zextload bool -> zextload byte def : Pat<(zextloadi8i1 addr:$src), (AND8ri (MOV8rm addr:$src), (i8 1))>; -def : Pat<(zextloadi16i1 addr:$src), (AND16ri (MOVZX16rm8 addr:$src), (i16 1))>; -def : Pat<(zextloadi32i1 addr:$src), (AND32ri (MOVZX32rm8 addr:$src), (i32 1))>; +def : Pat<(zextloadi16i1 addr:$src), (AND16ri8 (MOVZX16rm8 addr:$src), (i16 1))>; +def : Pat<(zextloadi32i1 addr:$src), (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1))>; def : Pat<(zextloadi64i1 addr:$src), (SUBREG_TO_REG (i64 0), - (AND32ri (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>; + (AND32ri8 (MOVZX32rm8 addr:$src), (i32 1)), sub_32bit)>; // extload bool -> extload byte // When extloading from 16-bit and smaller memory locations into 64-bit @@ -1298,7 +1373,6 @@ def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)), sub_32bit)>; // r & (2^16-1) ==> movz -let AddedComplexity = 1 in // Give priority over i64immZExt32. 
def : Pat<(and GR64:$src, 0xffff), (SUBREG_TO_REG (i64 0), (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))), diff --git a/contrib/llvm/lib/Target/X86/X86InstrControl.td b/contrib/llvm/lib/Target/X86/X86InstrControl.td index 4cd5563..8c351a5 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrControl.td +++ b/contrib/llvm/lib/Target/X86/X86InstrControl.td @@ -53,6 +53,19 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, "{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>; def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt), "{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16; + + // The machine return from interrupt instruction, but sometimes we need to + // perform a post-epilogue stack adjustment. Codegen emits the pseudo form + // which expands to include an SP adjustment if necessary. + def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, + OpSize16; + def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], + IIC_IRET>, OpSize32; + def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [], + IIC_IRET>, Requires<[In64BitMode]>; + let isCodeGenOnly = 1 in + def IRET : PseudoI<(outs), (ins i16imm:$adj), [(X86iret timm:$adj)]>; + } // Unconditional branches. diff --git a/contrib/llvm/lib/Target/X86/X86InstrFMA.td b/contrib/llvm/lib/Target/X86/X86InstrFMA.td index 7cc3b59..fd800cf 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFMA.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFMA.td @@ -15,13 +15,31 @@ // FMA3 - Intel 3 operand Fused Multiply-Add instructions //===----------------------------------------------------------------------===// -let Constraints = "$src1 = $dst" in { +// For all FMA opcodes declared in fma3p_rm and fma3s_rm milticlasses defined +// below, both the register and memory variants are commutable. +// For the register form the commutable operands are 1, 2 and 3. +// For the memory variant the folded operand must be in 3. Thus, +// in that case, only the operands 1 and 2 can be swapped. +// Commuting some of operands may require the opcode change. +// FMA*213*: +// operands 1 and 2 (memory & register forms): *213* --> *213*(no changes); +// operands 1 and 3 (register forms only): *213* --> *231*; +// operands 2 and 3 (register forms only): *213* --> *132*. +// FMA*132*: +// operands 1 and 2 (memory & register forms): *132* --> *231*; +// operands 1 and 3 (register forms only): *132* --> *132*(no changes); +// operands 2 and 3 (register forms only): *132* --> *213*. +// FMA*231*: +// operands 1 and 2 (memory & register forms): *231* --> *132*; +// operands 1 and 3 (register forms only): *231* --> *213*; +// operands 2 and 3 (register forms only): *231* --> *231*(no changes). 
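The opcode-swap table in the comment above can be sanity-checked with plain scalar arithmetic. Below is a small self-contained sketch (mine, not part of the patch) that models the three forms with std::fma, where a, b and c stand for source operands 1, 2 and 3 of the instruction:

#include <cassert>
#include <cmath>

// Each x86 FMA3 form computes dst = opX * opY + opZ with dst tied to operand 1.
static double fma132(double a, double b, double c) { return std::fma(a, c, b); }
static double fma213(double a, double b, double c) { return std::fma(b, a, c); }
static double fma231(double a, double b, double c) { return std::fma(b, c, a); }

int main() {
  const double a = 2.0, b = 3.0, c = 5.0;
  // *213*: swapping operands 1 and 2 keeps the opcode, 1<->3 gives *231*,
  // 2<->3 gives *132*.
  assert(fma213(b, a, c) == fma213(a, b, c));
  assert(fma213(c, b, a) == fma231(a, b, c));
  assert(fma213(a, c, b) == fma132(a, b, c));
  // *132*: 1<->2 gives *231*, 1<->3 is a no-op, 2<->3 gives *213*.
  assert(fma132(b, a, c) == fma231(a, b, c));
  assert(fma132(c, b, a) == fma132(a, b, c));
  assert(fma132(a, c, b) == fma213(a, b, c));
  // *231*: 1<->2 gives *132*, 1<->3 gives *213*, 2<->3 is a no-op.
  assert(fma231(b, a, c) == fma132(a, b, c));
  assert(fma231(c, b, a) == fma213(a, b, c));
  assert(fma231(a, c, b) == fma231(a, b, c));
  return 0;
}

Because the memory form keeps the folded operand in slot 3, only the 1<->2 swap is available there, which is exactly the restriction the comment states.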
+ +let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in multiclass fma3p_rm<bits<8> opc, string OpcodeStr, PatFrag MemFrag128, PatFrag MemFrag256, ValueType OpVT128, ValueType OpVT256, - bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0, SDPatternOperator Op = null_frag> { - let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in + let usesCustomInserter = 1 in def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -29,7 +47,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, VR128:$src3)))]>; - let mayLoad = 1, isCommutable = IsMVariantCommutable in + let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, f128mem:$src3), !strconcat(OpcodeStr, @@ -37,7 +55,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, (MemFrag128 addr:$src3))))]>; - let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in + let usesCustomInserter = 1 in def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, @@ -45,7 +63,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, VR256:$src3)))]>, VEX_L; - let mayLoad = 1, isCommutable = IsMVariantCommutable in + let mayLoad = 1 in def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, @@ -54,34 +72,20 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr, (OpVT256 (Op VR256:$src2, VR256:$src1, (MemFrag256 addr:$src3))))]>, VEX_L; } -} // Constraints = "$src1 = $dst" multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, string OpcodeStr, string PackTy, PatFrag MemFrag128, PatFrag MemFrag256, SDNode Op, ValueType OpTy128, ValueType OpTy256> { - // For 213, both the register and memory variant are commutable. - // Indeed, the commutable operands are 1 and 2 and both live in registers - // for both variants. defm r213 : fma3p_rm<opc213, !strconcat(OpcodeStr, "213", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 1, - Op>; -let hasSideEffects = 0 in { + MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; defm r132 : fma3p_rm<opc132, !strconcat(OpcodeStr, "132", PackTy), MemFrag128, MemFrag256, OpTy128, OpTy256>; - // For 231, only the register variant is commutable. - // For the memory variant the folded operand must be in 3. Thus, - // in that case, it cannot be swapped with 2. defm r231 : fma3p_rm<opc231, !strconcat(OpcodeStr, "231", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 0>; -} // hasSideEffects = 0 + MemFrag128, MemFrag256, OpTy128, OpTy256>; } // Fused Multiply-Add @@ -126,83 +130,122 @@ let ExeDomain = SSEPackedDouble in { v4f64>, VEX_W; } -let Constraints = "$src1 = $dst" in { -multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, - RegisterClass RC, ValueType OpVT, PatFrag mem_frag, - bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0, +// All source register operands of FMA opcodes defined in fma3s_rm multiclass +// can be commuted. 
In many cases such commute transformation requres an opcode +// adjustment, for example, commuting the operands 1 and 2 in FMA*132 form +// would require an opcode change to FMA*231: +// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2; +// --> +// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2; +// Please see more detailed comment at the very beginning of the section +// defining FMA3 opcodes above. +let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in +multiclass fma3s_rm<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, SDPatternOperator OpNode = null_frag> { - let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in + let usesCustomInserter = 1 in def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, - (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>; + [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>; - let mayLoad = 1, isCommutable = IsMVariantCommutable in + let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, - (OpVT (OpNode RC:$src2, RC:$src1, - (mem_frag addr:$src3))))]>; + (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>; +} + +// These FMA*_Int instructions are defined specially for being used when +// the scalar FMA intrinsics are lowered to machine instructions, and in that +// sense, they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc. +// instructions. +// +// All of the FMA*_Int opcodes are defined as commutable here. +// Commuting the 2nd and 3rd source register operands of FMAs is quite trivial +// and the corresponding optimizations have been developed. +// Commuting the 1st operand of FMA*_Int requires some additional analysis, +// the commute optimization is legal only if all users of FMA*_Int use only +// the lowest element of the FMA*_Int instruction. Even though such analysis +// may be not implemented yet we allow the routines doing the actual commute +// transformation to decide if one or another instruction is commutable or not. +let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1, + hasSideEffects = 0 in +multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, + Operand memopr, RegisterClass RC> { + def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>; + + let mayLoad = 1 in + def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, memopr:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>; } -} // Constraints = "$src1 = $dst" multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, - string OpStr, string PackTy, string PT2, Intrinsic Int, - SDNode OpNode, RegisterClass RC, ValueType OpVT, - X86MemOperand x86memop, Operand memop, PatFrag mem_frag, - ComplexPattern mem_cpat> { -let hasSideEffects = 0 in { - defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), - x86memop, RC, OpVT, mem_frag>; - // See the other defm of r231 for the explanation regarding the - // commutable flags. 
- defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), - x86memop, RC, OpVT, mem_frag, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 0>; + string OpStr, string PackTy, + SDNode OpNode, RegisterClass RC, + X86MemOperand x86memop> { + defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), x86memop, RC>; + defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), x86memop, RC, + OpNode>; + defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), x86memop, RC>; } -// See the other defm of r213 for the explanation regarding the -// commutable flags. -defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), - x86memop, RC, OpVT, mem_frag, - /* IsRVariantCommutable */ 1, - /* IsMVariantCommutable */ 1, - OpNode>; +// The FMA 213 form is created for lowering of scalar FMA intrinscis +// to machine instructions. +// The FMA 132 form can trivially be get by commuting the 2nd and 3rd operands +// of FMA 213 form. +// The FMA 231 form can be get only by commuting the 1st operand of 213 or 132 +// forms and is possible only after special analysis of all uses of the initial +// instruction. Such analysis do not exist yet and thus introducing the 231 +// form of FMA*_Int instructions is done using an optimistic assumption that +// such analysis will be implemented eventually. +multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpStr, string PackTy, + RegisterClass RC, Operand memop> { + defm r132 : fma3s_rm_int<opc132, !strconcat(OpStr, "132", PackTy), + memop, RC>; + defm r213 : fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy), + memop, RC>; + defm r231 : fma3s_rm_int<opc231, !strconcat(OpStr, "231", PackTy), + memop, RC>; } multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, string OpStr, Intrinsic IntF32, Intrinsic IntF64, SDNode OpNode> { - defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", IntF32, OpNode, - FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>; - defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", "PD", IntF64, OpNode, - FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W; + let ExeDomain = SSEPackedSingle in + defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", OpNode, + FR32, f32mem>, + fma3s_int_forms<opc132, opc213, opc231, OpStr, "ss", VR128, ssmem>; + + let ExeDomain = SSEPackedDouble in + defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", OpNode, + FR64, f64mem>, + fma3s_int_forms<opc132, opc213, opc231, OpStr, "sd", VR128, sdmem>, + VEX_W; -// These patterns use the 123 ordering, instead of 213, even though -// they match the intrinsic to the 213 version of the instruction. -// This is because src1 is tied to dest, and the scalar intrinsics -// require the pass-through values to come from the first source -// operand, not the second. + // These patterns use the 123 ordering, instead of 213, even though + // they match the intrinsic to the 213 version of the instruction. + // This is because src1 is tied to dest, and the scalar intrinsics + // require the pass-through values to come from the first source + // operand, not the second. 
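The pass-through rule described in that comment is also visible from the C intrinsics side. A hedged sketch (not part of the patch; requires an FMA-capable target, e.g. built with -mfma) using _mm_fmadd_ss, whose upper result lanes come from the first source operand, which is why $src1 is tied to $dst in the patterns that follow:

#include <immintrin.h>
#include <cassert>

int main() {
  __m128 a = _mm_setr_ps(1.0f, 10.0f, 20.0f, 30.0f);
  __m128 b = _mm_setr_ps(2.0f, -1.0f, -1.0f, -1.0f);
  __m128 c = _mm_setr_ps(4.0f, -2.0f, -2.0f, -2.0f);

  __m128 r = _mm_fmadd_ss(a, b, c);   // lane 0: 1*2+4; lanes 1..3 copied from a

  alignas(16) float out[4];
  _mm_store_ps(out, r);
  assert(out[0] == 6.0f);
  assert(out[1] == 10.0f && out[2] == 20.0f && out[3] == 30.0f);
  return 0;
}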
def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3), - (COPY_TO_REGCLASS - (!cast<Instruction>(NAME#"SSr213r") - (COPY_TO_REGCLASS $src1, FR32), - (COPY_TO_REGCLASS $src2, FR32), - (COPY_TO_REGCLASS $src3, FR32)), - VR128)>; + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SSr213r_Int") + $src1, $src2, $src3), VR128)>; def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3), - (COPY_TO_REGCLASS - (!cast<Instruction>(NAME#"SDr213r") - (COPY_TO_REGCLASS $src1, FR64), - (COPY_TO_REGCLASS $src2, FR64), - (COPY_TO_REGCLASS $src3, FR64)), - VR128)>; + (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"SDr213r_Int") + $src1, $src2, $src3), VR128)>; } defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss, @@ -334,36 +377,23 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { } // isCodeGenOnly = 1 } -defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, - fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, - int_x86_fma_vfmadd_ss>; -defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, - fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfmadd_sd>; -defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, - fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, - int_x86_fma_vfmsub_ss>; -defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, - fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfmsub_sd>; -defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, - X86Fnmadd, loadf32>, - fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, - int_x86_fma_vfnmadd_ss>; -defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, - X86Fnmadd, loadf64>, - fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfnmadd_sd>; -defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, - X86Fnmsub, loadf32>, - fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, - int_x86_fma_vfnmsub_ss>; -defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, - X86Fnmsub, loadf64>, - fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfnmsub_sd>; - let ExeDomain = SSEPackedSingle in { + // Scalar Instructions + defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, + fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, + int_x86_fma_vfmadd_ss>; + defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, + fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, + int_x86_fma_vfmsub_ss>; + defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, + X86Fnmadd, loadf32>, + fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, + int_x86_fma_vfnmadd_ss>; + defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, + X86Fnmsub, loadf32>, + fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, + int_x86_fma_vfnmsub_ss>; + // Packed Instructions defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, loadv4f32, loadv8f32>; defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32, @@ -379,6 +409,22 @@ let ExeDomain = SSEPackedSingle in { } let ExeDomain = SSEPackedDouble in { + // Scalar Instructions + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, + fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfmadd_sd>; + defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, + fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfmsub_sd>; + defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, + X86Fnmadd, loadf64>, + 
fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, + int_x86_fma_vfnmadd_sd>; + defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, + X86Fnmsub, loadf64>, + fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, + int_x86_fma_vfnmsub_sd>; + // Packed Instructions defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, loadv2f64, loadv4f64>; defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64, diff --git a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td index 49068e9..03ae211 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFPStack.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFPStack.td @@ -137,69 +137,99 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP, // The FopST0 series are not included here because of the irregularities // in where the 'r' goes in assembly output. // These instructions cannot address 80-bit memory. -multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> { +multiclass FPBinary<SDNode OpNode, Format fp, string asmstring, + bit Forward = 1> { // ST(0) = ST(0) + [mem] def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, - (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (loadf32 addr:$src2))), + (set RFP32:$dst, + (OpNode (loadf32 addr:$src2), RFP32:$src1)))]>; def _Fp64m : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (loadf64 addr:$src2))), + (set RFP64:$dst, + (OpNode (loadf64 addr:$src2), RFP64:$src1)))]>; def _Fp64m32: FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, - (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2)))), + (set RFP64:$dst, + (OpNode (f64 (extloadf32 addr:$src2)), RFP64:$src1)))]>; def _Fp80m32: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf32 addr:$src2)), RFP80:$src1)))]>; def _Fp80m64: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f64mem:$src2), OneArgFPRW, - [(set RFP80:$dst, - (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>; + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2)))), + (set RFP80:$dst, + (OpNode (f80 (extloadf64 addr:$src2)), RFP80:$src1)))]>; +let mayLoad = 1 in def _F32m : FPI<0xD8, fp, (outs), (ins f32mem:$src), - !strconcat("f", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _F64m : FPI<0xDC, fp, (outs), (ins f64mem:$src), - !strconcat("f", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("f", asmstring, "{l}\t$src")>; // ST(0) = ST(0) + [memint] def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode RFP32:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i16))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i16), RFP32:$src1)))]>; def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP32:$dst, (OpNode 
RFP32:$src1, - (X86fild addr:$src2, i32)))]>; + [!if(Forward, + (set RFP32:$dst, + (OpNode RFP32:$src1, (X86fild addr:$src2, i32))), + (set RFP32:$dst, + (OpNode (X86fild addr:$src2, i32), RFP32:$src1)))]>; def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i16)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i16))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i16), RFP64:$src1)))]>; def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src1, - (X86fild addr:$src2, i32)))]>; + [!if(Forward, + (set RFP64:$dst, + (OpNode RFP64:$src1, (X86fild addr:$src2, i32))), + (set RFP64:$dst, + (OpNode (X86fild addr:$src2, i32), RFP64:$src1)))]>; def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i16)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i16))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i16), RFP80:$src1)))]>; def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), - OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src1, - (X86fild addr:$src2, i32)))]>; + OneArgFPRW, + [!if(Forward, + (set RFP80:$dst, + (OpNode RFP80:$src1, (X86fild addr:$src2, i32))), + (set RFP80:$dst, + (OpNode (X86fild addr:$src2, i32), RFP80:$src1)))]>; +let mayLoad = 1 in def _FI16m : FPI<0xDE, fp, (outs), (ins i16mem:$src), - !strconcat("fi", asmstring, "{s}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{s}\t$src")>; +let mayLoad = 1 in def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), - !strconcat("fi", asmstring, "{l}\t$src")> { - let mayLoad = 1; -} + !strconcat("fi", asmstring, "{l}\t$src")>; } let Defs = [FPSW] in { @@ -213,14 +243,14 @@ defm DIV : FPBinary_rr<fdiv>; let SchedRW = [WriteFAddLd] in { defm ADD : FPBinary<fadd, MRM0m, "add">; defm SUB : FPBinary<fsub, MRM4m, "sub">; -defm SUBR: FPBinary<fsub ,MRM5m, "subr">; +defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>; } let SchedRW = [WriteFMulLd] in { defm MUL : FPBinary<fmul, MRM1m, "mul">; } let SchedRW = [WriteFDivLd] in { defm DIV : FPBinary<fdiv, MRM6m, "div">; -defm DIVR: FPBinary<fdiv, MRM7m, "divr">; +defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>; } } @@ -306,13 +336,13 @@ def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; def FRSTORm : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">; def FSAVEm : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">; -def FNSTSWm : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">; +def FNSTSWm : FPI<0xDD, MRM7m, (outs i16mem:$dst), (ins), "fnstsw\t$dst">; def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; -def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f32mem:$src), "fbld\t$src">; -def FBSTPm : FPI<0xDF, MRM6m, (outs f32mem:$dst), (ins), "fbstp\t$dst">; +def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">; +def FBSTPm : FPI<0xDF, MRM6m, (outs f80mem:$dst), (ins), "fbstp\t$dst">; // Floating point cmovs. 
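Regarding the FPBinary changes above: the new Forward bit simply flips the operand order inside the pattern so that the reversed x87 instructions get selectable patterns of their own, since FSUBR computes mem - ST(0) and FDIVR computes mem / ST(0) rather than ST(0) op mem. A trivial scalar sketch (mine, not from the patch) of the two orientations an x87-only configuration has to distinguish:

// Value in ST(0), other operand in memory.
double fwd_sub(double x, const double *mem) { return x - *mem; }   // plain FSUB shape
double rev_sub(double x, const double *mem) { return *mem - x; }   // FSUBR shape (Forward = 0)

double fwd_div(double x, const double *mem) { return x / *mem; }   // plain FDIV shape
double rev_div(double x, const double *mem) { return *mem / x; }   // FDIVR shape (Forward = 0)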
class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : @@ -633,16 +663,18 @@ def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>; def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>; def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>; -def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), - "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; -def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), - "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], - IIC_FXSAVE>, TB, Requires<[In64BitMode]>; -def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB; -def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], - IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; +let Predicates = [HasFXSR] in { + def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; + def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], + IIC_FXSAVE>, TB, Requires<[In64BitMode]>; + def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB; + def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], + IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; +} // Predicates = [FeatureFXSR] } // SchedRW //===----------------------------------------------------------------------===// diff --git a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 1f61ffa..829cedd 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/contrib/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -38,6 +38,8 @@ def bc_mmx : PatFrag<(ops node:$in), (x86mmx (bitconvert node:$in))>; def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisFP<1>, SDTCisVT<3, i8>, SDTCisVec<1>]>; +def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, + SDTCisSameAs<1, 2>, SDTCisInt<3>]>; def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>; def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>; @@ -58,13 +60,17 @@ def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp, [SDNPCommutative, SDNPAssociative]>; def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; +def X86frsqrt14s: SDNode<"X86ISD::FRSQRT", SDTFPBinOp>; +def X86frcp14s : SDNode<"X86ISD::FRCP", SDTFPBinOp>; def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>; def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; +def X86comiSae : SDNode<"X86ISD::COMI", SDTX86CmpTestSae>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; +def X86ucomiSae: SDNode<"X86ISD::UCOMI", SDTX86CmpTestSae>; def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>; //def X86cmpsd : SDNode<"X86ISD::FSETCCsd", SDTX86Cmpsd>; def X86cvtdq2pd: SDNode<"X86ISD::CVTDQ2PD", @@ -74,11 +80,18 @@ def X86cvtudq2pd: SDNode<"X86ISD::CVTUDQ2PD", SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, SDTCisVT<1, v4i32>]>>; def X86pshufb : SDNode<"X86ISD::PSHUFB", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTypeProfile<1, 2, 
[SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; def X86psadbw : SDNode<"X86ISD::PSADBW", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; +def X86dbpsadbw : SDNode<"X86ISD::DBPSADBW", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, i8>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>, SDTCisInt<3>]>>; def X86andnp : SDNode<"X86ISD::ANDNP", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; @@ -86,9 +99,11 @@ def X86psign : SDNode<"X86ISD::PSIGN", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; def X86pextrb : SDNode<"X86ISD::PEXTRB", - SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v16i8>, + SDTCisPtrTy<2>]>>; def X86pextrw : SDNode<"X86ISD::PEXTRW", - SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, v8i16>, + SDTCisPtrTy<2>]>>; def X86pinsrb : SDNode<"X86ISD::PINSRB", SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>; @@ -114,19 +129,17 @@ def X86vsext : SDNode<"X86ISD::VSEXT", SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>]>>; -def X86vtrunc : SDNode<"X86ISD::VTRUNC", - SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisOpSmallerThanOp<0, 1>]>>; +def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>, + SDTCisOpSmallerThanOp<0, 1>]>; + +def X86vtrunc : SDNode<"X86ISD::VTRUNC", SDTVtrunc>; +def X86vtruncs : SDNode<"X86ISD::VTRUNCS", SDTVtrunc>; +def X86vtruncus : SDNode<"X86ISD::VTRUNCUS", SDTVtrunc>; + def X86trunc : SDNode<"X86ISD::TRUNC", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<0, 1>]>>; - -def X86vtruncm : SDNode<"X86ISD::VTRUNCM", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisInt<0>, SDTCisInt<1>, - SDTCisVec<2>, SDTCisInt<2>, - SDTCisOpSmallerThanOp<0, 2>]>>; def X86vfpext : SDNode<"X86ISD::VFPEXT", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, @@ -136,6 +149,35 @@ def X86vfpround: SDNode<"X86ISD::VFPROUND", SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>]>>; +def X86fround: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCVecEltisVT<2, f64>, + SDTCisOpSmallerThanOp<0, 1>]>>; +def X86froundRnd: SDNode<"X86ISD::VFPROUND", + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, f64>, + SDTCVecEltisVT<2, f64>, + SDTCisOpSmallerThanOp<0, 1>, + SDTCisInt<3>]>>; + +def X86fpext : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCVecEltisVT<2, f32>, + SDTCisOpSmallerThanOp<1, 0>]>>; + +def X86fpextRnd : SDNode<"X86ISD::VFPEXT", + SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisFP<1>,SDTCisFP<2>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, + SDTCVecEltisVT<2, f32>, + SDTCisOpSmallerThanOp<1, 0>, + SDTCisInt<3>]>>; + def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>; def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>; def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>; @@ -159,10 +201,15 @@ def X86CmpMaskCCRound : def X86CmpMaskCCScalar : SDTypeProfile<1, 3, [SDTCisInt<0>, 
SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; -def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; -def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; -def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; -def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; +def X86CmpMaskCCScalarRound : + SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, + SDTCisInt<4>]>; + +def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; +def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; +def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; +def X86cmpms : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalar>; +def X86cmpmsRnd : SDNode<"X86ISD::FSETCC", X86CmpMaskCCScalarRound>; def X86vshl : SDNode<"X86ISD::VSHL", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, @@ -178,6 +225,29 @@ def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; +def X86vprot : SDNode<"X86ISD::VPROT", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86vproti : SDNode<"X86ISD::VPROTI", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>]>>; + +def X86vpshl : SDNode<"X86ISD::VPSHL", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; +def X86vpsha : SDNode<"X86ISD::VPSHA", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>]>>; + +def X86vpcom : SDNode<"X86ISD::VPCOM", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; +def X86vpcomu : SDNode<"X86ISD::VPCOMU", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; + def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; @@ -190,6 +260,7 @@ def X86avg : SDNode<"X86ISD::AVG" , SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>; +def X86ktest : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>; def X86testm : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>, SDTCVecEltisVT<0, i1>, @@ -201,11 +272,15 @@ def X86testnm : SDNode<"X86ISD::TESTNM", SDTypeProfile<1, 2, [SDTCisVec<0>, def X86select : SDNode<"X86ISD::SELECT" , SDTSelect>; def X86pmuludq : SDNode<"X86ISD::PMULUDQ", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisSameAs<1,2>]>>; + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i32>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; def X86pmuldq : SDNode<"X86ISD::PMULDQ", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, - SDTCisSameAs<1,2>]>>; + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i64>, + SDTCVecEltisVT<1, i32>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>>; def X86extrqi : SDNode<"X86ISD::EXTRQI", SDTypeProfile<1, 3, [SDTCisVT<0, v2i64>, SDTCisSameAs<0,1>, @@ -221,24 +296,30 @@ def X86insertqi : SDNode<"X86ISD::INSERTQI", def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>; -def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>; def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisVec<2>]>; + SDTCisSameSizeAs<0,2>, + SDTCisSameNumEltsAs<0,2>]>; def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, - 
SDTCisSameAs<0,1>, SDTCisInt<2>]>; + SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>; def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, SDTCisInt<3>]>; + SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>; def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>, SDTCisInt<4>]>; +def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisInt<2>, SDTCisInt<3>]>; def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>; -def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>; +def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisInt<0>, SDTCisInt<1>]>; def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>; +def SDTTernlog : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisSameAs<0,3>, + SDTCisVT<4, i8>]>; + def SDTFPBinOpRound : SDTypeProfile<1, 3, [ // fadd_round, fmul_round, etc. SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>]>; @@ -250,15 +331,17 @@ def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisInt<4>]>; def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, - SDTCisVec<0>, SDTCisInt<2>]>; + SDTCisVec<0>, SDTCisVT<2, i32>]>; def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>, - SDTCisVec<0>, SDTCisInt<3>]>; + SDTCisVec<0>, SDTCisVT<3, i32>]>; def STDFp3SrcRm : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, - SDTCisVec<0>, SDTCisInt<3>, SDTCisInt<4>]>; + SDTCisVec<0>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>; def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; -def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>; + +def X86Abs : SDNode<"X86ISD::ABS", SDTIntUnaryOp>; +def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>; def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>; @@ -281,33 +364,74 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>; def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>; def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; -def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; +def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<1,2>]>; def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>; def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>; def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>; def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>; +def X86vpmaddubsw : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>; +def X86vpmaddwd : SDNode<"X86ISD::VPMADDWD" , SDTPack>; + def X86VPermilpv : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>; def X86VPermilpi : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>; -def X86VPermv : SDNode<"X86ISD::VPERMV", SDTShuff2Op>; +def X86VPermv : SDNode<"X86ISD::VPERMV", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,2>]>>; def X86VPermi : SDNode<"X86ISD::VPERMI", SDTShuff2OpI>; -def X86VPermv3 : SDNode<"X86ISD::VPERMV3", SDTShuff3Op>; -def X86VPermiv3 : SDNode<"X86ISD::VPERMIV3", SDTShuff3Op>; +def X86VPermt2 : SDNode<"X86ISD::VPERMV3", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0,1>, SDTCisInt<2>, + SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>, + SDTCisSameSizeAs<0,2>, + SDTCisSameAs<0,3>]>, []>; + +def X86VPermi2X : 
SDNode<"X86ISD::VPERMIV3", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, + SDTCisSameSizeAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>, []>; + +def X86vpternlog : SDNode<"X86ISD::VPTERNLOG", SDTTernlog>; def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; -def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>; -def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>; +def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPBinOpImmRound>; +def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>; +def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>; +def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>; +def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>; +def X86Vfpclass : SDNode<"X86ISD::VFPCLASS", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>, + SDTCisVec<1>, SDTCisFP<1>, + SDTCisSameNumEltsAs<0,1>, + SDTCisVT<2, i32>]>, []>; +def X86Vfpclasss : SDNode<"X86ISD::VFPCLASSS", + SDTypeProfile<1, 2, [SDTCisVT<0, i1>, + SDTCisFP<1>, SDTCisVT<2, i32>]>,[]>; def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSubVecOfVec<1, 0>]>, []>; +// SDTCisSubVecOfVec restriction cannot be applied for 128 bit version of VBROADCASTI32x2. +def X86SubV32x2Broadcast : SDNode<"X86ISD::SUBV_BROADCAST", + SDTypeProfile<1, 1, [SDTCisVec<0>, + SDTCisSameAs<0,1>]>, []>; + def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; +def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; def X86Vinsert : SDNode<"X86ISD::VINSERT", SDTypeProfile<1, 3, - [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>; + [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>, + SDTCisPtrTy<3>]>, []>; def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, - [SDTCisVec<1>, SDTCisPtrTy<2>]>, []>; + [SDTCisEltOfVec<0, 1>, SDTCisVec<1>, + SDTCisPtrTy<2>]>, []>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; @@ -317,11 +441,13 @@ def X86faddRnd : SDNode<"X86ISD::FADD_RND", SDTFPBinOpRound>; def X86fsubRnd : SDNode<"X86ISD::FSUB_RND", SDTFPBinOpRound>; def X86fmulRnd : SDNode<"X86ISD::FMUL_RND", SDTFPBinOpRound>; def X86fdivRnd : SDNode<"X86ISD::FDIV_RND", SDTFPBinOpRound>; -def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; -def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; -def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; -def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; -def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; +def X86fmaxRnd : SDNode<"X86ISD::FMAX_RND", SDTFPBinOpRound>; +def X86scalef : SDNode<"X86ISD::SCALEF", SDTFPBinOpRound>; +def X86fminRnd : SDNode<"X86ISD::FMIN_RND", SDTFPBinOpRound>; +def X86fsqrtRnd : SDNode<"X86ISD::FSQRT_RND", SDTFPUnaryOpRound>; +def X86fsqrtRnds : SDNode<"X86ISD::FSQRT_RND", STDFp2SrcRm>; +def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; +def X86fgetexpRnds : SDNode<"X86ISD::FGETEXP_RND", STDFp2SrcRm>; def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>; def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>; @@ -341,9 +467,11 @@ def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>; def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>; def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>; -def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>; -def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>; -def X86RndScale : SDNode<"X86ISD::RNDSCALE", STDFp3SrcRm>; +def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", 
STDFp2SrcRm>; +def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>; +def X86RndScales : SDNode<"X86ISD::VRNDSCALE", STDFp3SrcRm>; +def X86Reduces : SDNode<"X86ISD::VREDUCE", STDFp3SrcRm>; +def X86GetMants : SDNode<"X86ISD::VGETMANT", STDFp3SrcRm>; def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, @@ -362,7 +490,8 @@ def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, - SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisInt<3>]>; + SDTCisSameAs<0,1>, SDTCisInt<2>, + SDTCisVT<3, i32>]>; def SDTDoubleToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; @@ -371,9 +500,12 @@ def SDTFloatToInt: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, def SDTDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCVecEltisVT<1, f64>]>; +def SDTSDoubleToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>,SDTCisFP<1>, + SDTCVecEltisVT<1, f64>, SDTCisInt<2>]>; def SDTFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisInt<0>, SDTCVecEltisVT<1, f32>]>; - +def SDTSFloatToIntRnd: SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisFP<1>, + SDTCVecEltisVT<1, f32>, SDTCisInt<2>]>; def SDTVintToFPRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCVecEltisVT<1, i32>, SDTCisInt<2>]>; @@ -392,6 +524,10 @@ def SDTVFPToLongRound: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, def X86SintToFpRnd : SDNode<"X86ISD::SINT_TO_FP_RND", SDTintToFPRound>; def X86UintToFpRnd : SDNode<"X86ISD::UINT_TO_FP_RND", SDTintToFPRound>; +def X86cvttss2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSFloatToIntRnd>; +def X86cvttss2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSFloatToIntRnd>; +def X86cvttsd2IntRnd : SDNode<"X86ISD::FP_TO_SINT_RND", SDTSDoubleToIntRnd>; +def X86cvttsd2UIntRnd : SDNode<"X86ISD::FP_TO_UINT_RND", SDTSDoubleToIntRnd>; // Vector with rounding mode // cvtt fp-to-int staff @@ -417,17 +553,35 @@ def X86cvtps2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTFloatToInt>; def X86cvtpd2Int : SDNode<"X86ISD::FP_TO_SINT_RND", SDTDoubleToInt>; def X86cvtpd2UInt : SDNode<"X86ISD::FP_TO_UINT_RND", SDTDoubleToInt>; +def X86cvtph2ps : SDNode<"ISD::FP16_TO_FP", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, i16>, + SDTCisFP<0>, + SDTCisVT<2, i32>]> >; + +def X86cvtps2ph : SDNode<"ISD::FP_TO_FP16", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, + SDTCVecEltisVT<0, i16>, + SDTCVecEltisVT<1, f32>, + SDTCisFP<1>, SDTCisVT<2, i32>, + SDTCisVT<3, i32>]> >; def X86vfpextRnd : SDNode<"X86ISD::VFPEXT", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, + SDTCVecEltisVT<0, f64>, + SDTCVecEltisVT<1, f32>, SDTCisOpSmallerThanOp<1, 0>, - SDTCisInt<2>]>>; + SDTCisVT<2, i32>]>>; def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>, SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, f64>, - SDTCisInt<2>]>>; + SDTCisOpSmallerThanOp<0, 1>, + SDTCisVT<2, i32>]>>; + +def X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>; //===----------------------------------------------------------------------===// // SSE Complex Patterns @@ -436,10 +590,10 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND", // These are 'extloads' from a scalar to the low element of a vector, zeroing // the top elements. These are used for the SSE 'ss' and 'sd' instruction // forms. 
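A small hedged example (not from the patch; add_low is an invented name) of the source shape this complex pattern exists for: a scalar f32 load feeding the low lane of an 'ss' operation, which the backend can then fold into the memory form of ADDSS instead of emitting a separate scalar move:

#include <xmmintrin.h>

__m128 add_low(__m128 acc, const float *p) {
  // _mm_load_ss loads *p into lane 0 and zeroes the upper lanes, matching the
  // "extload with zeroed top elements" description above; _mm_add_ss then adds
  // only the low lanes and passes the upper lanes of acc through.
  return _mm_add_ss(acc, _mm_load_ss(p));
}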
-def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [], +def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [], [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPWantRoot]>; -def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [], +def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [], [SDNPHasChain, SDNPMayLoad, SDNPMemOperand, SDNPWantRoot]>; @@ -490,9 +644,9 @@ def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; // The memory operand is required to be a 128-bit load, so it must be converted // from a vector to a scalar. def loadf32_128 : PatFrag<(ops node:$ptr), - (f32 (vector_extract (loadv4f32 node:$ptr), (iPTR 0)))>; + (f32 (extractelt (loadv4f32 node:$ptr), (iPTR 0)))>; def loadf64_128 : PatFrag<(ops node:$ptr), - (f64 (vector_extract (loadv2f64 node:$ptr), (iPTR 0)))>; + (f64 (extractelt (loadv2f64 node:$ptr), (iPTR 0)))>; // Like 'store', but always requires 128-bit vector alignment. def alignedstore : PatFrag<(ops node:$val, node:$ptr), @@ -590,9 +744,9 @@ def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; // The memory operand is required to be a 128-bit load, so it must be converted // from a vector to a scalar. def memopfsf32_128 : PatFrag<(ops node:$ptr), - (f32 (vector_extract (memopv4f32 node:$ptr), (iPTR 0)))>; + (f32 (extractelt (memopv4f32 node:$ptr), (iPTR 0)))>; def memopfsf64_128 : PatFrag<(ops node:$ptr), - (f64 (vector_extract (memopv2f64 node:$ptr), (iPTR 0)))>; + (f64 (extractelt (memopv2f64 node:$ptr), (iPTR 0)))>; // SSSE3 uses MMX registers for some instructions. They aren't aligned on a @@ -604,32 +758,6 @@ def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>; -// MOVNT Support -// Like 'store', but requires the non-temporal bit to be set -def nontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) - return ST->isNonTemporal(); - return false; -}]>; - -def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) - return ST->isNonTemporal() && !ST->isTruncatingStore() && - ST->getAddressingMode() == ISD::UNINDEXED && - ST->getAlignment() >= 16; - return false; -}]>; - -def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr), - (st node:$val, node:$ptr), [{ - if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) - return ST->isNonTemporal() && - ST->getAlignment() < 16; - return false; -}]>; - def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_gather node:$src1, node:$src2, node:$src3) , [{ if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) @@ -851,29 +979,59 @@ def masked_load_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), return isa<MaskedLoadSDNode>(N); }]>; +// masked store fragments. 
+// X86mstore can't be implemented in core DAG files because some targets +// doesn't support vector type ( llvm-tblgen will fail) +def X86mstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return !cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; + def masked_store_aligned128 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) return Store->getAlignment() >= 16; return false; }]>; def masked_store_aligned256 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) return Store->getAlignment() >= 32; return false; }]>; def masked_store_aligned512 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ if (auto *Store = dyn_cast<MaskedStoreSDNode>(N)) return Store->getAlignment() >= 64; return false; }]>; def masked_store_unaligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_store node:$src1, node:$src2, node:$src3), [{ + (X86mstore node:$src1, node:$src2, node:$src3), [{ return isa<MaskedStoreSDNode>(N); }]>; +// masked truncstore fragments +// X86mtruncstore can't be implemented in core DAG files because some targets +// doesn't support vector type ( llvm-tblgen will fail) +def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (masked_store node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); +}]>; +def masked_truncstorevi8 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def masked_truncstorevi16 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def masked_truncstorevi32 : + PatFrag<(ops node:$src1, node:$src2, node:$src3), + (X86mtruncstore node:$src1, node:$src2, node:$src3), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index cf68ef0..63e78de 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" @@ -101,9 +102,11 @@ struct X86MemoryFoldTableEntry { void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(X86Subtarget &STI) - : X86GenInstrInfo( - (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), - (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), + : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 + : X86::ADJCALLSTACKDOWN32), + (STI.isTarget64BitLP64() ? 
X86::ADJCALLSTACKUP64 + : X86::ADJCALLSTACKUP32), + X86::CATCHRET), Subtarget(STI), RI(STI.getTargetTriple()) { static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { @@ -332,6 +335,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE }, { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE }, + { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD }, + { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD }, + { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD }, { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, @@ -495,7 +501,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, - { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 }, { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, @@ -605,7 +610,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, - { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, @@ -1647,6 +1651,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PEXT32rr, X86::PEXT32rm, 0 }, { X86::PEXT64rr, X86::PEXT64rm, 0 }, + // ADX foldable instructions + { X86::ADCX32rr, X86::ADCX32rm, 0 }, + { X86::ADCX64rr, X86::ADCX64rm, 0 }, + { X86::ADOX32rr, X86::ADOX32rm, 0 }, + { X86::ADOX64rr, X86::ADOX64rm, 0 }, + // AVX-512 foldable instructions { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, @@ -1729,11 +1739,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { // FMA foldable instructions { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE }, { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE }, { X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE }, { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE }, { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE }, @@ -1749,11 +1765,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr132r_Int, 
X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE }, { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE }, { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE }, @@ -1769,11 +1791,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE }, { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE }, @@ -1789,11 +1817,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE }, { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE }, @@ -2282,7 +2316,35 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI, case X86::FsVMOVAPSrm: case X86::FsVMOVAPDrm: case X86::FsMOVAPSrm: - case X86::FsMOVAPDrm: { + case X86::FsMOVAPDrm: + // AVX-512 + case X86::VMOVAPDZ128rm: + case X86::VMOVAPDZ256rm: + case X86::VMOVAPDZrm: + case X86::VMOVAPSZ128rm: + case X86::VMOVAPSZ256rm: + case X86::VMOVAPSZrm: + case X86::VMOVDQA32Z128rm: + case X86::VMOVDQA32Z256rm: + case X86::VMOVDQA32Zrm: + case X86::VMOVDQA64Z128rm: + case X86::VMOVDQA64Z256rm: + case X86::VMOVDQA64Zrm: + case X86::VMOVDQU16Z128rm: + case X86::VMOVDQU16Z256rm: + case X86::VMOVDQU16Zrm: + case X86::VMOVDQU32Z128rm: + case X86::VMOVDQU32Z256rm: + case X86::VMOVDQU32Zrm: + case X86::VMOVDQU64Z128rm: + case X86::VMOVDQU64Z256rm: + case X86::VMOVDQU64Zrm: + case X86::VMOVDQU8Z128rm: + case X86::VMOVDQU8Z256rm: + case X86::VMOVDQU8Zrm: + case X86::VMOVUPSZ128rm: + case X86::VMOVUPSZ256rm: + case X86::VMOVUPSZrm: { // Loads from constant pools are trivially rematerializable. 
if (MI->getOperand(1+X86::AddrBaseReg).isReg() && MI->getOperand(1+X86::AddrScaleAmt).isImm() && @@ -2363,9 +2425,8 @@ bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB, // It is safe to clobber EFLAGS at the end of a block of no successor has it // live in. if (Iter == E) { - for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(), - SE = MBB.succ_end(); SI != SE; ++SI) - if ((*SI)->isLiveIn(X86::EFLAGS)) + for (MachineBasicBlock *S : MBB.successors()) + if (S->isLiveIn(X86::EFLAGS)) return false; return true; } @@ -2411,13 +2472,29 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, const TargetRegisterInfo &TRI) const { - // MOV32r0 is implemented with a xor which clobbers condition code. - // Re-materialize it as movri instructions to avoid side effects. - unsigned Opc = Orig->getOpcode(); - if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) { + bool ClobbersEFLAGS = false; + for (const MachineOperand &MO : Orig->operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { + ClobbersEFLAGS = true; + break; + } + } + + if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) { + // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side + // effects. + int Value; + switch (Orig->getOpcode()) { + case X86::MOV32r0: Value = 0; break; + case X86::MOV32r1: Value = 1; break; + case X86::MOV32r_1: Value = -1; break; + default: + llvm_unreachable("Unexpected instruction!"); + } + DebugLoc DL = Orig->getDebugLoc(); BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0)) - .addImm(0); + .addImm(Value); } else { MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig); MBB.insert(I, MI); @@ -2428,7 +2505,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, } /// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead. -static bool hasLiveCondCodeDef(MachineInstr *MI) { +bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr *MI) const { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.isDef() && @@ -2453,7 +2530,7 @@ inline static unsigned getTruncatedShiftCount(MachineInstr *MI, inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { // Left shift instructions can be transformed into load-effective-address // instructions if we can encode them appropriately. - // A LEA instruction utilizes a SIB byte to encode it's scale factor. + // A LEA instruction utilizes a SIB byte to encode its scale factor. // The SIB.scale field is two bits wide which means that we can encode any // shift amount less than 4. return ShAmt < 4 && ShAmt > 0; @@ -2493,7 +2570,7 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, ImplicitOp = Src; ImplicitOp.setImplicit(); - NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64); + NewSrc = getX86SubSuperRegister(Src.getReg(), 64); MachineBasicBlock::LivenessQueryResult LQR = MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); @@ -2914,10 +2991,162 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, return NewMI; } -/// We have a few instructions that must be hacked on to commute them. -/// -MachineInstr * -X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { +/// Returns true if the given instruction opcode is FMA3. +/// Otherwise, returns false. +/// The second parameter is optional and is used as the second return from +/// the function. 
It is set to true if the given instruction has FMA3 opcode +/// that is used for lowering of scalar FMA intrinsics, and it is set to false +/// otherwise. +static bool isFMA3(unsigned Opcode, bool *IsIntrinsic = nullptr) { + if (IsIntrinsic) + *IsIntrinsic = false; + + switch (Opcode) { + case X86::VFMADDSDr132r: case X86::VFMADDSDr132m: + case X86::VFMADDSSr132r: case X86::VFMADDSSr132m: + case X86::VFMSUBSDr132r: case X86::VFMSUBSDr132m: + case X86::VFMSUBSSr132r: case X86::VFMSUBSSr132m: + case X86::VFNMADDSDr132r: case X86::VFNMADDSDr132m: + case X86::VFNMADDSSr132r: case X86::VFNMADDSSr132m: + case X86::VFNMSUBSDr132r: case X86::VFNMSUBSDr132m: + case X86::VFNMSUBSSr132r: case X86::VFNMSUBSSr132m: + + case X86::VFMADDSDr213r: case X86::VFMADDSDr213m: + case X86::VFMADDSSr213r: case X86::VFMADDSSr213m: + case X86::VFMSUBSDr213r: case X86::VFMSUBSDr213m: + case X86::VFMSUBSSr213r: case X86::VFMSUBSSr213m: + case X86::VFNMADDSDr213r: case X86::VFNMADDSDr213m: + case X86::VFNMADDSSr213r: case X86::VFNMADDSSr213m: + case X86::VFNMSUBSDr213r: case X86::VFNMSUBSDr213m: + case X86::VFNMSUBSSr213r: case X86::VFNMSUBSSr213m: + + case X86::VFMADDSDr231r: case X86::VFMADDSDr231m: + case X86::VFMADDSSr231r: case X86::VFMADDSSr231m: + case X86::VFMSUBSDr231r: case X86::VFMSUBSDr231m: + case X86::VFMSUBSSr231r: case X86::VFMSUBSSr231m: + case X86::VFNMADDSDr231r: case X86::VFNMADDSDr231m: + case X86::VFNMADDSSr231r: case X86::VFNMADDSSr231m: + case X86::VFNMSUBSDr231r: case X86::VFNMSUBSDr231m: + case X86::VFNMSUBSSr231r: case X86::VFNMSUBSSr231m: + + case X86::VFMADDSUBPDr132r: case X86::VFMADDSUBPDr132m: + case X86::VFMADDSUBPSr132r: case X86::VFMADDSUBPSr132m: + case X86::VFMSUBADDPDr132r: case X86::VFMSUBADDPDr132m: + case X86::VFMSUBADDPSr132r: case X86::VFMSUBADDPSr132m: + case X86::VFMADDSUBPDr132rY: case X86::VFMADDSUBPDr132mY: + case X86::VFMADDSUBPSr132rY: case X86::VFMADDSUBPSr132mY: + case X86::VFMSUBADDPDr132rY: case X86::VFMSUBADDPDr132mY: + case X86::VFMSUBADDPSr132rY: case X86::VFMSUBADDPSr132mY: + + case X86::VFMADDPDr132r: case X86::VFMADDPDr132m: + case X86::VFMADDPSr132r: case X86::VFMADDPSr132m: + case X86::VFMSUBPDr132r: case X86::VFMSUBPDr132m: + case X86::VFMSUBPSr132r: case X86::VFMSUBPSr132m: + case X86::VFNMADDPDr132r: case X86::VFNMADDPDr132m: + case X86::VFNMADDPSr132r: case X86::VFNMADDPSr132m: + case X86::VFNMSUBPDr132r: case X86::VFNMSUBPDr132m: + case X86::VFNMSUBPSr132r: case X86::VFNMSUBPSr132m: + case X86::VFMADDPDr132rY: case X86::VFMADDPDr132mY: + case X86::VFMADDPSr132rY: case X86::VFMADDPSr132mY: + case X86::VFMSUBPDr132rY: case X86::VFMSUBPDr132mY: + case X86::VFMSUBPSr132rY: case X86::VFMSUBPSr132mY: + case X86::VFNMADDPDr132rY: case X86::VFNMADDPDr132mY: + case X86::VFNMADDPSr132rY: case X86::VFNMADDPSr132mY: + case X86::VFNMSUBPDr132rY: case X86::VFNMSUBPDr132mY: + case X86::VFNMSUBPSr132rY: case X86::VFNMSUBPSr132mY: + + case X86::VFMADDSUBPDr213r: case X86::VFMADDSUBPDr213m: + case X86::VFMADDSUBPSr213r: case X86::VFMADDSUBPSr213m: + case X86::VFMSUBADDPDr213r: case X86::VFMSUBADDPDr213m: + case X86::VFMSUBADDPSr213r: case X86::VFMSUBADDPSr213m: + case X86::VFMADDSUBPDr213rY: case X86::VFMADDSUBPDr213mY: + case X86::VFMADDSUBPSr213rY: case X86::VFMADDSUBPSr213mY: + case X86::VFMSUBADDPDr213rY: case X86::VFMSUBADDPDr213mY: + case X86::VFMSUBADDPSr213rY: case X86::VFMSUBADDPSr213mY: + + case X86::VFMADDPDr213r: case X86::VFMADDPDr213m: + case X86::VFMADDPSr213r: case X86::VFMADDPSr213m: + case X86::VFMSUBPDr213r: case X86::VFMSUBPDr213m: + case 
X86::VFMSUBPSr213r: case X86::VFMSUBPSr213m: + case X86::VFNMADDPDr213r: case X86::VFNMADDPDr213m: + case X86::VFNMADDPSr213r: case X86::VFNMADDPSr213m: + case X86::VFNMSUBPDr213r: case X86::VFNMSUBPDr213m: + case X86::VFNMSUBPSr213r: case X86::VFNMSUBPSr213m: + case X86::VFMADDPDr213rY: case X86::VFMADDPDr213mY: + case X86::VFMADDPSr213rY: case X86::VFMADDPSr213mY: + case X86::VFMSUBPDr213rY: case X86::VFMSUBPDr213mY: + case X86::VFMSUBPSr213rY: case X86::VFMSUBPSr213mY: + case X86::VFNMADDPDr213rY: case X86::VFNMADDPDr213mY: + case X86::VFNMADDPSr213rY: case X86::VFNMADDPSr213mY: + case X86::VFNMSUBPDr213rY: case X86::VFNMSUBPDr213mY: + case X86::VFNMSUBPSr213rY: case X86::VFNMSUBPSr213mY: + + case X86::VFMADDSUBPDr231r: case X86::VFMADDSUBPDr231m: + case X86::VFMADDSUBPSr231r: case X86::VFMADDSUBPSr231m: + case X86::VFMSUBADDPDr231r: case X86::VFMSUBADDPDr231m: + case X86::VFMSUBADDPSr231r: case X86::VFMSUBADDPSr231m: + case X86::VFMADDSUBPDr231rY: case X86::VFMADDSUBPDr231mY: + case X86::VFMADDSUBPSr231rY: case X86::VFMADDSUBPSr231mY: + case X86::VFMSUBADDPDr231rY: case X86::VFMSUBADDPDr231mY: + case X86::VFMSUBADDPSr231rY: case X86::VFMSUBADDPSr231mY: + + case X86::VFMADDPDr231r: case X86::VFMADDPDr231m: + case X86::VFMADDPSr231r: case X86::VFMADDPSr231m: + case X86::VFMSUBPDr231r: case X86::VFMSUBPDr231m: + case X86::VFMSUBPSr231r: case X86::VFMSUBPSr231m: + case X86::VFNMADDPDr231r: case X86::VFNMADDPDr231m: + case X86::VFNMADDPSr231r: case X86::VFNMADDPSr231m: + case X86::VFNMSUBPDr231r: case X86::VFNMSUBPDr231m: + case X86::VFNMSUBPSr231r: case X86::VFNMSUBPSr231m: + case X86::VFMADDPDr231rY: case X86::VFMADDPDr231mY: + case X86::VFMADDPSr231rY: case X86::VFMADDPSr231mY: + case X86::VFMSUBPDr231rY: case X86::VFMSUBPDr231mY: + case X86::VFMSUBPSr231rY: case X86::VFMSUBPSr231mY: + case X86::VFNMADDPDr231rY: case X86::VFNMADDPDr231mY: + case X86::VFNMADDPSr231rY: case X86::VFNMADDPSr231mY: + case X86::VFNMSUBPDr231rY: case X86::VFNMSUBPDr231mY: + case X86::VFNMSUBPSr231rY: case X86::VFNMSUBPSr231mY: + return true; + + case X86::VFMADDSDr132r_Int: case X86::VFMADDSDr132m_Int: + case X86::VFMADDSSr132r_Int: case X86::VFMADDSSr132m_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFMSUBSDr132m_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFMSUBSSr132m_Int: + case X86::VFNMADDSDr132r_Int: case X86::VFNMADDSDr132m_Int: + case X86::VFNMADDSSr132r_Int: case X86::VFNMADDSSr132m_Int: + case X86::VFNMSUBSDr132r_Int: case X86::VFNMSUBSDr132m_Int: + case X86::VFNMSUBSSr132r_Int: case X86::VFNMSUBSSr132m_Int: + + case X86::VFMADDSDr213r_Int: case X86::VFMADDSDr213m_Int: + case X86::VFMADDSSr213r_Int: case X86::VFMADDSSr213m_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFMSUBSDr213m_Int: + case X86::VFMSUBSSr213r_Int: case X86::VFMSUBSSr213m_Int: + case X86::VFNMADDSDr213r_Int: case X86::VFNMADDSDr213m_Int: + case X86::VFNMADDSSr213r_Int: case X86::VFNMADDSSr213m_Int: + case X86::VFNMSUBSDr213r_Int: case X86::VFNMSUBSDr213m_Int: + case X86::VFNMSUBSSr213r_Int: case X86::VFNMSUBSSr213m_Int: + + case X86::VFMADDSDr231r_Int: case X86::VFMADDSDr231m_Int: + case X86::VFMADDSSr231r_Int: case X86::VFMADDSSr231m_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFMSUBSDr231m_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFMSUBSSr231m_Int: + case X86::VFNMADDSDr231r_Int: case X86::VFNMADDSDr231m_Int: + case X86::VFNMADDSSr231r_Int: case X86::VFNMADDSSr231m_Int: + case X86::VFNMSUBSDr231r_Int: case X86::VFNMSUBSDr231m_Int: + case X86::VFNMSUBSSr231r_Int: case X86::VFNMSUBSSr231m_Int: + if (IsIntrinsic) + 
*IsIntrinsic = true; + return true; + default: + return false; + } + llvm_unreachable("Opcode not handled by the switch"); +} + +MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr *MI, + bool NewMI, + unsigned OpIdx1, + unsigned OpIdx2) const { switch (MI->getOpcode()) { case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) @@ -2944,7 +3173,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { } MI->setDesc(get(Opc)); MI->getOperand(3).setImm(Size-Amt); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::BLENDPDrri: case X86::BLENDPSrri: @@ -2980,7 +3209,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm(Mask ^ Imm); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::PCLMULQDQrr: case X86::VPCLMULQDQrr:{ @@ -2995,7 +3224,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::CMPPDrri: case X86::CMPPSrri: @@ -3016,7 +3245,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { MI = MF.CloneMachineInstr(MI); NewMI = false; } - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); default: return nullptr; } @@ -3045,7 +3274,7 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { NewMI = false; } MI->getOperand(3).setImm(Imm); - return TargetInstrInfo::commuteInstruction(MI, NewMI); + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: @@ -3124,11 +3353,272 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const { // Fallthrough intended. } default: - return TargetInstrInfo::commuteInstruction(MI, NewMI); + if (isFMA3(MI->getOpcode())) { + unsigned Opc = getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2); + if (Opc == 0) + return nullptr; + if (NewMI) { + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->setDesc(get(Opc)); + } + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); + } +} + +bool X86InstrInfo::findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const { + + unsigned RegOpsNum = isMem(MI, 3) ? 2 : 3; + + // Only the first RegOpsNum operands are commutable. + // Also, the value 'CommuteAnyOperandIndex' is valid here as it means + // that the operand is not specified/fixed. + if (SrcOpIdx1 != CommuteAnyOperandIndex && + (SrcOpIdx1 < 1 || SrcOpIdx1 > RegOpsNum)) + return false; + if (SrcOpIdx2 != CommuteAnyOperandIndex && + (SrcOpIdx2 < 1 || SrcOpIdx2 > RegOpsNum)) + return false; + + // Look for two different register operands assumed to be commutable + // regardless of the FMA opcode. The FMA opcode is adjusted later. 
+ if (SrcOpIdx1 == CommuteAnyOperandIndex || + SrcOpIdx2 == CommuteAnyOperandIndex) { + unsigned CommutableOpIdx1 = SrcOpIdx1; + unsigned CommutableOpIdx2 = SrcOpIdx2; + + // At least one of operands to be commuted is not specified and + // this method is free to choose appropriate commutable operands. + if (SrcOpIdx1 == SrcOpIdx2) + // Both of operands are not fixed. By default set one of commutable + // operands to the last register operand of the instruction. + CommutableOpIdx2 = RegOpsNum; + else if (SrcOpIdx2 == CommuteAnyOperandIndex) + // Only one of operands is not fixed. + CommutableOpIdx2 = SrcOpIdx1; + + // CommutableOpIdx2 is well defined now. Let's choose another commutable + // operand and assign its index to CommutableOpIdx1. + unsigned Op2Reg = MI->getOperand(CommutableOpIdx2).getReg(); + for (CommutableOpIdx1 = RegOpsNum; CommutableOpIdx1 > 0; CommutableOpIdx1--) { + // The commuted operands must have different registers. + // Otherwise, the commute transformation does not change anything and + // is useless then. + if (Op2Reg != MI->getOperand(CommutableOpIdx1).getReg()) + break; + } + + // No appropriate commutable operands were found. + if (CommutableOpIdx1 == 0) + return false; + + // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2 + // to return those values. + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + } + + // Check if we can adjust the opcode to preserve the semantics when + // commute the register operands. + return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2) != 0; +} + +unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const { + unsigned Opc = MI->getOpcode(); + + // Define the array that holds FMA opcodes in groups + // of 3 opcodes(132, 213, 231) in each group. 
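// Illustrative sketch, not part of the LLVM change: what the 132/213/231
// suffixes in the tables below mean. With source operands op1 (tied to the
// destination), op2 and op3, form XYZ computes op1 = opX * opY + opZ.
// A scalar stand-in using std::fma (variable names ours):
#include <cassert>
#include <cmath>

int main() {
  double op1 = 2.0, op2 = 3.0, op3 = 5.0;
  double f132 = std::fma(op1, op3, op2);  // VFMADD132: op1*op3 + op2 = 13
  double f213 = std::fma(op2, op1, op3);  // VFMADD213: op2*op1 + op3 = 11
  double f231 = std::fma(op2, op3, op1);  // VFMADD231: op2*op3 + op1 = 17
  assert(f132 == 13.0 && f213 == 11.0 && f231 == 17.0);
  return 0;
}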
+ static const unsigned RegularOpcodeGroups[][3] = { + { X86::VFMADDSSr132r, X86::VFMADDSSr213r, X86::VFMADDSSr231r }, + { X86::VFMADDSDr132r, X86::VFMADDSDr213r, X86::VFMADDSDr231r }, + { X86::VFMADDPSr132r, X86::VFMADDPSr213r, X86::VFMADDPSr231r }, + { X86::VFMADDPDr132r, X86::VFMADDPDr213r, X86::VFMADDPDr231r }, + { X86::VFMADDPSr132rY, X86::VFMADDPSr213rY, X86::VFMADDPSr231rY }, + { X86::VFMADDPDr132rY, X86::VFMADDPDr213rY, X86::VFMADDPDr231rY }, + { X86::VFMADDSSr132m, X86::VFMADDSSr213m, X86::VFMADDSSr231m }, + { X86::VFMADDSDr132m, X86::VFMADDSDr213m, X86::VFMADDSDr231m }, + { X86::VFMADDPSr132m, X86::VFMADDPSr213m, X86::VFMADDPSr231m }, + { X86::VFMADDPDr132m, X86::VFMADDPDr213m, X86::VFMADDPDr231m }, + { X86::VFMADDPSr132mY, X86::VFMADDPSr213mY, X86::VFMADDPSr231mY }, + { X86::VFMADDPDr132mY, X86::VFMADDPDr213mY, X86::VFMADDPDr231mY }, + + { X86::VFMSUBSSr132r, X86::VFMSUBSSr213r, X86::VFMSUBSSr231r }, + { X86::VFMSUBSDr132r, X86::VFMSUBSDr213r, X86::VFMSUBSDr231r }, + { X86::VFMSUBPSr132r, X86::VFMSUBPSr213r, X86::VFMSUBPSr231r }, + { X86::VFMSUBPDr132r, X86::VFMSUBPDr213r, X86::VFMSUBPDr231r }, + { X86::VFMSUBPSr132rY, X86::VFMSUBPSr213rY, X86::VFMSUBPSr231rY }, + { X86::VFMSUBPDr132rY, X86::VFMSUBPDr213rY, X86::VFMSUBPDr231rY }, + { X86::VFMSUBSSr132m, X86::VFMSUBSSr213m, X86::VFMSUBSSr231m }, + { X86::VFMSUBSDr132m, X86::VFMSUBSDr213m, X86::VFMSUBSDr231m }, + { X86::VFMSUBPSr132m, X86::VFMSUBPSr213m, X86::VFMSUBPSr231m }, + { X86::VFMSUBPDr132m, X86::VFMSUBPDr213m, X86::VFMSUBPDr231m }, + { X86::VFMSUBPSr132mY, X86::VFMSUBPSr213mY, X86::VFMSUBPSr231mY }, + { X86::VFMSUBPDr132mY, X86::VFMSUBPDr213mY, X86::VFMSUBPDr231mY }, + + { X86::VFNMADDSSr132r, X86::VFNMADDSSr213r, X86::VFNMADDSSr231r }, + { X86::VFNMADDSDr132r, X86::VFNMADDSDr213r, X86::VFNMADDSDr231r }, + { X86::VFNMADDPSr132r, X86::VFNMADDPSr213r, X86::VFNMADDPSr231r }, + { X86::VFNMADDPDr132r, X86::VFNMADDPDr213r, X86::VFNMADDPDr231r }, + { X86::VFNMADDPSr132rY, X86::VFNMADDPSr213rY, X86::VFNMADDPSr231rY }, + { X86::VFNMADDPDr132rY, X86::VFNMADDPDr213rY, X86::VFNMADDPDr231rY }, + { X86::VFNMADDSSr132m, X86::VFNMADDSSr213m, X86::VFNMADDSSr231m }, + { X86::VFNMADDSDr132m, X86::VFNMADDSDr213m, X86::VFNMADDSDr231m }, + { X86::VFNMADDPSr132m, X86::VFNMADDPSr213m, X86::VFNMADDPSr231m }, + { X86::VFNMADDPDr132m, X86::VFNMADDPDr213m, X86::VFNMADDPDr231m }, + { X86::VFNMADDPSr132mY, X86::VFNMADDPSr213mY, X86::VFNMADDPSr231mY }, + { X86::VFNMADDPDr132mY, X86::VFNMADDPDr213mY, X86::VFNMADDPDr231mY }, + + { X86::VFNMSUBSSr132r, X86::VFNMSUBSSr213r, X86::VFNMSUBSSr231r }, + { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr213r, X86::VFNMSUBSDr231r }, + { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr213r, X86::VFNMSUBPSr231r }, + { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr213r, X86::VFNMSUBPDr231r }, + { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr231rY }, + { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr231rY }, + { X86::VFNMSUBSSr132m, X86::VFNMSUBSSr213m, X86::VFNMSUBSSr231m }, + { X86::VFNMSUBSDr132m, X86::VFNMSUBSDr213m, X86::VFNMSUBSDr231m }, + { X86::VFNMSUBPSr132m, X86::VFNMSUBPSr213m, X86::VFNMSUBPSr231m }, + { X86::VFNMSUBPDr132m, X86::VFNMSUBPDr213m, X86::VFNMSUBPDr231m }, + { X86::VFNMSUBPSr132mY, X86::VFNMSUBPSr213mY, X86::VFNMSUBPSr231mY }, + { X86::VFNMSUBPDr132mY, X86::VFNMSUBPDr213mY, X86::VFNMSUBPDr231mY }, + + { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr231r }, + { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr231r }, + { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr213rY, 
X86::VFMADDSUBPSr231rY }, + { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr231rY }, + { X86::VFMADDSUBPSr132m, X86::VFMADDSUBPSr213m, X86::VFMADDSUBPSr231m }, + { X86::VFMADDSUBPDr132m, X86::VFMADDSUBPDr213m, X86::VFMADDSUBPDr231m }, + { X86::VFMADDSUBPSr132mY, X86::VFMADDSUBPSr213mY, X86::VFMADDSUBPSr231mY }, + { X86::VFMADDSUBPDr132mY, X86::VFMADDSUBPDr213mY, X86::VFMADDSUBPDr231mY }, + + { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr231r }, + { X86::VFMSUBADDPDr132r, X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr231r }, + { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr231rY }, + { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr231rY }, + { X86::VFMSUBADDPSr132m, X86::VFMSUBADDPSr213m, X86::VFMSUBADDPSr231m }, + { X86::VFMSUBADDPDr132m, X86::VFMSUBADDPDr213m, X86::VFMSUBADDPDr231m }, + { X86::VFMSUBADDPSr132mY, X86::VFMSUBADDPSr213mY, X86::VFMSUBADDPSr231mY }, + { X86::VFMSUBADDPDr132mY, X86::VFMSUBADDPDr213mY, X86::VFMSUBADDPDr231mY } + }; + + // Define the array that holds FMA*_Int opcodes in groups + // of 3 opcodes(132, 213, 231) in each group. + static const unsigned IntrinOpcodeGroups[][3] = { + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr213r_Int, X86::VFMADDSSr231r_Int }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr213r_Int, X86::VFMADDSDr231r_Int }, + { X86::VFMADDSSr132m_Int, X86::VFMADDSSr213m_Int, X86::VFMADDSSr231m_Int }, + { X86::VFMADDSDr132m_Int, X86::VFMADDSDr213m_Int, X86::VFMADDSDr231m_Int }, + + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr231r_Int }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr231r_Int }, + { X86::VFMSUBSSr132m_Int, X86::VFMSUBSSr213m_Int, X86::VFMSUBSSr231m_Int }, + { X86::VFMSUBSDr132m_Int, X86::VFMSUBSDr213m_Int, X86::VFMSUBSDr231m_Int }, + + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr231r_Int }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr231r_Int }, + { X86::VFNMADDSSr132m_Int, X86::VFNMADDSSr213m_Int, X86::VFNMADDSSr231m_Int }, + { X86::VFNMADDSDr132m_Int, X86::VFNMADDSDr213m_Int, X86::VFNMADDSDr231m_Int }, + + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr231r_Int }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr231r_Int }, + { X86::VFNMSUBSSr132m_Int, X86::VFNMSUBSSr213m_Int, X86::VFNMSUBSSr231m_Int }, + { X86::VFNMSUBSDr132m_Int, X86::VFNMSUBSDr213m_Int, X86::VFNMSUBSDr231m_Int }, + }; + + const unsigned Form132Index = 0; + const unsigned Form213Index = 1; + const unsigned Form231Index = 2; + const unsigned FormsNum = 3; + + bool IsIntrinOpcode; + isFMA3(Opc, &IsIntrinOpcode); + + size_t GroupsNum; + const unsigned (*OpcodeGroups)[3]; + if (IsIntrinOpcode) { + GroupsNum = array_lengthof(IntrinOpcodeGroups); + OpcodeGroups = IntrinOpcodeGroups; + } else { + GroupsNum = array_lengthof(RegularOpcodeGroups); + OpcodeGroups = RegularOpcodeGroups; + } + + const unsigned *FoundOpcodesGroup = nullptr; + size_t FormIndex; + + // Look for the input opcode in the corresponding opcodes table. + for (size_t GroupIndex = 0; GroupIndex < GroupsNum && !FoundOpcodesGroup; + ++GroupIndex) { + for (FormIndex = 0; FormIndex < FormsNum; ++FormIndex) { + if (OpcodeGroups[GroupIndex][FormIndex] == Opc) { + FoundOpcodesGroup = OpcodeGroups[GroupIndex]; + break; + } + } } + + // The input opcode does not match with any of the opcodes from the tables. + // The unsupported FMA opcode must be added to one of the two opcode groups + // defined above. 
+ assert(FoundOpcodesGroup != nullptr && "Unexpected FMA3 opcode"); + + // Put the lowest index to SrcOpIdx1 to simplify the checks below. + if (SrcOpIdx1 > SrcOpIdx2) + std::swap(SrcOpIdx1, SrcOpIdx2); + + // TODO: Commuting the 1st operand of FMA*_Int requires some additional + // analysis. The commute optimization is legal only if all users of FMA*_Int + // use only the lowest element of the FMA*_Int instruction. Such analysis are + // not implemented yet. So, just return 0 in that case. + // When such analysis are available this place will be the right place for + // calling it. + if (IsIntrinOpcode && SrcOpIdx1 == 1) + return 0; + + unsigned Case; + if (SrcOpIdx1 == 1 && SrcOpIdx2 == 2) + Case = 0; + else if (SrcOpIdx1 == 1 && SrcOpIdx2 == 3) + Case = 1; + else if (SrcOpIdx1 == 2 && SrcOpIdx2 == 3) + Case = 2; + else + return 0; + + // Define the FMA forms mapping array that helps to map input FMA form + // to output FMA form to preserve the operation semantics after + // commuting the operands. + static const unsigned FormMapping[][3] = { + // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2; + // FMA132 A, C, b; ==> FMA231 C, A, b; + // FMA213 B, A, c; ==> FMA213 A, B, c; + // FMA231 C, A, b; ==> FMA132 A, C, b; + { Form231Index, Form213Index, Form132Index }, + // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3; + // FMA132 A, c, B; ==> FMA132 B, c, A; + // FMA213 B, a, C; ==> FMA231 C, a, B; + // FMA231 C, a, B; ==> FMA213 B, a, C; + { Form132Index, Form231Index, Form213Index }, + // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3; + // FMA132 a, C, B; ==> FMA213 a, B, C; + // FMA213 b, A, C; ==> FMA132 b, C, A; + // FMA231 c, A, B; ==> FMA231 c, B, A; + { Form213Index, Form132Index, Form231Index } + }; + + // Everything is ready, just adjust the FMA opcode and return it. + FormIndex = FormMapping[Case][FormIndex]; + return FoundOpcodesGroup[FormIndex]; } -bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, +bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const { switch (MI->getOpcode()) { case X86::CMPPDrri: @@ -3141,46 +3631,22 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, // Ordered/Unordered/Equal/NotEqual tests unsigned Imm = MI->getOperand(3).getImm() & 0x7; switch (Imm) { - case 0x00: // EQUAL - case 0x03: // UNORDERED - case 0x04: // NOT EQUAL - case 0x07: // ORDERED - SrcOpIdx1 = 1; - SrcOpIdx2 = 2; - return true; + case 0x00: // EQUAL + case 0x03: // UNORDERED + case 0x04: // NOT EQUAL + case 0x07: // ORDERED + // The indices of the commutable operands are 1 and 2. + // Assign them to the returned operand indices here. 
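// Illustrative sketch, not part of the LLVM change: the FormMapping table
// earlier in this hunk, reduced to a standalone function. Commuting two of
// the three source operands stays legal provided the 132/213/231 form is
// remapped so the same product and addend are computed (the enum and function
// names are ours):
#include <cassert>

enum FmaForm { Form132, Form213, Form231 };

static FmaForm commuteFmaForm(FmaForm Form, unsigned OpIdx1, unsigned OpIdx2) {
  static const FmaForm FormMapping[3][3] = {
    { Form231, Form213, Form132 },   // operands 1 and 2 swapped
    { Form132, Form231, Form213 },   // operands 1 and 3 swapped
    { Form213, Form132, Form231 },   // operands 2 and 3 swapped
  };
  unsigned Case = (OpIdx1 == 1 && OpIdx2 == 2) ? 0
                : (OpIdx1 == 1 && OpIdx2 == 3) ? 1
                                               : 2;  // OpIdx1 == 2, OpIdx2 == 3
  return FormMapping[Case][Form];
}

int main() {
  assert(commuteFmaForm(Form132, 1, 2) == Form231);  // FMA132 A,C,b  ->  FMA231 C,A,b
  assert(commuteFmaForm(Form213, 1, 3) == Form231);  // FMA213 B,a,C  ->  FMA231 C,a,B
  assert(commuteFmaForm(Form231, 2, 3) == Form231);  // FMA231 c,A,B  ->  FMA231 c,B,A
  return 0;
}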
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2); } return false; } - case X86::VFMADDPDr231r: - case X86::VFMADDPSr231r: - case X86::VFMADDSDr231r: - case X86::VFMADDSSr231r: - case X86::VFMSUBPDr231r: - case X86::VFMSUBPSr231r: - case X86::VFMSUBSDr231r: - case X86::VFMSUBSSr231r: - case X86::VFNMADDPDr231r: - case X86::VFNMADDPSr231r: - case X86::VFNMADDSDr231r: - case X86::VFNMADDSSr231r: - case X86::VFNMSUBPDr231r: - case X86::VFNMSUBPSr231r: - case X86::VFNMSUBSDr231r: - case X86::VFNMSUBSSr231r: - case X86::VFMADDPDr231rY: - case X86::VFMADDPSr231rY: - case X86::VFMSUBPDr231rY: - case X86::VFMSUBPSr231rY: - case X86::VFNMADDPDr231rY: - case X86::VFNMADDPSr231rY: - case X86::VFNMSUBPDr231rY: - case X86::VFNMSUBPSr231rY: - SrcOpIdx1 = 2; - SrcOpIdx2 = 3; - return true; default: + if (isFMA3(MI->getOpcode())) + return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); } + return false; } static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) { @@ -3821,15 +4287,58 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, return 0; } -inline static bool MaskRegClassContains(unsigned Reg) { +static bool MaskRegClassContains(unsigned Reg) { return X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) || X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg) || X86::VK1RegClass.contains(Reg); } + +static bool GRRegClassContains(unsigned Reg) { + return X86::GR64RegClass.contains(Reg) || + X86::GR32RegClass.contains(Reg) || + X86::GR16RegClass.contains(Reg) || + X86::GR8RegClass.contains(Reg); +} +static +unsigned copyPhysRegOpcode_AVX512_DQ(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && X86::GR8RegClass.contains(DestReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); + return X86::KMOVBrk; + } + if (MaskRegClassContains(DestReg) && X86::GR8RegClass.contains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); + return X86::KMOVBkr; + } + return 0; +} + static -unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { +unsigned copyPhysRegOpcode_AVX512_BW(unsigned& DestReg, unsigned& SrcReg) { + if (MaskRegClassContains(SrcReg) && MaskRegClassContains(DestReg)) + return X86::KMOVQkk; + if (MaskRegClassContains(SrcReg) && X86::GR32RegClass.contains(DestReg)) + return X86::KMOVDrk; + if (MaskRegClassContains(SrcReg) && X86::GR64RegClass.contains(DestReg)) + return X86::KMOVQrk; + if (MaskRegClassContains(DestReg) && X86::GR32RegClass.contains(SrcReg)) + return X86::KMOVDkr; + if (MaskRegClassContains(DestReg) && X86::GR64RegClass.contains(SrcReg)) + return X86::KMOVQkr; + return 0; +} + +static +unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg, + const X86Subtarget &Subtarget) +{ + if (Subtarget.hasDQI()) + if (auto Opc = copyPhysRegOpcode_AVX512_DQ(DestReg, SrcReg)) + return Opc; + if (Subtarget.hasBWI()) + if (auto Opc = copyPhysRegOpcode_AVX512_BW(DestReg, SrcReg)) + return Opc; if (X86::VR128XRegClass.contains(DestReg, SrcReg) || X86::VR256XRegClass.contains(DestReg, SrcReg) || X86::VR512RegClass.contains(DestReg, SrcReg)) { @@ -3837,21 +4346,14 @@ unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { SrcReg = get512BitSuperRegister(SrcReg); return X86::VMOVAPSZrr; } - if (MaskRegClassContains(DestReg) && - MaskRegClassContains(SrcReg)) + if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) return X86::KMOVWkk; - if (MaskRegClassContains(DestReg) && - 
(X86::GR32RegClass.contains(SrcReg) || - X86::GR16RegClass.contains(SrcReg) || - X86::GR8RegClass.contains(SrcReg))) { - SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32); + if (MaskRegClassContains(DestReg) && GRRegClassContains(SrcReg)) { + SrcReg = getX86SubSuperRegister(SrcReg, 32); return X86::KMOVWkr; } - if ((X86::GR32RegClass.contains(DestReg) || - X86::GR16RegClass.contains(DestReg) || - X86::GR8RegClass.contains(DestReg)) && - MaskRegClassContains(SrcReg)) { - DestReg = getX86SubSuperRegister(DestReg, MVT::i32); + if (GRRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) { + DestReg = getX86SubSuperRegister(DestReg, 32); return X86::KMOVWrk; } return 0; @@ -3886,7 +4388,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (X86::VR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MMX_MOVQ64rr; else if (HasAVX512) - Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg); + Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg, Subtarget); else if (X86::VR128RegClass.contains(DestReg, SrcReg)) Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; else if (X86::VR256RegClass.contains(DestReg, SrcReg)) @@ -3900,34 +4402,86 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - // Moving EFLAGS to / from another register requires a push and a pop. - // Notice that we have to adjust the stack if we don't want to clobber the - // first frame index. See X86FrameLowering.cpp - clobbersTheStack. - if (SrcReg == X86::EFLAGS) { - if (X86::GR64RegClass.contains(DestReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSHF64)); - BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg); + bool FromEFLAGS = SrcReg == X86::EFLAGS; + bool ToEFLAGS = DestReg == X86::EFLAGS; + int Reg = FromEFLAGS ? DestReg : SrcReg; + bool is32 = X86::GR32RegClass.contains(Reg); + bool is64 = X86::GR64RegClass.contains(Reg); + + if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) { + int Mov = is64 ? X86::MOV64rr : X86::MOV32rr; + int Push = is64 ? X86::PUSH64r : X86::PUSH32r; + int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32; + int Pop = is64 ? X86::POP64r : X86::POP32r; + int PopF = is64 ? X86::POPF64 : X86::POPF32; + int AX = is64 ? X86::RAX : X86::EAX; + + if (!Subtarget.hasLAHFSAHF()) { + assert(Subtarget.is64Bit() && + "Not having LAHF/SAHF only happens on 64-bit."); + // Moving EFLAGS to / from another register requires a push and a pop. + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - usesTheStack. + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(PushF)); + BuildMI(MBB, MI, DL, get(Pop), DestReg); + } + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Push)) + .addReg(SrcReg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(PopF)); + } return; } - if (X86::GR32RegClass.contains(DestReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSHF32)); - BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg); - return; + + // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is + // inefficient. Instead: + // - Save the overflow flag OF into AL using SETO, and restore it using a + // signed 8-bit addition of AL and INT8_MAX. + // - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH + // using LAHF/SAHF. + // - When RAX/EAX is live and isn't the destination register, make sure it + // isn't clobbered by PUSH/POP'ing it before and after saving/restoring + // the flags. + // This approach is ~2.25x faster than using PUSHF/POPF. + // + // This is still somewhat inefficient because we don't know which flags are + // actually live inside EFLAGS. 
Were we able to do a single SETcc instead of + // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster. + // + // PUSHF/POPF is also potentially incorrect because it affects other flags + // such as TF/IF/DF, which LLVM doesn't model. + // + // Notice that we have to adjust the stack if we don't want to clobber the + // first frame index. See X86FrameLowering.cpp - usesTheStack. + + + bool AXDead = (Reg == AX) || + (MachineBasicBlock::LQR_Dead == + MBB.computeRegisterLiveness(&getRegisterInfo(), AX, MI)); + if (!AXDead) { + // FIXME: If computeRegisterLiveness() reported LQR_Unknown then AX may + // actually be dead. This is not a problem for correctness as we are just + // (unnecessarily) saving+restoring a dead register. However the + // MachineVerifier expects operands that read from dead registers + // to be marked with the "undef" flag. + BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true)); } - } - if (DestReg == X86::EFLAGS) { - if (X86::GR64RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSH64r)) - .addReg(SrcReg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(X86::POPF64)); - return; + if (FromEFLAGS) { + BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL); + BuildMI(MBB, MI, DL, get(X86::LAHF)); + BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX); } - if (X86::GR32RegClass.contains(SrcReg)) { - BuildMI(MBB, MI, DL, get(X86::PUSH32r)) - .addReg(SrcReg, getKillRegState(KillSrc)); - BuildMI(MBB, MI, DL, get(X86::POPF32)); - return; + if (ToEFLAGS) { + BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc)); + BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL) + .addReg(X86::AL) + .addImm(INT8_MAX); + BuildMI(MBB, MI, DL, get(X86::SAHF)); } + if (!AXDead) + BuildMI(MBB, MI, DL, get(Pop), AX); + return; } DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) @@ -4602,9 +5156,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // live-out. If it is live-out, do not optimize. if ((IsCmpZero || IsSwapped) && !IsSafe) { MachineBasicBlock *MBB = CmpInstr->getParent(); - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - SE = MBB->succ_end(); SI != SE; ++SI) - if ((*SI)->isLiveIn(X86::EFLAGS)) + for (MachineBasicBlock *Successor : MBB->successors()) + if (Successor->isLiveIn(X86::EFLAGS)) return false; } @@ -4645,8 +5198,8 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, CmpInstr->eraseFromParent(); // Modify the condition code of instructions in OpsToUpdate. - for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++) - OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second)); + for (auto &Op : OpsToUpdate) + Op.first->setDesc(get(Op.second)); return true; } @@ -4694,8 +5247,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, return nullptr; // Check whether we can fold the def into SrcOperandId. - MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI); - if (FoldMI) { + if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandId, DefMI)) { FoldAsLoadDefReg = 0; return FoldMI; } @@ -4725,6 +5277,82 @@ static bool Expand2AddrUndef(MachineInstrBuilder &MIB, return true; } +/// Expand a single-def pseudo instruction to a two-addr +/// instruction with two %k0 reads. 
+/// This is used for mapping: +/// %k4 = K_SET1 +/// to: +/// %k4 = KXNORrr %k0, %k0 +static bool Expand2AddrKreg(MachineInstrBuilder &MIB, + const MCInstrDesc &Desc, unsigned Reg) { + assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); + MIB->setDesc(Desc); + MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + return true; +} + +static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII, + bool MinusOne) { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + unsigned Reg = MIB->getOperand(0).getReg(); + + // Insert the XOR. + BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + + // Turn the pseudo into an INC or DEC. + MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r)); + MIB.addReg(Reg); + + return true; +} + +bool X86InstrInfo::ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const { + MachineBasicBlock &MBB = *MIB->getParent(); + DebugLoc DL = MIB->getDebugLoc(); + int64_t Imm = MIB->getOperand(1).getImm(); + assert(Imm != 0 && "Using push/pop for 0 is not efficient."); + MachineBasicBlock::iterator I = MIB.getInstr(); + + int StackAdjustment; + + if (Subtarget.is64Bit()) { + assert(MIB->getOpcode() == X86::MOV64ImmSExti8 || + MIB->getOpcode() == X86::MOV32ImmSExti8); + // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and + // widen the register if necessary. + StackAdjustment = 8; + BuildMI(MBB, I, DL, get(X86::PUSH64i8)).addImm(Imm); + MIB->setDesc(get(X86::POP64r)); + MIB->getOperand(0) + .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64)); + } else { + assert(MIB->getOpcode() == X86::MOV32ImmSExti8); + StackAdjustment = 4; + BuildMI(MBB, I, DL, get(X86::PUSH32i8)).addImm(Imm); + MIB->setDesc(get(X86::POP32r)); + } + + // Build CFI if necessary. + MachineFunction &MF = *MBB.getParent(); + const X86FrameLowering *TFL = Subtarget.getFrameLowering(); + bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool NeedsDwarfCFI = + !IsWin64Prologue && + (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry()); + bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI; + if (EmitCFI) { + TFL->BuildCFI(MBB, I, DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment)); + TFL->BuildCFI(MBB, std::next(I), DL, + MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment)); + } + + return true; +} + // LoadStackGuard has so far only been implemented for 64-bit MachO. Different // code sequence is needed for other targets. 
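// Illustrative sketch, not part of the LLVM change: the sequences the
// MOV32r1 / MOV32r_1 pseudos above expand to, written as x86-64 GNU inline
// asm (function names ours). xor+inc / xor+dec avoid the 32-bit immediate of
// "mov $1, %eax" (4 bytes instead of 5); the xor clobbers EFLAGS, which is why
// the rematerialization path earlier in this file falls back to a plain
// MOV32ri when EFLAGS cannot be clobbered.
#include <cstdio>

static int one() {
  int r;
  asm("xorl %0, %0\n\t"
      "incl %0"                       // MOV32r1  ->  xor reg,reg ; inc reg
      : "=r"(r) : : "cc");
  return r;
}

static int minusOne() {
  int r;
  asm("xorl %0, %0\n\t"
      "decl %0"                       // MOV32r_1 ->  xor reg,reg ; dec reg
      : "=r"(r) : : "cc");
  return r;
}

int main() { std::printf("%d %d\n", one(), minusOne()); return 0; }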
static void expandLoadStackGuard(MachineInstrBuilder &MIB, @@ -4735,8 +5363,8 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, const GlobalValue *GV = cast<GlobalValue>((*MIB->memoperands_begin())->getValue()); unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; - MachineMemOperand *MMO = MBB.getParent()-> - getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8); + MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand( + MachinePointerInfo::getGOT(*MBB.getParent()), Flag, 8, 8); MachineBasicBlock::iterator I = MIB.getInstr(); BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1) @@ -4753,6 +5381,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { switch (MI->getOpcode()) { case X86::MOV32r0: return Expand2AddrUndef(MIB, get(X86::XOR32rr)); + case X86::MOV32r1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ false); + case X86::MOV32r_1: + return expandMOV32r1(MIB, *this, /*MinusOne=*/ true); + case X86::MOV32ImmSExti8: + case X86::MOV64ImmSExti8: + return ExpandMOVImmSExti8(MIB); case X86::SETB_C8r: return Expand2AddrUndef(MIB, get(X86::SBB8rr)); case X86::SETB_C16r: @@ -4777,10 +5412,22 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; + + // KNL does not recognize dependency-breaking idioms for mask registers, + // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1. + // Using %k0 as the undef input register is a performance heuristic based + // on the assumption that %k0 is used less frequently than the other mask + // registers, since it is not usable as a write mask. + // FIXME: A more advanced approach would be to choose the best input mask + // register based on context. case X86::KSET0B: - case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr)); + case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0); + case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0); + case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0); case X86::KSET1B: - case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr)); + case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0); + case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0); + case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0); case TargetOpcode::LOAD_STACK_GUARD: expandLoadStackGuard(MIB, *this); return true; @@ -4788,12 +5435,28 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { return false; } -static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs) { +static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs, + int PtrOffset = 0) { unsigned NumAddrOps = MOs.size(); - for (unsigned i = 0; i != NumAddrOps; ++i) - MIB.addOperand(MOs[i]); - if (NumAddrOps < 4) // FrameIndex only - addOffset(MIB, 0); + + if (NumAddrOps < 4) { + // FrameIndex only - add an immediate offset (whether its zero or not). + for (unsigned i = 0; i != NumAddrOps; ++i) + MIB.addOperand(MOs[i]); + addOffset(MIB, PtrOffset); + } else { + // General Memory Addressing - we need to add any offset to an existing + // offset. 
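// Illustrative sketch, not part of the LLVM change: why the KSET0*/KSET1*
// pseudos above can be expanded to kxor/kxnor with both inputs being the same
// (undef) register - the result does not depend on that register's contents,
// and %k0 is only a heuristic choice. A plain integer stand-in (names ours):
#include <cassert>
#include <cstdint>

int main() {
  uint16_t k = 0xBEEF;                                 // arbitrary contents of %k0
  assert(static_cast<uint16_t>(k ^ k) == 0x0000);      // KSET0W -> KXORWrr  %k0, %k0
  assert(static_cast<uint16_t>(~(k ^ k)) == 0xFFFF);   // KSET1W -> KXNORWrr %k0, %k0
  return 0;
}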
+ assert(MOs.size() == 5 && "Unexpected memory operand list length"); + for (unsigned i = 0; i != NumAddrOps; ++i) { + const MachineOperand &MO = MOs[i]; + if (i == 3 && PtrOffset != 0) { + MIB.addDisp(MO, PtrOffset); + } else { + MIB.addOperand(MO); + } + } + } } static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, @@ -4828,7 +5491,8 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, - MachineInstr *MI, const TargetInstrInfo &TII) { + MachineInstr *MI, const TargetInstrInfo &TII, + int PtrOffset = 0) { // Omit the implicit operands, something BuildMI can't do. MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), MI->getDebugLoc(), true); @@ -4838,7 +5502,7 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, MachineOperand &MO = MI->getOperand(i); if (i == OpNo) { assert(MO.isReg() && "Expected to fold into reg operand!"); - addOperands(MIB, MOs); + addOperands(MIB, MOs, PtrOffset); } else { MIB.addOperand(MO); } @@ -4860,6 +5524,40 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, return MIB.addImm(0); } +MachineInstr *X86InstrInfo::foldMemoryOperandCustom( + MachineFunction &MF, MachineInstr *MI, unsigned OpNum, + ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const { + switch (MI->getOpcode()) { + case X86::INSERTPSrr: + case X86::VINSERTPSrr: + // Attempt to convert the load of inserted vector into a fold load + // of a single float. + if (OpNum == 2) { + unsigned Imm = MI->getOperand(MI->getNumOperands() - 1).getImm(); + unsigned ZMask = Imm & 15; + unsigned DstIdx = (Imm >> 4) & 3; + unsigned SrcIdx = (Imm >> 6) & 3; + + unsigned RCSize = getRegClass(MI->getDesc(), OpNum, &RI, MF)->getSize(); + if (Size <= RCSize && 4 <= Align) { + int PtrOffset = SrcIdx * 4; + unsigned NewImm = (DstIdx << 4) | ZMask; + unsigned NewOpCode = + (MI->getOpcode() == X86::VINSERTPSrr ? X86::VINSERTPSrm + : X86::INSERTPSrm); + MachineInstr *NewMI = + FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset); + NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm); + return NewMI; + } + } + break; + }; + + return nullptr; +} + MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr *MI, unsigned OpNum, ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt, @@ -4869,10 +5567,13 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( bool isCallRegIndirect = Subtarget.callRegIndirect(); bool isTwoAddrFold = false; - // For CPUs that favor the register form of a call, - // do not fold loads into calls. - if (isCallRegIndirect && - (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) + // For CPUs that favor the register form of a call or push, + // do not fold loads into calls or pushes, unless optimizing for size + // aggressively. + if (isCallRegIndirect && !MF.getFunction()->optForMinSize() && + (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r || + MI->getOpcode() == X86::PUSH16r || MI->getOpcode() == X86::PUSH32r || + MI->getOpcode() == X86::PUSH64r)) return nullptr; unsigned NumOps = MI->getDesc().getNumOperands(); @@ -4886,6 +5587,12 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( return nullptr; MachineInstr *NewMI = nullptr; + + // Attempt to fold any custom cases we have. 
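// Illustrative sketch, not part of the LLVM change: the INSERTPS immediate
// layout that foldMemoryOperandCustom above relies on - bits [3:0] are the
// zero mask, bits [5:4] the destination lane, bits [7:6] the source lane.
// When the vector operand is folded into a 4-byte load, the address is
// advanced by SrcIdx*4 and the source-lane field is dropped (variable names
// mirror the code above):
#include <cassert>

int main() {
  unsigned Imm       = 0x90;             // take source lane 2, write destination lane 1
  unsigned ZMask     = Imm & 15;         // 0
  unsigned DstIdx    = (Imm >> 4) & 3;   // 1
  unsigned SrcIdx    = (Imm >> 6) & 3;   // 2
  int      PtrOffset = SrcIdx * 4;       // load from 8 bytes into the original vector
  unsigned NewImm    = (DstIdx << 4) | ZMask;
  assert(PtrOffset == 8 && NewImm == 0x10);
  return 0;
}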
+ if (MachineInstr *CustomMI = + foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)) + return CustomMI; + // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. @@ -4963,60 +5670,56 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // If the instruction and target operand are commutable, commute the // instruction and try again. if (AllowCommute) { - unsigned OriginalOpIdx = OpNum, CommuteOpIdx1, CommuteOpIdx2; + unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex; if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { bool HasDef = MI->getDesc().getNumDefs(); unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg(); unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg(); - bool Tied0 = - 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); bool Tied1 = + 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); + bool Tied2 = 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); // If either of the commutable operands are tied to the destination // then we can not commute + fold. - if ((HasDef && Reg0 == Reg1 && Tied0) || - (HasDef && Reg0 == Reg2 && Tied1)) + if ((HasDef && Reg0 == Reg1 && Tied1) || + (HasDef && Reg0 == Reg2 && Tied2)) return nullptr; - if ((CommuteOpIdx1 == OriginalOpIdx) || - (CommuteOpIdx2 == OriginalOpIdx)) { - MachineInstr *CommutedMI = commuteInstruction(MI, false); - if (!CommutedMI) { - // Unable to commute. - return nullptr; - } - if (CommutedMI != MI) { - // New instruction. We can't fold from this. - CommutedMI->eraseFromParent(); - return nullptr; - } + MachineInstr *CommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!CommutedMI) { + // Unable to commute. + return nullptr; + } + if (CommutedMI != MI) { + // New instruction. We can't fold from this. + CommutedMI->eraseFromParent(); + return nullptr; + } - // Attempt to fold with the commuted version of the instruction. - unsigned CommuteOp = - (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1); - NewMI = - foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, InsertPt, Size, Align, - /*AllowCommute=*/false); - if (NewMI) - return NewMI; - - // Folding failed again - undo the commute before returning. - MachineInstr *UncommutedMI = commuteInstruction(MI, false); - if (!UncommutedMI) { - // Unable to commute. - return nullptr; - } - if (UncommutedMI != MI) { - // New instruction. It doesn't need to be kept. - UncommutedMI->eraseFromParent(); - return nullptr; - } + // Attempt to fold with the commuted version of the instruction. + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, + Size, Align, /*AllowCommute=*/false); + if (NewMI) + return NewMI; - // Return here to prevent duplicate fuse failure report. + // Folding failed again - undo the commute before returning. + MachineInstr *UncommutedMI = + commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2); + if (!UncommutedMI) { + // Unable to commute. return nullptr; } + if (UncommutedMI != MI) { + // New instruction. It doesn't need to be kept. + UncommutedMI->eraseFromParent(); + return nullptr; + } + + // Return here to prevent duplicate fuse failure report. 
+ return nullptr; } } @@ -5208,13 +5911,14 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, // If MI kills this register, the false dependence is already broken. if (MI->killsRegister(Reg, TRI)) return; + if (X86::VR128RegClass.contains(Reg)) { // These instructions are all floating point domain, so xorps is the best // choice. - bool HasAVX = Subtarget.hasAVX(); - unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr; + unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); + MI->addRegisterKilled(Reg, TRI, true); } else if (X86::VR256RegClass.contains(Reg)) { // Use vxorps to clear the full ymm register. // It wants to read and write the xmm sub-register. @@ -5222,21 +5926,20 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg) .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef) .addReg(Reg, RegState::ImplicitDefine); - } else - return; - MI->addRegisterKilled(Reg, TRI, true); + MI->addRegisterKilled(Reg, TRI, true); + } } MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex) const { // Check switch flag - if (NoFusing) return nullptr; + if (NoFusing) + return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls - if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && - hasPartialRegUpdate(MI->getOpcode())) + if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) return nullptr; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -5303,6 +6006,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: + case X86::VFMADDSSr132r_Int: case X86::VFNMADDSSr132r_Int: + case X86::VFMADDSSr213r_Int: case X86::VFNMADDSSr213r_Int: + case X86::VFMADDSSr231r_Int: case X86::VFNMADDSSr231r_Int: + case X86::VFMSUBSSr132r_Int: case X86::VFNMSUBSSr132r_Int: + case X86::VFMSUBSSr213r_Int: case X86::VFNMSUBSSr213r_Int: + case X86::VFMSUBSSr231r_Int: case X86::VFNMSUBSSr231r_Int: return false; default: return true; @@ -5318,6 +6027,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: + case X86::VFMADDSDr132r_Int: case X86::VFNMADDSDr132r_Int: + case X86::VFMADDSDr213r_Int: case X86::VFNMADDSDr213r_Int: + case X86::VFMADDSDr231r_Int: case X86::VFNMADDSDr231r_Int: + case X86::VFMSUBSDr132r_Int: case X86::VFNMSUBSDr132r_Int: + case X86::VFMSUBSDr213r_Int: case X86::VFNMSUBSDr213r_Int: + case X86::VFMSUBSDr231r_Int: case X86::VFNMSUBSDr231r_Int: return false; default: return true; @@ -5342,10 +6057,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( // Check switch flag if (NoFusing) return nullptr; - // Unless optimizing for size, don't fold to avoid partial - // register update stalls - if (!MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) && - hasPartialRegUpdate(MI->getOpcode())) + // Avoid partial register update stalls unless optimizing for size. 
+ if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI->getOpcode())) return nullptr; // Determine the alignment of the load. @@ -5460,62 +6173,6 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( /*Size=*/0, Alignment, /*AllowCommute=*/true); } -bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, - ArrayRef<unsigned> Ops) const { - // Check switch flag - if (NoFusing) return 0; - - if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { - switch (MI->getOpcode()) { - default: return false; - case X86::TEST8rr: - case X86::TEST16rr: - case X86::TEST32rr: - case X86::TEST64rr: - return true; - case X86::ADD32ri: - // FIXME: AsmPrinter doesn't know how to handle - // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. - if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) - return false; - break; - } - } - - if (Ops.size() != 1) - return false; - - unsigned OpNum = Ops[0]; - unsigned Opc = MI->getOpcode(); - unsigned NumOps = MI->getDesc().getNumOperands(); - bool isTwoAddr = NumOps > 1 && - MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; - - // Folding a memory location into the two-address part of a two-address - // instruction is different than folding it other places. It requires - // replacing the *two* registers with the memory location. - const DenseMap<unsigned, - std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr; - if (isTwoAddr && NumOps >= 2 && OpNum < 2) { - OpcodeTablePtr = &RegOp2MemOpTable2Addr; - } else if (OpNum == 0) { - if (Opc == X86::MOV32r0) - return true; - - OpcodeTablePtr = &RegOp2MemOpTable0; - } else if (OpNum == 1) { - OpcodeTablePtr = &RegOp2MemOpTable1; - } else if (OpNum == 2) { - OpcodeTablePtr = &RegOp2MemOpTable2; - } else if (OpNum == 3) { - OpcodeTablePtr = &RegOp2MemOpTable3; - } - - if (OpcodeTablePtr && OpcodeTablePtr->count(Opc)) - return true; - return TargetInstrInfo::canFoldMemoryOperand(MI, Ops); -} - bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl<MachineInstr*> &NewMIs) const { @@ -5536,9 +6193,10 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, const MCInstrDesc &MCID = get(Opc); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); + // TODO: Check if 32-byte or greater accesses are slow too? if (!MI->hasOneMemOperand() && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Without memoperands, loadRegFromAddr and storeRegToStackSlot will // conservatively assume the address is unaligned. That's bad for // performance. 
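As an aside on the (V)INSERTPSrr case handled by foldMemoryOperandCustom earlier in this hunk: the standalone sketch below (illustrative code, not part of the patch) mirrors the immediate arithmetic that turns a folded vector load into a single-float load plus a rewritten immediate.

// Mirrors the INSERTPS immediate handling shown above. Field layout of the
// SSE4.1 INSERTPS immediate: [7:6] source element, [5:4] destination element,
// [3:0] zero mask.
#include <cassert>
#include <cstdio>

struct InsertPSFold {
  int PtrOffset;    // byte offset folded into the load address
  unsigned NewImm;  // immediate used by the memory form
};

static InsertPSFold foldInsertPSImm(unsigned Imm) {
  unsigned ZMask  = Imm & 15;        // imm[3:0]: lanes to zero
  unsigned DstIdx = (Imm >> 4) & 3;  // imm[5:4]: destination element
  unsigned SrcIdx = (Imm >> 6) & 3;  // imm[7:6]: source element
  // The memory form always inserts the loaded scalar, so the source index
  // becomes a byte offset on the address and drops out of the immediate.
  return { static_cast<int>(SrcIdx * 4), (DstIdx << 4) | ZMask };
}

int main() {
  // Imm = 0x91: take source element 2, write destination element 1, zero lane 0.
  InsertPSFold F = foldInsertPSImm(0x91);
  assert(F.PtrOffset == 8 && F.NewImm == 0x11);
  std::printf("offset=%d newimm=0x%X\n", F.PtrOffset, F.NewImm);
  return 0;
}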
@@ -5582,20 +6240,19 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, if (FoldedStore) MIB.addReg(Reg, RegState::Define); - for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i) - MIB.addOperand(BeforeOps[i]); + for (MachineOperand &BeforeOp : BeforeOps) + MIB.addOperand(BeforeOp); if (FoldedLoad) MIB.addReg(Reg); - for (unsigned i = 0, e = AfterOps.size(); i != e; ++i) - MIB.addOperand(AfterOps[i]); - for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) { - MachineOperand &MO = ImpOps[i]; - MIB.addReg(MO.getReg(), - getDefRegState(MO.isDef()) | + for (MachineOperand &AfterOp : AfterOps) + MIB.addOperand(AfterOp); + for (MachineOperand &ImpOp : ImpOps) { + MIB.addReg(ImpOp.getReg(), + getDefRegState(ImpOp.isDef()) | RegState::Implicit | - getKillRegState(MO.isKill()) | - getDeadRegState(MO.isDead()) | - getUndefRegState(MO.isUndef())); + getKillRegState(ImpOp.isKill()) | + getDeadRegState(ImpOp.isDead()) | + getUndefRegState(ImpOp.isUndef())); } // Change CMP32ri r, 0 back to TEST32rr r, r, etc. switch (DataMI->getOpcode()) { @@ -5686,9 +6343,11 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Do not introduce a slow unaligned load. return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; @@ -5729,9 +6388,11 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, cast<MachineSDNode>(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && - !Subtarget.isUnalignedMemAccessFast()) + Subtarget.isUnalignedMem16Slow()) // Do not introduce a slow unaligned store. return false; + // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte + // memory access is slow above. unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; @@ -6192,16 +6853,16 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { // domains, but they require a bit more work than just switching opcodes. 
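The lookup()/lookupAVX2 helpers that follow walk fixed three-column opcode tables. The standalone sketch below uses hypothetical opcode numbers to show how a row is located and how the equivalent opcode for another execution domain is read out of it.

// Standalone illustration of the three-column domain tables; the opcode
// values are made up. In the real tables each row holds the single-, double-
// and integer-domain encodings of one operation, and column (domain - 1) is
// matched against the current opcode.
#include <cstdint>
#include <cstdio>

static const uint16_t Replaceable[][3] = {
  // single, double, integer domain (hypothetical opcode numbers)
  { 100, 200, 300 },
  { 101, 201, 301 },
};

static const uint16_t *lookupRow(unsigned Opcode, unsigned Domain) {
  for (const uint16_t (&Row)[3] : Replaceable)
    if (Row[Domain - 1] == Opcode)
      return Row;
  return nullptr;
}

int main() {
  // Opcode 101 in domain 1 has an integer-domain twin in column 2 of its row.
  if (const uint16_t *Row = lookupRow(101, /*Domain=*/1))
    std::printf("integer-domain twin: %u\n", unsigned(Row[3 - 1]));  // 301
  return 0;
}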
static const uint16_t *lookup(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) - if (ReplaceableInstrs[i][domain-1] == opcode) - return ReplaceableInstrs[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrs) + if (Row[domain-1] == opcode) + return Row; return nullptr; } static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { - for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) - if (ReplaceableInstrsAVX2[i][domain-1] == opcode) - return ReplaceableInstrsAVX2[i]; + for (const uint16_t (&Row)[3] : ReplaceableInstrsAVX2) + if (Row[domain-1] == opcode) + return Row; return nullptr; } @@ -6347,230 +7008,181 @@ hasHighOperandLatency(const TargetSchedModel &SchedModel, return isHighLatencyDef(DefMI->getOpcode()); } -static bool hasVirtualRegDefsInBasicBlock(const MachineInstr &Inst, - const MachineBasicBlock *MBB) { - assert(Inst.getNumOperands() == 3 && "Reassociation needs binary operators"); - const MachineOperand &Op1 = Inst.getOperand(1); - const MachineOperand &Op2 = Inst.getOperand(2); - const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - - // We need virtual register definitions. - MachineInstr *MI1 = nullptr; - MachineInstr *MI2 = nullptr; - if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg())) - MI1 = MRI.getUniqueVRegDef(Op1.getReg()); - if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg())) - MI2 = MRI.getUniqueVRegDef(Op2.getReg()); - - // And they need to be in the trace (otherwise, they won't have a depth). - if (MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB) - return true; +bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst, + const MachineBasicBlock *MBB) const { + assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) && + "Reassociation needs binary operators"); + + // Integer binary math/logic instructions have a third source operand: + // the EFLAGS register. That operand must be both defined here and never + // used; ie, it must be dead. If the EFLAGS operand is live, then we can + // not change anything because rearranging the operands could affect other + // instructions that depend on the exact status flags (zero, sign, etc.) + // that are set by using these particular operands with this operation. + if (Inst.getNumOperands() == 4) { + assert(Inst.getOperand(3).isReg() && + Inst.getOperand(3).getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + if (!Inst.getOperand(3).isDead()) + return false; + } - return false; -} - -static bool hasReassocSibling(const MachineInstr &Inst, bool &Commuted) { - const MachineBasicBlock *MBB = Inst.getParent(); - const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg()); - MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg()); - unsigned AssocOpcode = Inst.getOpcode(); - - // If only one operand has the same opcode and it's the second source operand, - // the operands must be commuted. - Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode; - if (Commuted) - std::swap(MI1, MI2); - - // 1. The previous instruction must be the same type as Inst. - // 2. The previous instruction must have virtual register definitions for its - // operands in the same basic block as Inst. - // 3. The previous instruction's result must only be used by Inst. 
- if (MI1->getOpcode() == AssocOpcode && - hasVirtualRegDefsInBasicBlock(*MI1, MBB) && - MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg())) - return true; - - return false; + return TargetInstrInfo::hasReassociableOperands(Inst, MBB); } // TODO: There are many more machine instruction opcodes to match: // 1. Other data types (integer, vectors) -// 2. Other math / logic operations (and, or) -static bool isAssociativeAndCommutative(unsigned Opcode) { - switch (Opcode) { +// 2. Other math / logic operations (xor, or) +// 3. Other forms of the same operation (intrinsics and other variants) +bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const { + switch (Inst.getOpcode()) { + case X86::AND8rr: + case X86::AND16rr: + case X86::AND32rr: + case X86::AND64rr: + case X86::OR8rr: + case X86::OR16rr: + case X86::OR32rr: + case X86::OR64rr: + case X86::XOR8rr: + case X86::XOR16rr: + case X86::XOR32rr: + case X86::XOR64rr: + case X86::IMUL16rr: + case X86::IMUL32rr: + case X86::IMUL64rr: + case X86::PANDrr: + case X86::PORrr: + case X86::PXORrr: + case X86::VPANDrr: + case X86::VPANDYrr: + case X86::VPORrr: + case X86::VPORYrr: + case X86::VPXORrr: + case X86::VPXORYrr: + // Normal min/max instructions are not commutative because of NaN and signed + // zero semantics, but these are. Thus, there's no need to check for global + // relaxed math; the instructions themselves have the properties we need. + case X86::MAXCPDrr: + case X86::MAXCPSrr: + case X86::MAXCSDrr: + case X86::MAXCSSrr: + case X86::MINCPDrr: + case X86::MINCPSrr: + case X86::MINCSDrr: + case X86::MINCSSrr: + case X86::VMAXCPDrr: + case X86::VMAXCPSrr: + case X86::VMAXCPDYrr: + case X86::VMAXCPSYrr: + case X86::VMAXCSDrr: + case X86::VMAXCSSrr: + case X86::VMINCPDrr: + case X86::VMINCPSrr: + case X86::VMINCPDYrr: + case X86::VMINCPSYrr: + case X86::VMINCSDrr: + case X86::VMINCSSrr: + return true; + case X86::ADDPDrr: + case X86::ADDPSrr: case X86::ADDSDrr: case X86::ADDSSrr: - case X86::VADDSDrr: - case X86::VADDSSrr: + case X86::MULPDrr: + case X86::MULPSrr: case X86::MULSDrr: case X86::MULSSrr: + case X86::VADDPDrr: + case X86::VADDPSrr: + case X86::VADDPDYrr: + case X86::VADDPSYrr: + case X86::VADDSDrr: + case X86::VADDSSrr: + case X86::VMULPDrr: + case X86::VMULPSrr: + case X86::VMULPDYrr: + case X86::VMULPSYrr: case X86::VMULSDrr: case X86::VMULSSrr: - return true; + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; default: return false; } } -/// Return true if the input instruction is part of a chain of dependent ops -/// that are suitable for reassociation, otherwise return false. -/// If the instruction's operands must be commuted to have a previous -/// instruction of the same type define the first source operand, Commuted will -/// be set to true. -static bool isReassocCandidate(const MachineInstr &Inst, bool &Commuted) { - // 1. The operation must be associative and commutative. - // 2. The instruction must have virtual register definitions for its - // operands in the same basic block. - // 3. The instruction must have a reassociable sibling. - if (isAssociativeAndCommutative(Inst.getOpcode()) && - hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) && - hasReassocSibling(Inst, Commuted)) - return true; - - return false; -} - -// FIXME: This has the potential to be expensive (compile time) while not -// improving the code at all. Some ways to limit the overhead: -// 1. Track successful transforms; bail out if hit rate gets too low. -// 2. 
Only enable at -O3 or some other non-default optimization level. -// 3. Pre-screen pattern candidates here: if an operand of the previous -// instruction is known to not increase the critical path, then don't match -// that pattern. -bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Patterns) const { - if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath) - return false; - - // TODO: There is nothing x86-specific here except the instruction type. - // This logic could be hoisted into the machine combiner pass itself. - - // Look for this reassociation pattern: - // B = A op X (Prev) - // C = B op Y (Root) - - bool Commute; - if (isReassocCandidate(Root, Commute)) { - // We found a sequence of instructions that may be suitable for a - // reassociation of operands to increase ILP. Specify each commutation - // possibility for the Prev instruction in the sequence and let the - // machine combiner decide if changing the operands is worthwhile. - if (Commute) { - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_YB); - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_YB); - } else { - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_BY); - Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_BY); - } - return true; - } +/// This is an architecture-specific helper function of reassociateOps. +/// Set special operand attributes for new instructions after reassociation. +void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1, + MachineInstr &OldMI2, + MachineInstr &NewMI1, + MachineInstr &NewMI2) const { + // Integer instructions define an implicit EFLAGS source register operand as + // the third source (fourth total) operand. + if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4) + return; - return false; + assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 && + "Unexpected instruction type for reassociation"); + + MachineOperand &OldOp1 = OldMI1.getOperand(3); + MachineOperand &OldOp2 = OldMI2.getOperand(3); + MachineOperand &NewOp1 = NewMI1.getOperand(3); + MachineOperand &NewOp2 = NewMI2.getOperand(3); + + assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() && + "Must have dead EFLAGS operand in reassociable instruction"); + + (void)OldOp1; + (void)OldOp2; + + assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS && + "Unexpected operand in reassociable instruction"); + + // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations + // of this pass or other passes. The EFLAGS operands must be dead in these new + // instructions because the EFLAGS operands in the original instructions must + // be dead in order for reassociation to occur. 
+ NewOp1.setIsDead(); + NewOp2.setIsDead(); } -/// Attempt the following reassociation to reduce critical path length: -/// B = A op X (Prev) -/// C = B op Y (Root) -/// ===> -/// B = X op Y -/// C = A op B -static void reassociateOps(MachineInstr &Root, MachineInstr &Prev, - MachineCombinerPattern::MC_PATTERN Pattern, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) { - MachineFunction *MF = Root.getParent()->getParent(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI); - - // This array encodes the operand index for each parameter because the - // operands may be commuted. Each row corresponds to a pattern value, - // and each column specifies the index of A, B, X, Y. - unsigned OpIdx[4][4] = { - { 1, 1, 2, 2 }, - { 1, 2, 2, 1 }, - { 2, 1, 1, 2 }, - { 2, 2, 1, 1 } - }; - - MachineOperand &OpA = Prev.getOperand(OpIdx[Pattern][0]); - MachineOperand &OpB = Root.getOperand(OpIdx[Pattern][1]); - MachineOperand &OpX = Prev.getOperand(OpIdx[Pattern][2]); - MachineOperand &OpY = Root.getOperand(OpIdx[Pattern][3]); - MachineOperand &OpC = Root.getOperand(0); - - unsigned RegA = OpA.getReg(); - unsigned RegB = OpB.getReg(); - unsigned RegX = OpX.getReg(); - unsigned RegY = OpY.getReg(); - unsigned RegC = OpC.getReg(); - - if (TargetRegisterInfo::isVirtualRegister(RegA)) - MRI.constrainRegClass(RegA, RC); - if (TargetRegisterInfo::isVirtualRegister(RegB)) - MRI.constrainRegClass(RegB, RC); - if (TargetRegisterInfo::isVirtualRegister(RegX)) - MRI.constrainRegClass(RegX, RC); - if (TargetRegisterInfo::isVirtualRegister(RegY)) - MRI.constrainRegClass(RegY, RC); - if (TargetRegisterInfo::isVirtualRegister(RegC)) - MRI.constrainRegClass(RegC, RC); - - // Create a new virtual register for the result of (X op Y) instead of - // recycling RegB because the MachineCombiner's computation of the critical - // path requires a new register definition rather than an existing one. - unsigned NewVR = MRI.createVirtualRegister(RC); - InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); - - unsigned Opcode = Root.getOpcode(); - bool KillA = OpA.isKill(); - bool KillX = OpX.isKill(); - bool KillY = OpY.isKill(); - - // Create new instructions for insertion. - MachineInstrBuilder MIB1 = - BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR) - .addReg(RegX, getKillRegState(KillX)) - .addReg(RegY, getKillRegState(KillY)); - InsInstrs.push_back(MIB1); - - MachineInstrBuilder MIB2 = - BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC) - .addReg(RegA, getKillRegState(KillA)) - .addReg(NewVR, getKillRegState(true)); - InsInstrs.push_back(MIB2); - - // Record old instructions for deletion. - DelInstrs.push_back(&Prev); - DelInstrs.push_back(&Root); +std::pair<unsigned, unsigned> +X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + return std::make_pair(TF, 0u); } -void X86InstrInfo::genAlternativeCodeSequence( - MachineInstr &Root, - MachineCombinerPattern::MC_PATTERN Pattern, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const { - MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo(); - - // Select the previous instruction in the sequence based on the input pattern. 
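As a standalone aside (plain C++, not part of the patch), this is the reassociation that the new hooks in this hunk enable for the opcodes flagged by isAssociativeAndCommutative; the integers stand in for virtual registers, and the multiply stands in for an op such as IMUL32rr.

#include <cassert>

int main() {
  int A = 7, X = 3, Y = 11;   // A is assumed to be the value that arrives late

  int B = A * X;              // Prev:  B = A op X
  int C = B * Y;              // Root:  C = B op Y  (A -> B -> C chain)

  int B2 = X * Y;             // can execute before A is available
  int C2 = A * B2;            // A now feeds only the final op

  assert(C == C2);            // associativity/commutativity keeps the result
  // For the integer forms (IMUL, AND, OR, XOR, ...) this is only legal when
  // the EFLAGS results of Prev and Root are dead, since the intermediate
  // flags generally differ after reordering.
  return 0;
}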
- MachineInstr *Prev = nullptr; - switch (Pattern) { - case MachineCombinerPattern::MC_REASSOC_AX_BY: - case MachineCombinerPattern::MC_REASSOC_XA_BY: - Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); - break; - case MachineCombinerPattern::MC_REASSOC_AX_YB: - case MachineCombinerPattern::MC_REASSOC_XA_YB: - Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg()); - } - assert(Prev && "Unknown pattern for machine combiner"); - - reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg); - return; +ArrayRef<std::pair<unsigned, const char *>> +X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace X86II; + static const std::pair<unsigned, const char *> TargetFlags[] = { + {MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"}, + {MO_PIC_BASE_OFFSET, "x86-pic-base-offset"}, + {MO_GOT, "x86-got"}, + {MO_GOTOFF, "x86-gotoff"}, + {MO_GOTPCREL, "x86-gotpcrel"}, + {MO_PLT, "x86-plt"}, + {MO_TLSGD, "x86-tlsgd"}, + {MO_TLSLD, "x86-tlsld"}, + {MO_TLSLDM, "x86-tlsldm"}, + {MO_GOTTPOFF, "x86-gottpoff"}, + {MO_INDNTPOFF, "x86-indntpoff"}, + {MO_TPOFF, "x86-tpoff"}, + {MO_DTPOFF, "x86-dtpoff"}, + {MO_NTPOFF, "x86-ntpoff"}, + {MO_GOTNTPOFF, "x86-gotntpoff"}, + {MO_DLLIMPORT, "x86-dllimport"}, + {MO_DARWIN_STUB, "x86-darwin-stub"}, + {MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"}, + {MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"}, + {MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE, "x86-darwin-hidden-nonlazy-pic-base"}, + {MO_TLVP, "x86-tlvp"}, + {MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"}, + {MO_SECREL, "x86-secrel"}}; + return makeArrayRef(TargetFlags); } namespace { diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h index bf63336..9d40334 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -23,22 +23,10 @@ #include "X86GenInstrInfo.inc" namespace llvm { + class MachineInstrBuilder; class X86RegisterInfo; class X86Subtarget; - namespace MachineCombinerPattern { - enum MC_PATTERN : int { - // These are commutative variants for reassociating a computation chain - // of the form: - // B = A op X (Prev) - // C = B op Y (Root) - MC_REASSOC_AX_BY = 0, - MC_REASSOC_AX_YB = 1, - MC_REASSOC_XA_BY = 2, - MC_REASSOC_XA_YB = 3, - }; - } // end namespace MachineCombinerPattern - namespace X86 { // X86 specific condition code. These correspond to X86_*_COND in // X86InstrInfo.td. They must be kept in synch. @@ -259,14 +247,64 @@ public: MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const override; - /// commuteInstruction - We have a few instructions that must be hacked on to - /// commute them. + /// Returns true iff the routine could find two commutable operands in the + /// given machine instruction. + /// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their + /// input values can be re-defined in this method only if the input values + /// are not pre-defined, which is designated by the special value + /// 'CommuteAnyOperandIndex' assigned to it. + /// If both of indices are pre-defined and refer to some operands, then the + /// method simply returns true if the corresponding operands are commutable + /// and returns false otherwise. 
/// - MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override; - + /// For example, calling this method this way: + /// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex; + /// findCommutedOpIndices(MI, Op1, Op2); + /// can be interpreted as a query asking to find an operand that would be + /// commutable with the operand#1. bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; + /// Returns true if the routine could find two commutable operands + /// in the given FMA instruction. Otherwise, returns false. + /// + /// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments. + /// The output indices of the commuted operands are returned in these + /// arguments. Also, the input values of these arguments may be preset either + /// to indices of operands that must be commuted or be equal to a special + /// value 'CommuteAnyOperandIndex' which means that the corresponding + /// operand index is not set and this method is free to pick any of + /// available commutable operands. + /// + /// For example, calling this method this way: + /// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex; + /// findFMA3CommutedOpIndices(MI, Idx1, Idx2); + /// can be interpreted as a query asking if the operand #1 can be swapped + /// with any other available operand (e.g. operand #2, operand #3, etc.). + /// + /// The returned FMA opcode may differ from the opcode in the given MI. + /// For example, commuting the operands #1 and #3 in the following FMA + /// FMA213 #1, #2, #3 + /// results into instruction with adjusted opcode: + /// FMA231 #3, #2, #1 + bool findFMA3CommutedOpIndices(MachineInstr *MI, + unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const; + + /// Returns an adjusted FMA opcode that must be used in FMA instruction that + /// performs the same computations as the given MI but which has the operands + /// \p SrcOpIdx1 and \p SrcOpIdx2 commuted. + /// It may return 0 if it is unsafe to commute the operands. + /// + /// The returned FMA opcode may differ from the opcode in the given \p MI. + /// For example, commuting the operands #1 and #3 in the following FMA + /// FMA213 #1, #2, #3 + /// results into instruction with adjusted opcode: + /// FMA231 #3, #2, #1 + unsigned getFMA3OpcodeToCommuteOperands(MachineInstr *MI, + unsigned SrcOpIdx1, + unsigned SrcOpIdx2) const; + // Branch analysis. bool isUnpredicatedTerminator(const MachineInstr* MI) const override; bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, @@ -342,11 +380,6 @@ public: MachineBasicBlock::iterator InsertPt, MachineInstr *LoadMI) const override; - /// canFoldMemoryOperand - Returns true if the specified load / store is - /// folding is possible. - bool canFoldMemoryOperand(const MachineInstr *, - ArrayRef<unsigned>) const override; - /// unfoldMemoryOperand - Separate a single instruction which folded a load or /// a store or a load and a store into two or more instruction. If this is /// possible, returns true as well as the new instructions by reference. @@ -406,10 +439,9 @@ public: bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; - static bool isX86_64ExtendedReg(const MachineOperand &MO) { - if (!MO.isReg()) return false; - return X86II::isX86_64ExtendedReg(MO.getReg()); - } + /// True if MI has a condition code def, e.g. EFLAGS, that is + /// not marked dead. 
+ bool hasLiveCondCodeDef(MachineInstr *MI) const; /// getGlobalBaseReg - Return a virtual register initialized with the /// the global base register value. Output instructions required to @@ -452,26 +484,19 @@ public: const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *UseMI, unsigned UseIdx) const override; - bool useMachineCombiner() const override { return true; } - - /// Return true when there is potentially a faster code sequence - /// for an instruction chain ending in <Root>. All potential patterns are - /// output in the <Pattern> array. - bool getMachineCombinerPatterns( - MachineInstr &Root, - SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &P) const override; - - /// When getMachineCombinerPatterns() finds a pattern, this function generates - /// the instructions that could replace the original code sequence. - void genAlternativeCodeSequence( - MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P, - SmallVectorImpl<MachineInstr *> &InsInstrs, - SmallVectorImpl<MachineInstr *> &DelInstrs, - DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; + + bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; + + bool hasReassociableOperands(const MachineInstr &Inst, + const MachineBasicBlock *MBB) const override; + + void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, + MachineInstr &NewMI1, + MachineInstr &NewMI2) const override; /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2 if having two register operands, and the value it @@ -500,16 +525,49 @@ public: unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const override; + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + +protected: + /// Commutes the operands in the given instruction by changing the operands + /// order and/or changing the instruction's opcode and/or the immediate value + /// operand. + /// + /// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands + /// to be commuted. + /// + /// Do not call this method for a non-commutable instruction or + /// non-commutable operands. + /// Even though the instruction is commutable, the method may still + /// fail to commute the operands, null pointer is returned in such cases. + MachineInstr *commuteInstructionImpl(MachineInstr *MI, bool NewMI, + unsigned CommuteOpIdx1, + unsigned CommuteOpIdx2) const override; + private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const; + /// Handles memory folding for special case instructions, for instance those + /// requiring custom manipulation of the address. + MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr *MI, + unsigned OpNum, + ArrayRef<MachineOperand> MOs, + MachineBasicBlock::iterator InsertPt, + unsigned Size, unsigned Align) const; + /// isFrameOperand - Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. bool isFrameOperand(const MachineInstr *MI, unsigned int Op, int &FrameIndex) const; + + /// Expand the MOVImmSExti8 pseudo-instructions. 
+ bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB) const; }; } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.td b/contrib/llvm/lib/Target/X86/X86InstrInfo.td index 52bab9c..f4ca2b8 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.td +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.td @@ -106,8 +106,6 @@ def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; -def SDT_X86WIN_FTOL : SDTypeProfile<0, 1, [SDTCisFP<0>]>; - def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>; @@ -158,6 +156,8 @@ def X86cas16 : SDNode<"X86ISD::LCMPXCHG16_DAG", SDTX86caspair, def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +def X86iret : SDNode<"X86ISD::IRET", SDTX86Ret, + [SDNPHasChain, SDNPOptInGlue]>; def X86vastart_save_xmm_regs : SDNode<"X86ISD::VASTART_SAVE_XMM_REGS", @@ -250,9 +250,6 @@ def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA, def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; -def X86WinFTOL : SDNode<"X86ISD::WIN_FTOL", SDT_X86WIN_FTOL, - [SDNPHasChain, SDNPOutGlue]>; - //===----------------------------------------------------------------------===// // X86 Operand Definitions. // @@ -344,18 +341,21 @@ def vy64xmem : X86VMemOperand<VR256X, "printi64mem", X86MemVY64XOperand>; def vz32mem : X86VMemOperand<VR512, "printi32mem", X86MemVZ32Operand>; def vz64mem : X86VMemOperand<VR512, "printi64mem", X86MemVZ64Operand>; -// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of -// plain GR64, so that it doesn't potentially require a REX prefix. -def i8mem_NOREX : Operand<i64> { +// A version of i8mem for use on x86-64 and x32 that uses a NOREX GPR instead +// of a plain GPR, so that it doesn't potentially require a REX prefix. +def ptr_rc_norex : PointerLikeRegClass<2>; +def ptr_rc_norex_nosp : PointerLikeRegClass<3>; + +def i8mem_NOREX : Operand<iPTR> { let PrintMethod = "printi8mem"; - let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm); + let MIOperandInfo = (ops ptr_rc_norex, i8imm, ptr_rc_norex_nosp, i32imm, i8imm); let ParserMatchClass = X86Mem8AsmOperand; let OperandType = "OPERAND_MEMORY"; } // GPRs available for tailcall. // It represents GR32_TC, GR64_TC or GR64_TCW64. -def ptr_rc_tailcall : PointerLikeRegClass<2>; +def ptr_rc_tailcall : PointerLikeRegClass<4>; // Special i32mem for addresses of load folding tail calls. These are not // allowed to use callee-saved registers since they must be scheduled @@ -697,34 +697,34 @@ def lea64mem : Operand<i64> { // X86 Complex Pattern Definitions. // -// Define X86 specific addressing mode. -def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], [SDNPWantParent]>; -def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr", +// Define X86-specific addressing mode. +def addr : ComplexPattern<iPTR, 5, "selectAddr", [], [SDNPWantParent]>; +def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex], []>; // In 64-bit mode 32-bit LEAs can use RIP-relative addressing. 
-def lea64_32addr : ComplexPattern<i32, 5, "SelectLEA64_32Addr", +def lea64_32addr : ComplexPattern<i32, 5, "selectLEA64_32Addr", [add, sub, mul, X86mul_imm, shl, or, frameindex, X86WrapperRIP], []>; -def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", +def tls32addr : ComplexPattern<i32, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def tls32baseaddr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", +def tls32baseaddr : ComplexPattern<i32, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def lea64addr : ComplexPattern<i64, 5, "SelectLEAAddr", +def lea64addr : ComplexPattern<i64, 5, "selectLEAAddr", [add, sub, mul, X86mul_imm, shl, or, frameindex, X86WrapperRIP], []>; -def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", +def tls64addr : ComplexPattern<i64, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def tls64baseaddr : ComplexPattern<i64, 5, "SelectTLSADDRAddr", +def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr", [tglobaltlsaddr], []>; -def vectoraddr : ComplexPattern<iPTR, 5, "SelectVectorAddr", [],[SDNPWantParent]>; +def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>; //===----------------------------------------------------------------------===// // X86 Instruction Predicate Definitions. @@ -767,12 +767,21 @@ def HasDQI : Predicate<"Subtarget->hasDQI()">, def NoDQI : Predicate<"!Subtarget->hasDQI()">; def HasBWI : Predicate<"Subtarget->hasBWI()">, AssemblerPredicate<"FeatureBWI", "AVX-512 BW ISA">; +def NoBWI : Predicate<"!Subtarget->hasBWI()">; def HasVLX : Predicate<"Subtarget->hasVLX()">, AssemblerPredicate<"FeatureVLX", "AVX-512 VL ISA">; def NoVLX : Predicate<"!Subtarget->hasVLX()">; +def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">; +def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; +def PKU : Predicate<"!Subtarget->hasPKU()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; +def HasFXSR : Predicate<"Subtarget->hasFXSR()">; +def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">; +def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">; +def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">; +def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">; def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; def HasFMA : Predicate<"Subtarget->hasFMA()">; def UseFMAOnAVX : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">; @@ -794,6 +803,7 @@ def HasSHA : Predicate<"Subtarget->hasSHA()">; def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">; def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">; def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">; +def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasMPX : Predicate<"Subtarget->hasMPX()">; @@ -812,6 +822,8 @@ def In32BitMode : Predicate<"Subtarget->is32Bit()">, AssemblerPredicate<"Mode32Bit", "32-bit mode">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">; +def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||" + "Subtarget->getFrameLowering()->hasFP(*MF)">; def IsPS4 : Predicate<"Subtarget->isTargetPS4()">; def NotPS4 : Predicate<"!Subtarget->isTargetPS4()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; @@ -825,6 +837,7 @@ def NearData : Predicate<"TM.getCodeModel() == CodeModel::Small ||" def IsStatic : Predicate<"TM.getRelocationModel() == Reloc::Static">; def IsNotPIC : 
Predicate<"TM.getRelocationModel() != Reloc::PIC_">; def OptForSize : Predicate<"OptForSize">; +def OptForMinSize : Predicate<"OptForMinSize">; def OptForSpeed : Predicate<"!OptForSize">; def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">; @@ -867,20 +880,54 @@ def X86_COND_E_OR_NE : ImmLeaf<i8, [{ }]>; -def i16immSExt8 : ImmLeaf<i16, [{ return Imm == (int8_t)Imm; }]>; -def i32immSExt8 : ImmLeaf<i32, [{ return Imm == (int8_t)Imm; }]>; -def i64immSExt8 : ImmLeaf<i64, [{ return Imm == (int8_t)Imm; }]>; +def i16immSExt8 : ImmLeaf<i16, [{ return isInt<8>(Imm); }]>; +def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>; +def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>; + +// If we have multiple users of an immediate, it's much smaller to reuse +// the register, rather than encode the immediate in every instruction. +// This has the risk of increasing register pressure from stretched live +// ranges, however, the immediates should be trivial to rematerialize by +// the RA in the event of high register pressure. +// TODO : This is currently enabled for stores and binary ops. There are more +// cases for which this can be enabled, though this catches the bulk of the +// issues. +// TODO2 : This should really also be enabled under O2, but there's currently +// an issue with RA where we don't pull the constants into their users +// when we rematerialize them. I'll follow-up on enabling O2 after we fix that +// issue. +// TODO3 : This is currently limited to single basic blocks (DAG creation +// pulls block immediates to the top and merges them if necessary). +// Eventually, it would be nice to allow ConstantHoisting to merge constants +// globally for potentially added savings. +// +def imm8_su : PatLeaf<(i8 imm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def imm16_su : PatLeaf<(i16 imm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def imm32_su : PatLeaf<(i32 imm), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; + +def i16immSExt8_su : PatLeaf<(i16immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; +def i32immSExt8_su : PatLeaf<(i32immSExt8), [{ + return !shouldAvoidImmediateInstFormsForSize(N); +}]>; -def i64immSExt32 : ImmLeaf<i64, [{ return Imm == (int32_t)Imm; }]>; +def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>; // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit // unsigned field. -def i64immZExt32 : ImmLeaf<i64, [{ return (uint64_t)Imm == (uint32_t)Imm; }]>; +def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>; def i64immZExt32SExt8 : ImmLeaf<i64, [{ - return (uint64_t)Imm == (uint32_t)Imm && (int32_t)Imm == (int8_t)Imm; + return isUInt<32>(Imm) && isInt<8>(static_cast<int32_t>(Imm)); }]>; // Helper fragments for loads. 
@@ -914,11 +961,12 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{ return false; }]>; -def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; -def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; -def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; -def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; -def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; +def loadi8 : PatFrag<(ops node:$ptr), (i8 (load node:$ptr))>; +def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; +def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; +def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; +def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; +def loadf128 : PatFrag<(ops node:$ptr), (f128 (load node:$ptr))>; def sextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>; def sextloadi32i8 : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>; @@ -1020,12 +1068,8 @@ def PUSH32r : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[], IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[], IIC_PUSH_REG>, OpSize16; -def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[], - IIC_PUSH_MEM>, OpSize16; def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[], IIC_PUSH_REG>, OpSize32, Requires<[Not64BitMode]>; -def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], - IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>; def PUSH16i8 : Ii8<0x6a, RawFrm, (outs), (ins i16i8imm:$imm), "push{w}\t$imm", [], IIC_PUSH_IMM>, OpSize16; @@ -1039,6 +1083,14 @@ def PUSHi32 : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), "push{l}\t$imm", [], IIC_PUSH_IMM>, OpSize32, Requires<[Not64BitMode]>; } // mayStore, SchedRW + +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { +def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[], + IIC_PUSH_MEM>, OpSize16; +def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[], + IIC_PUSH_MEM>, OpSize32, Requires<[Not64BitMode]>; +} // mayLoad, mayStore, SchedRW + } let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, hasSideEffects=0, @@ -1071,9 +1123,11 @@ def PUSH64r : I<0x50, AddRegFrm, (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", [], IIC_PUSH_REG>, OpSize32, Requires<[In64BitMode]>; +} // mayStore, SchedRW +let mayLoad = 1, mayStore = 1, SchedRW = [WriteRMW] in { def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", [], IIC_PUSH_MEM>, OpSize32, Requires<[In64BitMode]>; -} // mayStore, SchedRW +} // mayLoad, mayStore, SchedRW } let Defs = [RSP], Uses = [RSP], hasSideEffects = 0, mayStore = 1, @@ -1275,13 +1329,13 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), let SchedRW = [WriteStore] in { def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src), "mov{b}\t{$src, $dst|$dst, $src}", - [(store (i8 imm:$src), addr:$dst)], IIC_MOV_MEM>; + [(store (i8 imm8_su:$src), addr:$dst)], IIC_MOV_MEM>; def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src), "mov{w}\t{$src, $dst|$dst, $src}", - [(store (i16 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16; + [(store (i16 imm16_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize16; def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), 
"mov{l}\t{$src, $dst|$dst, $src}", - [(store (i32 imm:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32; + [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32; def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>; @@ -1457,10 +1511,12 @@ def MOV8rm_NOREX : I<0x8A, MRMSrcMem, let SchedRW = [WriteALU] in { let Defs = [EFLAGS], Uses = [AH] in def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", - [(set EFLAGS, (X86sahf AH))], IIC_AHF>; + [(set EFLAGS, (X86sahf AH))], IIC_AHF>, + Requires<[HasLAHFSAHF]>; let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", [], - IIC_AHF>; // AH = flags + IIC_AHF>, // AH = flags + Requires<[HasLAHFSAHF]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -1894,37 +1950,38 @@ def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", [], IIC_CLTS>, TB; } // Table lookup instructions +let Uses = [AL,EBX], Defs = [AL], hasSideEffects = 0, mayLoad = 1 in def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", [], IIC_XLAT>, Sched<[WriteLoad]>; let SchedRW = [WriteMicrocoded] in { // ASCII Adjust After Addition -// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAA : I<0x37, RawFrm, (outs), (ins), "aaa", [], IIC_AAA>, Requires<[Not64BitMode]>; // ASCII Adjust AX Before Division -// sets AL, AH and EFLAGS and uses AL and AH +let Uses = [AX], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAD8i8 : Ii8<0xD5, RawFrm, (outs), (ins i8imm:$src), "aad\t$src", [], IIC_AAD>, Requires<[Not64BitMode]>; // ASCII Adjust AX After Multiply -// sets AL, AH and EFLAGS and uses AL +let Uses = [AL], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAM8i8 : Ii8<0xD4, RawFrm, (outs), (ins i8imm:$src), "aam\t$src", [], IIC_AAM>, Requires<[Not64BitMode]>; // ASCII Adjust AL After Subtraction - sets -// sets AL, AH and CF and AF of EFLAGS and uses AL and AF of EFLAGS +let Uses = [AL,EFLAGS], Defs = [AX,EFLAGS], hasSideEffects = 0 in def AAS : I<0x3F, RawFrm, (outs), (ins), "aas", [], IIC_AAS>, Requires<[Not64BitMode]>; // Decimal Adjust AL after Addition -// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS +let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in def DAA : I<0x27, RawFrm, (outs), (ins), "daa", [], IIC_DAA>, Requires<[Not64BitMode]>; // Decimal Adjust AL after Subtraction -// sets AL, CF and AF of EFLAGS and uses AL, CF and AF of EFLAGS +let Uses = [AL,EFLAGS], Defs = [AL,EFLAGS], hasSideEffects = 0 in def DAS : I<0x2F, RawFrm, (outs), (ins), "das", [], IIC_DAS>, Requires<[Not64BitMode]>; } // SchedRW @@ -2357,6 +2414,32 @@ defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>; } // HasTBM, EFLAGS //===----------------------------------------------------------------------===// +// MONITORX/MWAITX Instructions +// +let SchedRW = [WriteSystem] in { +let Uses = [EAX, ECX, EDX] in +def MONITORXrrr : I<0x01, MRM_FA, (outs), (ins), "monitorx", [], + IIC_SSE_MONITOR>, TB; +let Uses = [ECX, EAX, EBX] in +def MWAITXrr : I<0x01, MRM_FB, (outs), (ins), "mwaitx", [], IIC_SSE_MWAIT>, + TB; +} // SchedRW + +def : InstAlias<"mwaitx\t{%eax, %ecx, %ebx|ebx, ecx, eax}", (MWAITXrr)>, Requires<[Not64BitMode]>; +def : InstAlias<"mwaitx\t{%rax, %rcx, %rbx|rbx, rcx, rax}", (MWAITXrr)>, Requires<[In64BitMode]>; + +def : InstAlias<"monitorx\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORXrrr)>, 
+ Requires<[Not64BitMode]>; +def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>, + Requires<[In64BitMode]>; + +//===----------------------------------------------------------------------===// +// CLZERO Instruction +// +let Uses = [EAX] in +def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB; + +//===----------------------------------------------------------------------===// // Pattern fragments to auto generate TBM instructions. //===----------------------------------------------------------------------===// @@ -2498,8 +2581,8 @@ def : MnemonicAlias<"lret", "lretl", "att">, Requires<[Not16BitMode]>; def : MnemonicAlias<"leavel", "leave", "att">, Requires<[Not64BitMode]>; def : MnemonicAlias<"leaveq", "leave", "att">, Requires<[In64BitMode]>; -def : MnemonicAlias<"loopz", "loope", "att">; -def : MnemonicAlias<"loopnz", "loopne", "att">; +def : MnemonicAlias<"loopz", "loope">; +def : MnemonicAlias<"loopnz", "loopne">; def : MnemonicAlias<"pop", "popw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"pop", "popl", "att">, Requires<[In32BitMode]>; @@ -2532,14 +2615,15 @@ def : MnemonicAlias<"pusha", "pushaw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"popa", "popal", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"pusha", "pushal", "att">, Requires<[In32BitMode]>; -def : MnemonicAlias<"repe", "rep", "att">; -def : MnemonicAlias<"repz", "rep", "att">; -def : MnemonicAlias<"repnz", "repne", "att">; +def : MnemonicAlias<"repe", "rep">; +def : MnemonicAlias<"repz", "rep">; +def : MnemonicAlias<"repnz", "repne">; def : MnemonicAlias<"ret", "retw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"ret", "retl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"ret", "retq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"sal", "shl", "intel">; def : MnemonicAlias<"salb", "shlb", "att">; def : MnemonicAlias<"salw", "shlw", "att">; def : MnemonicAlias<"sall", "shll", "att">; @@ -2579,14 +2663,14 @@ def : MnemonicAlias<"fcmova", "fcmovnbe", "att">; def : MnemonicAlias<"fcmovnae", "fcmovb", "att">; def : MnemonicAlias<"fcmovna", "fcmovbe", "att">; def : MnemonicAlias<"fcmovae", "fcmovnb", "att">; -def : MnemonicAlias<"fcomip", "fcompi", "att">; +def : MnemonicAlias<"fcomip", "fcompi">; def : MnemonicAlias<"fildq", "fildll", "att">; def : MnemonicAlias<"fistpq", "fistpll", "att">; def : MnemonicAlias<"fisttpq", "fisttpll", "att">; def : MnemonicAlias<"fldcww", "fldcw", "att">; def : MnemonicAlias<"fnstcww", "fnstcw", "att">; def : MnemonicAlias<"fnstsww", "fnstsw", "att">; -def : MnemonicAlias<"fucomip", "fucompi", "att">; +def : MnemonicAlias<"fucomip", "fucompi">; def : MnemonicAlias<"fwait", "wait">; def : MnemonicAlias<"fxsaveq", "fxsave64", "att">; @@ -2594,7 +2678,9 @@ def : MnemonicAlias<"fxrstorq", "fxrstor64", "att">; def : MnemonicAlias<"xsaveq", "xsave64", "att">; def : MnemonicAlias<"xrstorq", "xrstor64", "att">; def : MnemonicAlias<"xsaveoptq", "xsaveopt64", "att">; - +def : MnemonicAlias<"xrstorsq", "xrstors64", "att">; +def : MnemonicAlias<"xsavecq", "xsavec64", "att">; +def : MnemonicAlias<"xsavesq", "xsaves64", "att">; class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond, string VariantName> @@ -2640,8 +2726,8 @@ defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">; //===----------------------------------------------------------------------===// // aad/aam default to base 10 if no operand is specified. 
-def : InstAlias<"aad", (AAD8i8 10)>; -def : InstAlias<"aam", (AAM8i8 10)>; +def : InstAlias<"aad", (AAD8i8 10)>, Requires<[Not64BitMode]>; +def : InstAlias<"aam", (AAM8i8 10)>, Requires<[Not64BitMode]>; // Disambiguate the mem/imm form of bt-without-a-suffix as btl. // Likewise for btc/btr/bts. @@ -2719,8 +2805,10 @@ def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>; // Various unary fpstack operations default to operating on on ST1. // For example, "fxch" -> "fxch %st(1)" def : InstAlias<"faddp", (ADD_FPrST0 ST1), 0>; +def: InstAlias<"fadd", (ADD_FPrST0 ST1), 0>; def : InstAlias<"fsub{|r}p", (SUBR_FPrST0 ST1), 0>; def : InstAlias<"fsub{r|}p", (SUB_FPrST0 ST1), 0>; +def : InstAlias<"fmul", (MUL_FPrST0 ST1), 0>; def : InstAlias<"fmulp", (MUL_FPrST0 ST1), 0>; def : InstAlias<"fdiv{|r}p", (DIVR_FPrST0 ST1), 0>; def : InstAlias<"fdiv{r|}p", (DIV_FPrST0 ST1), 0>; @@ -2798,20 +2886,20 @@ def : InstAlias<"jmp {*}$dst", (JMP16m i16mem:$dst), 0>, Requires<[In16Bit // "imul <imm>, B" is an alias for "imul <imm>, B, B". -def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; -def : InstAlias<"imulw {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; -def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; -def : InstAlias<"imull {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; -def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; -def : InstAlias<"imulq {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; +def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri GR16:$r, GR16:$r, i16imm:$imm), 0>; +def : InstAlias<"imul{w} {$imm, $r|$r, $imm}", (IMUL16rri8 GR16:$r, GR16:$r, i16i8imm:$imm), 0>; +def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri GR32:$r, GR32:$r, i32imm:$imm), 0>; +def : InstAlias<"imul{l} {$imm, $r|$r, $imm}", (IMUL32rri8 GR32:$r, GR32:$r, i32i8imm:$imm), 0>; +def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri32 GR64:$r, GR64:$r, i64i32imm:$imm), 0>; +def : InstAlias<"imul{q} {$imm, $r|$r, $imm}", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm), 0>; // inb %dx -> inb %al, %dx def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>; def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>; def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>; -def : InstAlias<"inb\t$port", (IN8ri i8imm:$port), 0>; -def : InstAlias<"inw\t$port", (IN16ri i8imm:$port), 0>; -def : InstAlias<"inl\t$port", (IN32ri i8imm:$port), 0>; +def : InstAlias<"inb\t$port", (IN8ri u8imm:$port), 0>; +def : InstAlias<"inw\t$port", (IN16ri u8imm:$port), 0>; +def : InstAlias<"inl\t$port", (IN32ri u8imm:$port), 0>; // jmp and call aliases for lcall and ljmp. jmp $42,$5 -> ljmp @@ -2861,9 +2949,9 @@ def : InstAlias<"movzx {$src, $dst|$dst, $src}", (MOVZX64rr16_Q GR64:$dst, GR16: def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>; def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>; def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>; -def : InstAlias<"outb\t$port", (OUT8ir i8imm:$port), 0>; -def : InstAlias<"outw\t$port", (OUT16ir i8imm:$port), 0>; -def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>; +def : InstAlias<"outb\t$port", (OUT8ir u8imm:$port), 0>; +def : InstAlias<"outw\t$port", (OUT16ir u8imm:$port), 0>; +def : InstAlias<"outl\t$port", (OUT32ir u8imm:$port), 0>; // 'sldt <mem>' can be encoded with either sldtw or sldtq with the same // effect (both store to a 16-bit mem). 
Force to sldtw to avoid ambiguity @@ -2940,3 +3028,34 @@ def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>; def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>; + +// These aliases exist to get the parser to prioritize matching 8-bit +// immediate encodings over matching the implicit ax/eax/rax encodings. By +// explicitly mentioning the A register here, these entries will be ordered +// first due to the more explicit immediate type. +def : InstAlias<"adc{w}\t{$imm, %ax|ax, $imm}", (ADC16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"add{w}\t{$imm, %ax|ax, $imm}", (ADD16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"and{w}\t{$imm, %ax|ax, $imm}", (AND16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"cmp{w}\t{$imm, %ax|ax, $imm}", (CMP16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"or{w}\t{$imm, %ax|ax, $imm}", (OR16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sbb{w}\t{$imm, %ax|ax, $imm}", (SBB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"sub{w}\t{$imm, %ax|ax, $imm}", (SUB16ri8 AX, i16i8imm:$imm), 0>; +def : InstAlias<"xor{w}\t{$imm, %ax|ax, $imm}", (XOR16ri8 AX, i16i8imm:$imm), 0>; + +def : InstAlias<"adc{l}\t{$imm, %eax|eax, $imm}", (ADC32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"add{l}\t{$imm, %eax|eax, $imm}", (ADD32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"and{l}\t{$imm, %eax|eax, $imm}", (AND32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"cmp{l}\t{$imm, %eax|eax, $imm}", (CMP32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"or{l}\t{$imm, %eax|eax, $imm}", (OR32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sbb{l}\t{$imm, %eax|eax, $imm}", (SBB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"sub{l}\t{$imm, %eax|eax, $imm}", (SUB32ri8 EAX, i32i8imm:$imm), 0>; +def : InstAlias<"xor{l}\t{$imm, %eax|eax, $imm}", (XOR32ri8 EAX, i32i8imm:$imm), 0>; + +def : InstAlias<"adc{q}\t{$imm, %rax|rax, $imm}", (ADC64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"add{q}\t{$imm, %rax|rax, $imm}", (ADD64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"and{q}\t{$imm, %rax|rax, $imm}", (AND64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"cmp{q}\t{$imm, %rax|rax, $imm}", (CMP64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"or{q}\t{$imm, %rax|rax, $imm}", (OR64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sbb{q}\t{$imm, %rax|rax, $imm}", (SBB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"sub{q}\t{$imm, %rax|rax, $imm}", (SUB64ri8 RAX, i64i8imm:$imm), 0>; +def : InstAlias<"xor{q}\t{$imm, %rax|rax, $imm}", (XOR64ri8 RAX, i64i8imm:$imm), 0>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrMMX.td b/contrib/llvm/lib/Target/X86/X86InstrMMX.td index eaa7894..11dc1e7 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrMMX.td +++ b/contrib/llvm/lib/Target/X86/X86InstrMMX.td @@ -249,6 +249,7 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), (MMX_X86movd2w (x86mmx VR64:$src)))], IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>; +let isBitcast = 1 in def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (bitconvert GR64:$src))], @@ -262,7 +263,7 @@ def MMX_MOVD64to64rm : MMXRI<0x6E, MRMSrcMem, (outs VR64:$dst), // These are 64 bit moves, but since the OS X assembler doesn't // recognize a register-register movq, we write them as // movd. 
-let SchedRW = [WriteMove] in { +let SchedRW = [WriteMove], isBitcast = 1 in { def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", @@ -303,7 +304,7 @@ def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}", [(set VR64:$dst, (x86mmx (bitconvert - (i64 (vector_extract (v2i64 VR128:$src), + (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))))))], IIC_MMX_MOVQ_RR>; @@ -326,6 +327,7 @@ def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), } } // SchedRW +let Predicates = [HasSSE1] in def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src), "movntq\t{$src, $dst|$dst, $src}", [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)], @@ -355,6 +357,7 @@ defm MMX_PADDW : MMXI_binop_rm_int<0xFD, "paddw", int_x86_mmx_padd_w, MMX_INTALU_ITINS, 1>; defm MMX_PADDD : MMXI_binop_rm_int<0xFE, "paddd", int_x86_mmx_padd_d, MMX_INTALU_ITINS, 1>; +let Predicates = [HasSSE2] in defm MMX_PADDQ : MMXI_binop_rm_int<0xD4, "paddq", int_x86_mmx_padd_q, MMX_INTALUQ_ITINS, 1>; defm MMX_PADDSB : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, @@ -382,6 +385,7 @@ defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w, MMX_INTALU_ITINS>; defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d, MMX_INTALU_ITINS>; +let Predicates = [HasSSE2] in defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q, MMX_INTALUQ_ITINS>; @@ -408,8 +412,10 @@ defm MMX_PMULLW : MMXI_binop_rm_int<0xD5, "pmullw", int_x86_mmx_pmull_w, defm MMX_PMULHW : MMXI_binop_rm_int<0xE5, "pmulhw", int_x86_mmx_pmulh_w, MMX_PMUL_ITINS, 1>; +let Predicates = [HasSSE1] in defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, MMX_PMUL_ITINS, 1>; +let Predicates = [HasSSE2] in defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, MMX_PMUL_ITINS, 1>; let isCommutable = 1 in @@ -422,6 +428,7 @@ defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, defm MMX_PMADDUBSW : SS3I_binop_rm_int_mm<0x04, "pmaddubsw", int_x86_ssse3_pmadd_ub_sw, MMX_PMUL_ITINS>; +let Predicates = [HasSSE1] in { defm MMX_PAVGB : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, MMX_MISC_FUNC_ITINS, 1>; defm MMX_PAVGW : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, @@ -439,6 +446,7 @@ defm MMX_PMAXSW : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, defm MMX_PSADBW : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, MMX_PSADBW_ITINS, 1>; +} defm MMX_PSIGNB : SS3I_binop_rm_int_mm<0x08, "psignb", int_x86_ssse3_psign_b, MMX_MISC_FUNC_ITINS>; @@ -594,6 +602,7 @@ let Constraints = "$src1 = $dst" in { } // Extract / Insert +let Predicates = [HasSSE1] in def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR64:$src1, i32u8imm:$src2), "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -601,6 +610,7 @@ def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg, imm:$src2))], IIC_MMX_PEXTR>, Sched<[WriteShuffle]>; let Constraints = "$src1 = $dst" in { +let Predicates = [HasSSE1] in { def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src1, GR32orGR64:$src2, i32u8imm:$src3), @@ -618,8 +628,10 @@ let Constraints = "$src1 = $dst" in { imm:$src3))], IIC_MMX_PINSRW>, Sched<[WriteShuffleLd, ReadAfterLd]>; } +} // Mask creation +let Predicates = [HasSSE1] in def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR64:$src), "pmovmskb\t{$src, $dst|$dst, $src}", @@ -639,12 +651,12 @@ def : 
Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))), // Misc. let SchedRW = [WriteShuffle] in { -let Uses = [EDI] in +let Uses = [EDI], Predicates = [HasSSE1,In32BitMode] in def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)], IIC_MMX_MASKMOV>; -let Uses = [RDI] in +let Uses = [RDI], Predicates = [HasSSE1,In64BitMode] in def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), "maskmovq\t{$mask, $src|$src, $mask}", [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)], @@ -653,10 +665,6 @@ def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask), // 64-bit bit convert. let Predicates = [HasSSE2] in { -def : Pat<(x86mmx (bitconvert (i64 GR64:$src))), - (MMX_MOVD64to64rr GR64:$src)>; -def : Pat<(i64 (bitconvert (x86mmx VR64:$src))), - (MMX_MOVD64from64rr VR64:$src)>; def : Pat<(f64 (bitconvert (x86mmx VR64:$src))), (MMX_MOVQ2FR64rr VR64:$src)>; def : Pat<(x86mmx (bitconvert (f64 FR64:$src))), diff --git a/contrib/llvm/lib/Target/X86/X86InstrSSE.td b/contrib/llvm/lib/Target/X86/X86InstrSSE.td index 99386b0..7a44212 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSSE.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSSE.td @@ -330,9 +330,9 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, //===----------------------------------------------------------------------===// // A vector extract of the first f32/f64 position is a subregister copy -def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), +def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; -def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), +def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>; // A 128-bit subvector extract from the first 256-bit vector position @@ -413,6 +413,8 @@ let Predicates = [HasSSE2] in { def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; + def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>; + def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>; } // Bitcasts between 256-bit vector types. Return the original type since @@ -650,10 +652,10 @@ let Predicates = [UseAVX] in { } // Extract and store. - def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), addr:$dst), (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>; - def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst), (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>; @@ -736,7 +738,7 @@ let Predicates = [UseSSE1] in { } // Extract and store. - def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))), + def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), addr:$dst), (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>; @@ -770,7 +772,7 @@ let Predicates = [UseSSE2] in { } // Extract and store. 
- def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))), + def : Pat<(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst), (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>; @@ -935,22 +937,6 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, IIC_SSE_MOVU_P_RR>, VEX, VEX_L; } -let Predicates = [HasAVX] in { -def : Pat<(v8i32 (X86vzmovl - (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -def : Pat<(v4i64 (X86vzmovl - (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -def : Pat<(v8f32 (X86vzmovl - (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -def : Pat<(v4f64 (X86vzmovl - (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>; -} - - def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src), (VMOVUPSYmr addr:$dst, VR256:$src)>; def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src), @@ -1172,12 +1158,13 @@ multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode, multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode, string base_opc, InstrItinClass itin> { - defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + let Predicates = [UseAVX] in + defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", itin>, VEX_4V; -let Constraints = "$src1 = $dst" in - defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, + let Constraints = "$src1 = $dst" in + defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc, "\t{$src2, $dst|$dst, $src2}", itin>; } @@ -1188,29 +1175,31 @@ let AddedComplexity = 20 in { } let SchedRW = [WriteStore] in { +let Predicates = [UseAVX] in { def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (v2f64 VR128:$src), + [(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +}// UseAVX def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)), + [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movlpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract (v2f64 VR128:$src), + [(store (f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; } // SchedRW -let Predicates = [HasAVX] in { +let Predicates = [UseAVX] in { // Shuffle with VMOVLPS def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), (VMOVLPSrm VR128:$src1, addr:$src2)>; @@ -1243,7 +1232,7 @@ let Predicates = [HasAVX] in { let Predicates = [UseSSE1] in { // (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS - def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)), + def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)), (iPTR 0))), 
addr:$src1), (MOVLPSmr addr:$src1, VR128:$src2)>; @@ -1297,31 +1286,33 @@ let AddedComplexity = 20 in { let SchedRW = [WriteStore] in { // v2f64 extract element 1 is always custom lowered to unpack high to low // and extract element 0 so the non-store version isn't too horrible. +let Predicates = [UseAVX] in { def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), (bc_v2f64 (v4f32 VR128:$src))), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (v2f64 (X86Unpckh VR128:$src, VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX; +} // UseAVX def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhps\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)), (bc_v2f64 (v4f32 VR128:$src))), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movhpd\t{$src, $dst|$dst, $src}", - [(store (f64 (vector_extract + [(store (f64 (extractelt (v2f64 (X86Unpckh VR128:$src, VR128:$src)), (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>; } // SchedRW -let Predicates = [HasAVX] in { +let Predicates = [UseAVX] in { // VMOVHPS patterns def : Pat<(X86Movlhps VR128:$src1, (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), @@ -1345,7 +1336,7 @@ let Predicates = [HasAVX] in { (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (VMOVHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(store (f64 (vector_extract + def : Pat<(store (f64 (extractelt (v2f64 (X86VPermilpi VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (VMOVHPDmr addr:$dst, VR128:$src)>; @@ -1377,7 +1368,7 @@ let Predicates = [UseSSE2] in { (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (MOVHPDrm VR128:$src1, addr:$src2)>; - def : Pat<(store (f64 (vector_extract + def : Pat<(store (f64 (extractelt (v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))), (iPTR 0))), addr:$dst), (MOVHPDmr addr:$dst, VR128:$src)>; @@ -2073,14 +2064,16 @@ def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>; let Predicates = [HasAVX] in { - def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), + def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), (VCVTDQ2PSrr VR128:$src)>; - def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), + def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))), (VCVTDQ2PSrm addr:$src)>; +} - def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src), +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))), (VCVTDQ2PSrr VR128:$src)>; - def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))), + def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))), (VCVTDQ2PSrm addr:$src)>; def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))), @@ -2149,7 +2142,7 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src), def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>; -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), (VCVTTPD2DQYrr VR256:$src)>; def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), @@ -2306,7 
+2299,9 @@ let Predicates = [HasAVX] in { (VCVTDQ2PSYrr VR256:$src)>; def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))), (VCVTDQ2PSYrm addr:$src)>; +} +let Predicates = [HasAVX, NoVLX] in { // Match fround and fextend for 128/256-bit conversions def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))), (VCVTPD2PSrr VR128:$src)>; @@ -2452,9 +2447,9 @@ let Defs = [EFLAGS] in { defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, "ucomisd">, PD, VEX, VEX_LIG; let Pattern = []<dag> in { - defm VCOMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, "comiss">, PS, VEX, VEX_LIG; - defm VCOMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, "comisd">, PD, VEX, VEX_LIG; } @@ -2475,9 +2470,9 @@ let Defs = [EFLAGS] in { "ucomisd">, PD; let Pattern = []<dag> in { - defm COMISS : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load, + defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, "comiss">, PS; - defm COMISD : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load, + defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, "comisd">, PD; } @@ -2605,19 +2600,20 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, Sched<[WriteFShuffle]>; } -defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, +let Predicates = [HasAVX, NoVLX] in { + defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv4f32, SSEPackedSingle>, PS, VEX_4V; -defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, + defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L; -defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, + defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv2f64, SSEPackedDouble>, PD, VEX_4V; -defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, + defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L; - +} let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", @@ -2627,7 +2623,7 @@ let Constraints = "$src1 = $dst" in { memopv2f64, SSEPackedDouble>, PD; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (X86Shufp VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))), (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; @@ -2694,6 +2690,7 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, Sched<[WriteFShuffleLd, ReadAfterLd]>; } +let Predicates = [HasAVX, NoVLX] in { defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedSingle>, PS, VEX_4V; @@ -2719,7 +2716,7 @@ defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", SSEPackedDouble>, PD, VEX_4V, VEX_L; - +}// Predicates = [HasAVX, NoVLX] let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, VR128, 
f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", @@ -2845,8 +2842,8 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode, ValueType OpVT128, ValueType OpVT256, - OpndItins itins, bit IsCommutable = 0> { -let Predicates = [HasAVX, NoVLX] in + OpndItins itins, bit IsCommutable = 0, Predicate prd> { +let Predicates = [HasAVX, prd] in defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128, VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V; @@ -2854,7 +2851,7 @@ let Constraints = "$src1 = $dst" in defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128, memopv2i64, i128mem, itins, IsCommutable, 1>; -let Predicates = [HasAVX2, NoVLX] in +let Predicates = [HasAVX2, prd] in defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT256, VR256, loadv4i64, i256mem, itins, IsCommutable, 0>, VEX_4V, VEX_L; @@ -2863,13 +2860,13 @@ let Predicates = [HasAVX2, NoVLX] in // These are ordered here for pattern ordering requirements with the fp versions defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1>; + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1>; + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1>; + SSE_VEC_BIT_ITINS_P, 1, NoVLX>; defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 0>; + SSE_VEC_BIT_ITINS_P, 0, NoVLX>; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Logical Instructions @@ -2911,7 +2908,7 @@ let isCodeGenOnly = 1 in { // Multiclass for vectors using the X86 logical operation aliases for FP. 
multiclass sse12_fp_packed_vector_logical_alias< bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { - let Predicates = [HasAVX, NoVLX] in { + let Predicates = [HasAVX, NoVLX_Or_NoDQI] in { defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128, v4f32, f128mem, loadv4f32, SSEPackedSingle, itins, 0>, PS, VEX_4V; @@ -2923,7 +2920,7 @@ multiclass sse12_fp_packed_vector_logical_alias< defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256, v8f32, f256mem, loadv8f32, SSEPackedSingle, itins, 0>, PS, VEX_4V, VEX_L; - + defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256, v4f64, f256mem, loadv4f64, SSEPackedDouble, itins, 0>, PD, VEX_4V, VEX_L; @@ -3183,7 +3180,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE1] in { // extracted scalar math op with insert via movss def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))))), (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3198,7 +3195,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE41] in { // extracted scalar math op with insert via blend def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (i8 1))), (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3215,7 +3212,7 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { let Predicates = [HasAVX] in { // extracted scalar math op with insert via blend def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))), + (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), FR32:$src))), (i8 1))), (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; @@ -3241,7 +3238,7 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE2] in { // extracted scalar math op with insert via movsd def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; @@ -3256,7 +3253,7 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { let Predicates = [UseSSE41] in { // extracted scalar math op with insert via blend def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))), (i8 1))), (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; @@ -3271,14 +3268,14 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { let Predicates = [HasAVX] in { // extracted scalar math op with insert via movsd def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))))), (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; // extracted scalar math op with insert via blend 
def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))), + (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), FR64:$src))), (i8 1))), (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; @@ -3449,8 +3446,8 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, /// sse1_fp_unop_p - SSE1 unops in packed form. multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { -let Predicates = [HasAVX] in { + OpndItins itins, list<Predicate> prds> { +let Predicates = prds in { def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat("v", OpcodeStr, "ps\t{$src, $dst|$dst, $src}"), @@ -3546,16 +3543,16 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, // Square root. defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>, sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>; + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >; defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, - sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>; + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>; // There is no f64 version of the reciprocal approximation instructions. @@ -4018,39 +4015,43 @@ multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode, } // ExeDomain = SSEPackedInt defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX>; defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64, - SSE_INTALUQ_ITINS_P, 1>; + SSE_INTALUQ_ITINS_P, 1, NoVLX>; defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16, - SSE_INTMUL_ITINS_P, 1>; + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16, - SSE_INTMUL_ITINS_P, 1>; + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16, - SSE_INTMUL_ITINS_P, 1>; + SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX>; defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64, - SSE_INTALUQ_ITINS_P, 0>; + SSE_INTALUQ_ITINS_P, 0, NoVLX>; defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PMINUB : 
PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; +defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16, + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; // Intrinsic forms defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b, @@ -4067,26 +4068,18 @@ defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w, int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>; defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, int_x86_avx2_pmadd_wd, SSE_PMADD, 1>; -defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b, - int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>; -defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w, - int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>; -defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw, - int_x86_avx2_psad_bw, SSE_PMADD, 1>; - -let Predicates = [HasAVX2] in - def : Pat<(v32i8 (X86psadbw (v32i8 VR256:$src1), - (v32i8 VR256:$src2))), - (VPSADBWYrr VR256:$src2, VR256:$src1)>; let Predicates = [HasAVX] in - def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (VPSADBWrr VR128:$src2, VR128:$src1)>; - -def : Pat<(v16i8 (X86psadbw (v16i8 VR128:$src1), - (v16i8 VR128:$src2))), - (PSADBWrr VR128:$src2, VR128:$src1)>; +defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128, + loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V; +let Predicates = [HasAVX2] in +defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256, + loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>, + VEX_4V, VEX_L; +let Constraints = "$src1 = $dst" in +defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128, + memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>; let Predicates = [HasAVX] in defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, @@ -4105,9 +4098,6 @@ defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, //===---------------------------------------------------------------------===// let Predicates = [HasAVX, NoVLX] in { -defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR128, v8i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSLLD : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; @@ -4115,9 +4105,6 @@ defm VPSLLQ : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, VR128, v2i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR128, v8i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRLD : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; @@ -4125,14 +4112,26 @@ defm VPSRLQ : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, VR128, v2i64, v2i64, 
bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; -defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR128, v8i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; defm VPSRAD : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, VR128, v4i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +} // Predicates = [HasAVX, NoVLX] -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { +defm VPSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRLW : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +defm VPSRAW : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR128, v8i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V; +} // Predicates = [HasAVX, NoVLX_Or_NoBWI] + + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] , + Predicates = [HasAVX, NoVLX_Or_NoBWI]in { // 128-bit logical shifts. def VPSLLDQri : PDIi8<0x73, MRM7r, (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), @@ -4147,13 +4146,9 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in { (v2i64 (X86vshrdq VR128:$src1, (i8 imm:$src2))))]>, VEX_4V; // PSRADQri doesn't exist in SSE[1-3]. -} -} // Predicates = [HasAVX] +} // Predicates = [HasAVX, NoVLX_Or_NoBWI] let Predicates = [HasAVX2, NoVLX] in { -defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, - VR256, v16i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli, VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; @@ -4161,9 +4156,6 @@ defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli, VR256, v4i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, - VR256, v16i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli, VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; @@ -4171,14 +4163,25 @@ defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli, VR256, v4i64, v2i64, bc_v2i64, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; -defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, - VR256, v16i16, v8i16, bc_v8i16, loadv2i64, - SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai, VR256, v8i32, v4i32, bc_v4i32, loadv2i64, SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +}// Predicates = [HasAVX2, NoVLX] -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { +defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai, + VR256, v16i16, v8i16, bc_v8i16, loadv2i64, + SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L; +}// 
Predicates = [HasAVX2, NoVLX_Or_NoBWI] + +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 , + Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { // 256-bit logical shifts. def VPSLLDQYri : PDIi8<0x73, MRM7r, (outs VR256:$dst), (ins VR256:$src1, u8imm:$src2), @@ -4193,8 +4196,7 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { (v4i64 (X86vshrdq VR256:$src1, (i8 imm:$src2))))]>, VEX_4V, VEX_L; // PSRADQYri doesn't exist in SSE[1-3]. -} -} // Predicates = [HasAVX2] +} // Predicates = [HasAVX2, NoVLX_Or_NoBWI] let Constraints = "$src1 = $dst" in { defm PSLLW : PDI_binop_rmi<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli, @@ -4247,17 +4249,17 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift], hasSideEffects = 0 in { //===---------------------------------------------------------------------===// defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>; defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32, - SSE_INTALU_ITINS_P, 1>; + SSE_INTALU_ITINS_P, 1, NoVLX>; defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>; defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, - SSE_INTALU_ITINS_P, 0>; + SSE_INTALU_ITINS_P, 0, NoVLX>; //===---------------------------------------------------------------------===// // SSE2 - Packed Integer Shuffle Instructions @@ -4511,40 +4513,43 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in { + +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, bc_v16i8, loadv2i64, 0>, VEX_4V; defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, bc_v8i16, loadv2i64, 0>, VEX_4V; - defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, - bc_v4i32, loadv2i64, 0>, VEX_4V; - defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, - bc_v2i64, loadv2i64, 0>, VEX_4V; - defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, bc_v16i8, loadv2i64, 0>, VEX_4V; defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, bc_v8i16, loadv2i64, 0>, VEX_4V; +} +let Predicates = [HasAVX, NoVLX] in { + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, + bc_v4i32, loadv2i64, 0>, VEX_4V; + defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, + bc_v2i64, loadv2i64, 0>, VEX_4V; defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, bc_v4i32, loadv2i64, 0>, VEX_4V; defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, bc_v2i64, loadv2i64, 0>, VEX_4V; } -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl, bc_v32i8>, VEX_4V, VEX_L; defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl, bc_v16i16>, VEX_4V, VEX_L; - defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, - bc_v8i32>, VEX_4V, VEX_L; - defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, - 
bc_v4i64>, VEX_4V, VEX_L; - defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh, bc_v32i8>, VEX_4V, VEX_L; defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh, bc_v16i16>, VEX_4V, VEX_L; +} +let Predicates = [HasAVX2, NoVLX] in { + defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl, + bc_v8i32>, VEX_4V, VEX_L; + defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, + bc_v4i64>, VEX_4V, VEX_L; defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh, bc_v8i32>, VEX_4V, VEX_L; defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, @@ -4600,7 +4605,7 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> { } // Extract -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in def VPEXTRWri : Ii8<0xC5, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2), "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -4615,7 +4620,7 @@ def PEXTRWri : PDIi8<0xC5, MRMSrcReg, Sched<[WriteShuffleLd, ReadAfterLd]>; // Insert -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V; let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in @@ -4683,7 +4688,7 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask), } // ExeDomain = SSEPackedInt //===---------------------------------------------------------------------===// -// SSE2 - Move Doubleword +// SSE2 - Move Doubleword/Quadword //===---------------------------------------------------------------------===// //===---------------------------------------------------------------------===// @@ -4770,23 +4775,23 @@ let isCodeGenOnly = 1 in { // def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>; def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (vector_extract (v4i32 VR128:$src), + [(store (i32 (extractelt (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (vector_extract (v4i32 VR128:$src), + [(set GR32:$dst, (extractelt (v4i32 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (vector_extract (v4i32 VR128:$src), + [(store (i32 (extractelt (v4i32 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; @@ -4808,24 +4813,25 @@ def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))), let SchedRW = [WriteMove] in { def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), - (iPTR 0)))], + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), + (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX; def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (vector_extract (v2i64 VR128:$src), + [(set GR64:$dst, (extractelt (v2i64 VR128:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>; } //SchedRW let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in -def VMOVPQIto64rm : VRS2I<0x7E, 
MRMDestMem, (outs i64mem:$dst), - (ins VR128:$src), "movq\t{$src, $dst|$dst, $src}", +def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs), + (ins i64mem:$dst, VR128:$src), + "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in -def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs i64mem:$dst), (ins VR128:$src), +def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "mov{d|q}\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>; @@ -4883,30 +4889,18 @@ let isCodeGenOnly = 1 in { IIC_SSE_MOVDQ>, Sched<[WriteStore]>; } -//===---------------------------------------------------------------------===// -// Patterns and instructions to describe movd/movq to XMM register zero-extends -// -let isCodeGenOnly = 1, SchedRW = [WriteMove] in { -let AddedComplexity = 15 in { -def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "movq\t{$src, $dst|$dst, $src}", // X86-64 only - [(set VR128:$dst, (v2i64 (X86vzmovl - (v2i64 (scalar_to_vector GR64:$src)))))], - IIC_SSE_MOVDQ>, - VEX, VEX_W; -def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src), - "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only - [(set VR128:$dst, (v2i64 (X86vzmovl - (v2i64 (scalar_to_vector GR64:$src)))))], - IIC_SSE_MOVDQ>; -} -} // isCodeGenOnly, SchedRW - let Predicates = [UseAVX] in { - let AddedComplexity = 15 in + let AddedComplexity = 15 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), (VMOVDI2PDIrr GR32:$src)>; + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (VMOV64toPQIrr GR64:$src)>; + + def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, + (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), + (SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>; + } // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part. // These instructions also write zeros in the high part of a 256-bit register. 
let AddedComplexity = 20 in { @@ -4924,16 +4918,16 @@ let Predicates = [UseAVX] in { def : Pat<(v8i32 (X86vzmovl (insert_subvector undef, (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))), (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>; - def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, - (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))), - (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>; } let Predicates = [UseSSE2] in { - let AddedComplexity = 15 in + let AddedComplexity = 15 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))), (MOVDI2PDIrr GR32:$src)>; + def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))), + (MOV64toPQIrr GR64:$src)>; + } let AddedComplexity = 20 in { def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))), (MOVDI2PDIrm addr:$src)>; @@ -4985,12 +4979,12 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in { def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (vector_extract (v2i64 VR128:$src), + [(store (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, VEX; def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (vector_extract (v2i64 VR128:$src), + [(store (i64 (extractelt (v2i64 VR128:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>; } // ExeDomain, SchedRW @@ -5119,7 +5113,7 @@ def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", v4f32, VR128, loadv4f32, f128mem>, VEX; defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", @@ -5134,7 +5128,7 @@ defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128, defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, memopv4f32, f128mem>; -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (X86Movshdup VR128:$src)), (VMOVSHDUPrr VR128:$src)>; def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), @@ -5190,21 +5184,30 @@ def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, - (v4f64 (X86Movddup - (scalar_to_vector (loadf64 addr:$src)))))]>, + (v4f64 (X86Movddup (loadv4f64 addr:$src))))]>, Sched<[WriteLoad]>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX; defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L; } defm MOVDDUP : sse3_replicate_dfp<"movddup">; -let Predicates = [HasAVX] in { + +let Predicates = [HasAVX, NoVLX] in { def : Pat<(X86Movddup (loadv2f64 addr:$src)), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; + + // 256-bit version + def : Pat<(X86Movddup (loadv4i64 addr:$src)), + (VMOVDDUPYrm addr:$src)>; + def : Pat<(X86Movddup (v4i64 VR256:$src)), + (VMOVDDUPYrr VR256:$src)>; +} + +let Predicates = [HasAVX] in { def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))), @@ -5212,16 +5215,6 @@ let Predicates = [HasAVX] in { def : Pat<(X86Movddup (bc_v2f64 (v2i64 
(scalar_to_vector (loadi64 addr:$src))))), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; - - // 256-bit version - def : Pat<(X86Movddup (loadv4f64 addr:$src)), - (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (loadv4i64 addr:$src)), - (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))), - (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (v4i64 VR256:$src)), - (VMOVDDUPYrr VR256:$src)>; } let Predicates = [UseAVX, OptForSize] in { @@ -5791,37 +5784,37 @@ let Predicates = [HasAVX2] in let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in defm PALIGN : ssse3_palignr<"palignr">; -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>; + (VPALIGNR256rr VR256:$src1, VR256:$src2, imm:$imm)>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (VPALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; } let Predicates = [UseSSSE3] in { def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>; + (PALIGNR128rr VR128:$src1, VR128:$src2, imm:$imm)>; } //===---------------------------------------------------------------------===// @@ -6145,7 +6138,7 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { imm:$src2)))), addr:$dst)]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX; defm PEXTRB : SS41I_extract8<0x14, "pextrb">; @@ -6170,7 +6163,7 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { imm:$src2)))), addr:$dst)]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in 
defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX; defm PEXTRW : SS41I_extract16<0x15, "pextrw">; @@ -6194,7 +6187,7 @@ multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> { addr:$dst)]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX; defm PEXTRD : SS41I_extract32<0x16, "pextrd">; @@ -6217,7 +6210,7 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> { addr:$dst)]>, REX_W; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W; defm PEXTRQ : SS41I_extract64<0x16, "pextrq">; @@ -6285,7 +6278,7 @@ multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoBWI] in defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRB : SS41I_insert8<0x20, "pinsrb">; @@ -6311,7 +6304,7 @@ multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V; let Constraints = "$src1 = $dst" in defm PINSRD : SS41I_insert32<0x22, "pinsrd">; @@ -6337,7 +6330,7 @@ multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>; } -let Predicates = [HasAVX] in +let Predicates = [HasAVX, NoDQI] in defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W; let Constraints = "$src1 = $dst" in defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W; @@ -6543,71 +6536,71 @@ let Predicates = [HasAVX] in { let Predicates = [UseAVX] in { def : Pat<(ffloor FR32:$src), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; def : Pat<(f64 (ffloor FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; def : Pat<(f32 (fnearbyint FR32:$src)), (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; def : Pat<(f64 (fnearbyint FR64:$src)), (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; def : Pat<(f32 (fceil FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; def : Pat<(f64 (fceil FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; def : Pat<(f32 (frint FR32:$src)), (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; def : Pat<(f64 (frint FR64:$src)), (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; def : Pat<(f32 (ftrunc FR32:$src)), - (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + (VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; def : Pat<(f64 (ftrunc FR64:$src)), - (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; } let Predicates = [HasAVX] in { def : Pat<(v4f32 (ffloor VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x1))>; + (VROUNDPSr VR128:$src, (i32 0x9))>; def : Pat<(v4f32 (fnearbyint VR128:$src)), (VROUNDPSr VR128:$src, (i32 0xC))>; def : Pat<(v4f32 (fceil VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x2))>; + (VROUNDPSr VR128:$src, (i32 0xA))>; def : Pat<(v4f32 (frint VR128:$src)), (VROUNDPSr VR128:$src, (i32 0x4))>; def : Pat<(v4f32 (ftrunc 
VR128:$src)), - (VROUNDPSr VR128:$src, (i32 0x3))>; + (VROUNDPSr VR128:$src, (i32 0xB))>; def : Pat<(v2f64 (ffloor VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x1))>; + (VROUNDPDr VR128:$src, (i32 0x9))>; def : Pat<(v2f64 (fnearbyint VR128:$src)), (VROUNDPDr VR128:$src, (i32 0xC))>; def : Pat<(v2f64 (fceil VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x2))>; + (VROUNDPDr VR128:$src, (i32 0xA))>; def : Pat<(v2f64 (frint VR128:$src)), (VROUNDPDr VR128:$src, (i32 0x4))>; def : Pat<(v2f64 (ftrunc VR128:$src)), - (VROUNDPDr VR128:$src, (i32 0x3))>; + (VROUNDPDr VR128:$src, (i32 0xB))>; def : Pat<(v8f32 (ffloor VR256:$src)), - (VROUNDYPSr VR256:$src, (i32 0x1))>; + (VROUNDYPSr VR256:$src, (i32 0x9))>; def : Pat<(v8f32 (fnearbyint VR256:$src)), (VROUNDYPSr VR256:$src, (i32 0xC))>; def : Pat<(v8f32 (fceil VR256:$src)), - (VROUNDYPSr VR256:$src, (i32 0x2))>; + (VROUNDYPSr VR256:$src, (i32 0xA))>; def : Pat<(v8f32 (frint VR256:$src)), (VROUNDYPSr VR256:$src, (i32 0x4))>; def : Pat<(v8f32 (ftrunc VR256:$src)), - (VROUNDYPSr VR256:$src, (i32 0x3))>; + (VROUNDYPSr VR256:$src, (i32 0xB))>; def : Pat<(v4f64 (ffloor VR256:$src)), - (VROUNDYPDr VR256:$src, (i32 0x1))>; + (VROUNDYPDr VR256:$src, (i32 0x9))>; def : Pat<(v4f64 (fnearbyint VR256:$src)), (VROUNDYPDr VR256:$src, (i32 0xC))>; def : Pat<(v4f64 (fceil VR256:$src)), - (VROUNDYPDr VR256:$src, (i32 0x2))>; + (VROUNDYPDr VR256:$src, (i32 0xA))>; def : Pat<(v4f64 (frint VR256:$src)), (VROUNDYPDr VR256:$src, (i32 0x4))>; def : Pat<(v4f64 (ftrunc VR256:$src)), - (VROUNDYPDr VR256:$src, (i32 0x3))>; + (VROUNDYPDr VR256:$src, (i32 0xB))>; } defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, @@ -6619,47 +6612,47 @@ defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", let Predicates = [UseSSE41] in { def : Pat<(ffloor FR32:$src), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>; + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>; def : Pat<(f64 (ffloor FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>; + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>; def : Pat<(f32 (fnearbyint FR32:$src)), (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>; def : Pat<(f64 (fnearbyint FR64:$src)), (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>; def : Pat<(f32 (fceil FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>; + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>; def : Pat<(f64 (fceil FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>; + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>; def : Pat<(f32 (frint FR32:$src)), (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>; def : Pat<(f64 (frint FR64:$src)), (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>; def : Pat<(f32 (ftrunc FR32:$src)), - (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>; + (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>; def : Pat<(f64 (ftrunc FR64:$src)), - (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>; + (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; def : Pat<(v4f32 (ffloor VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x1))>; + (ROUNDPSr VR128:$src, (i32 0x9))>; def : Pat<(v4f32 (fnearbyint VR128:$src)), (ROUNDPSr VR128:$src, (i32 0xC))>; def : Pat<(v4f32 (fceil VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x2))>; + (ROUNDPSr VR128:$src, (i32 0xA))>; def : Pat<(v4f32 (frint VR128:$src)), (ROUNDPSr VR128:$src, (i32 0x4))>; def : Pat<(v4f32 (ftrunc VR128:$src)), - (ROUNDPSr VR128:$src, (i32 0x3))>; + (ROUNDPSr VR128:$src, (i32 0xB))>; def : 
Pat<(v2f64 (ffloor VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x1))>; + (ROUNDPDr VR128:$src, (i32 0x9))>; def : Pat<(v2f64 (fnearbyint VR128:$src)), (ROUNDPDr VR128:$src, (i32 0xC))>; def : Pat<(v2f64 (fceil VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x2))>; + (ROUNDPDr VR128:$src, (i32 0xA))>; def : Pat<(v2f64 (frint VR128:$src)), (ROUNDPDr VR128:$src, (i32 0x4))>; def : Pat<(v2f64 (ftrunc VR128:$src)), - (ROUNDPDr VR128:$src, (i32 0x3))>; + (ROUNDPDr VR128:$src, (i32 0xB))>; } //===----------------------------------------------------------------------===// @@ -7815,13 +7808,7 @@ def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), // VBROADCAST - Load from memory and broadcast to all elements of the // destination operand // -class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, - X86MemOperand x86memop, Intrinsic Int, SchedWrite Sched> : - AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX; - -class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC, +class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, ValueType VT, PatFrag ld_frag, SchedWrite Sched> : AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), @@ -7832,38 +7819,33 @@ class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC, } // AVX2 adds register forms -class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, - Intrinsic Int, SchedWrite Sched> : +class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType ResVT, ValueType OpVT, SchedWrite Sched> : AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX; + [(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>, + Sched<[Sched]>, VEX; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128, + def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128, f32mem, v4f32, loadf32, WriteLoad>; - def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256, + def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256, f32mem, v8f32, loadf32, WriteFShuffleLd>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem, +def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem, v4f64, loadf64, WriteFShuffleLd>, VEX_L; -def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, - int_x86_avx_vbroadcastf128_pd_256, - WriteFShuffleLd>, VEX_L; let ExeDomain = SSEPackedSingle in { - def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128, - int_x86_avx2_vbroadcast_ss_ps, - WriteFShuffle>; - def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256, - int_x86_avx2_vbroadcast_ss_ps_256, - WriteFShuffle256>, VEX_L; + def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128, + v4f32, v4f32, WriteFShuffle>; + def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256, + v8f32, v4f32, WriteFShuffle256>, VEX_L; } let ExeDomain = SSEPackedDouble in -def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, - int_x86_avx2_vbroadcast_sd_pd_256, - WriteFShuffle256>, VEX_L; +def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, + v4f64, v2f64, WriteFShuffle256>, VEX_L; 
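The ffloor/fceil/ftrunc patterns earlier in this hunk now pass 0x9/0xA/0xB instead of 0x1/0x2/0x3 to the ROUNDSS/ROUNDPS family. In the roundps/roundss imm8, bits [1:0] select the rounding direction, bit 2 selects the current MXCSR.RC mode, and bit 3 suppresses the precision (inexact) exception, so the new immediates keep the same direction but no longer signal inexact; frint (0x4) and fnearbyint (0xC) are unchanged. A minimal C++ sketch of the same immediates via the SSE4.1 intrinsics (the helper names are illustrative, not part of this patch):

#include <smmintrin.h>  // SSE4.1: _mm_round_ps and the _MM_FROUND_* flags

// roundps/roundss imm8 layout:
//   bits [1:0]  rounding mode (0 nearest, 1 down, 2 up, 3 toward zero)
//   bit 2       use the current MXCSR.RC mode instead of bits [1:0]
//   bit 3       suppress the precision (inexact) exception
__m128 floor_ps(__m128 v) {   // imm = 0x9, same value as the new ffloor pattern
  return _mm_round_ps(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}
__m128 trunc_ps(__m128 v) {   // imm = 0xB, same value as the new ftrunc pattern
  return _mm_round_ps(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}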
let mayLoad = 1, Predicates = [HasAVX2] in def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), @@ -7871,6 +7853,13 @@ def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst), "vbroadcasti128\t{$src, $dst|$dst, $src}", []>, Sched<[WriteLoad]>, VEX, VEX_L; +def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst), + (ins f128mem:$src), + "vbroadcastf128\t{$src, $dst|$dst, $src}", + [(set VR256:$dst, + (int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>, + Sched<[WriteFShuffleLd]>, VEX, VEX_L; + let Predicates = [HasAVX] in def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), (VBROADCASTF128 addr:$src)>; @@ -7891,7 +7880,7 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR imm)), (VINSERTF128rr VR256:$src1, VR128:$src2, @@ -8080,17 +8069,19 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, (bitconvert (i_frag addr:$src2))))]>, VEX_4V, Sched<[WriteFShuffleLd, ReadAfterLd]>; - def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), + let Predicates = [HasAVX, NoVLX] in { + def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffle]>; - def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), + def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst), (ins x86memop_f:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX, Sched<[WriteFShuffleLd]>; + }// Predicates = [HasAVX, NoVLX] } let ExeDomain = SSEPackedSingle in { @@ -8106,7 +8097,7 @@ let ExeDomain = SSEPackedDouble in { loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))), (VPERMILPSYrr VR256:$src1, VR256:$src2)>; def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))), @@ -8245,11 +8236,11 @@ let Predicates = [HasF16C] in { def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), (VCVTPH2PSrm addr:$src)>; - def : Pat<(store (f64 (vector_extract (bc_v2f64 (v8i16 + def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16 (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; - def : Pat<(store (i64 (vector_extract (bc_v2i64 (v8i16 + def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16 (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), addr:$dst), (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; @@ -8309,97 +8300,62 @@ defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32, // multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, X86MemOperand x86memop, PatFrag ld_frag, - Intrinsic Int128, Intrinsic Int256> { - def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + ValueType OpVT128, ValueType OpVT256, Predicate prd> { + let Predicates = [HasAVX2, prd] in { + def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (Int128 VR128:$src))]>, + [(set VR128:$dst, + (OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>, Sched<[WriteShuffle]>, VEX; - 
def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, + (OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>, Sched<[WriteLoad]>, VEX; - def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (Int256 VR128:$src))]>, + [(set VR256:$dst, + (OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>, Sched<[WriteShuffle256]>, VEX, VEX_L; - def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, - (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, + (OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>, Sched<[WriteLoad]>, VEX, VEX_L; + + // Provide aliases for broadcast from the same register class that + // automatically does the extract. + def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))), + (!cast<Instruction>(NAME#"Yrr") + (OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>; + } } defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, - int_x86_avx2_pbroadcastb_128, - int_x86_avx2_pbroadcastb_256>; + v16i8, v32i8, NoVLX_Or_NoBWI>; defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, - int_x86_avx2_pbroadcastw_128, - int_x86_avx2_pbroadcastw_256>; + v8i16, v16i16, NoVLX_Or_NoBWI>; defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, - int_x86_avx2_pbroadcastd_128, - int_x86_avx2_pbroadcastd_256>; + v4i32, v8i32, NoVLX>; defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, - int_x86_avx2_pbroadcastq_128, - int_x86_avx2_pbroadcastq_256>; + v2i64, v4i64, NoVLX>; let Predicates = [HasAVX2] in { - def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))), - (VPBROADCASTBrm addr:$src)>; - def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))), - (VPBROADCASTBYrm addr:$src)>; - def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWrm addr:$src)>; - def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), - (VPBROADCASTWYrm addr:$src)>; - def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDrm addr:$src)>; - def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), - (VPBROADCASTDYrm addr:$src)>; - def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), - (VPBROADCASTQrm addr:$src)>; - def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), - (VPBROADCASTQYrm addr:$src)>; - - def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))), - (VPBROADCASTBrr VR128:$src)>; - def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))), - (VPBROADCASTBYrr VR128:$src)>; - def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))), - (VPBROADCASTWrr VR128:$src)>; - def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))), - (VPBROADCASTWYrr VR128:$src)>; - def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))), - (VPBROADCASTDrr VR128:$src)>; - def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))), - (VPBROADCASTDYrr VR128:$src)>; - def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))), - (VPBROADCASTQrr VR128:$src)>; - def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))), - (VPBROADCASTQYrr VR128:$src)>; - def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))), - (VBROADCASTSSrr VR128:$src)>; - def : Pat<(v8f32 
(X86VBroadcast (v4f32 VR128:$src))), - (VBROADCASTSSYrr VR128:$src)>; - def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))), - (VPBROADCASTQrr VR128:$src)>; - def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))), - (VBROADCASTSDYrr VR128:$src)>; + // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. + // This means we'll encounter truncated i32 loads; match that here. + def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast + (i16 (trunc (i32 (zextloadi16 addr:$src)))))), + (VPBROADCASTWYrm addr:$src)>; // Provide aliases for broadcast from the same register class that // automatically does the extract. - def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))), - (VPBROADCASTBYrr (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), - sub_xmm)))>; - def : Pat<(v16i16 (X86VBroadcast (v16i16 VR256:$src))), - (VPBROADCASTWYrr (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), - sub_xmm)))>; - def : Pat<(v8i32 (X86VBroadcast (v8i32 VR256:$src))), - (VPBROADCASTDYrr (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), - sub_xmm)))>; - def : Pat<(v4i64 (X86VBroadcast (v4i64 VR256:$src))), - (VPBROADCASTQYrr (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), - sub_xmm)))>; def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))), (VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)))>; @@ -8598,7 +8554,7 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; } -let Predicates = [HasAVX2] in { +let Predicates = [HasAVX2, NoVLX] in { def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR imm)), (VINSERTI128rr VR256:$src1, VR128:$src2, @@ -8722,16 +8678,16 @@ defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", int_x86_avx2_maskstore_q, int_x86_avx2_maskstore_q_256>, VEX_W; -def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)), (VMASKMOVPSYmr addr:$ptr, VR256:$mask, VR256:$src)>; -def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v8i32 VR256:$mask), (v8i32 VR256:$src)), (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>; -def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4f32 VR128:$src)), (VMASKMOVPSmr addr:$ptr, VR128:$mask, VR128:$src)>; -def: Pat<(masked_store addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src)), (VPMASKMOVDmr addr:$ptr, VR128:$mask, VR128:$src)>; def: Pat<(v8f32 (masked_load addr:$ptr, (v8i32 VR256:$mask), undef)), @@ -8776,10 +8732,10 @@ def: Pat<(v4i32 (masked_load addr:$ptr, (v4i32 VR128:$mask), (v4i32 VR128:$src0) (VBLENDVPSrr VR128:$src0, (VPMASKMOVDrm VR128:$mask, addr:$ptr), VR128:$mask)>; -def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4f64 VR256:$src)), (VMASKMOVPDYmr addr:$ptr, VR256:$mask, VR256:$src)>; -def: Pat<(masked_store addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), +def: Pat<(X86mstore addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src)), (VPMASKMOVQYmr addr:$ptr, VR256:$mask, 
VR256:$src)>; def: Pat<(v4f64 (masked_load addr:$ptr, (v4i64 VR256:$mask), undef)), @@ -8804,10 +8760,10 @@ def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask), (v4i64 VR256:$src0) (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr), VR256:$mask)>; -def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2f64 VR128:$src)), (VMASKMOVPDmr addr:$ptr, VR128:$mask, VR128:$src)>; -def: Pat<(masked_store addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)), +def: Pat<(X86mstore addr:$ptr, (v2i64 VR128:$mask), (v2i64 VR128:$src)), (VPMASKMOVQmr addr:$ptr, VR128:$mask, VR128:$src)>; def: Pat<(v2f64 (masked_load addr:$ptr, (v2i64 VR128:$mask), undef)), @@ -8865,12 +8821,13 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; } -defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; -defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; -defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; -defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; -defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; - +let Predicates = [HasAVX2, NoVLX] in { + defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; + defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; + defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; + defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; + defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; +} //===----------------------------------------------------------------------===// // VGATHER - GATHER Operations multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, @@ -8905,3 +8862,59 @@ let mayLoad = 1, Constraints defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx32mem, vy32mem>; } } + +//===----------------------------------------------------------------------===// +// Extra selection patterns for FR128, f128, f128mem + +// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2. +def : Pat<(store (f128 FR128:$src), addr:$dst), + (MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>; + +def : Pat<(loadf128 addr:$src), + (COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>; + +// andps is shorter than andpd or pand. 
andps is SSE and andpd/pand are in SSE2 +def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fand FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(and FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86for FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(or FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)), + (COPY_TO_REGCLASS + (XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2), + FR128)>; + +def : Pat<(X86fxor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +def : Pat<(xor FR128:$src1, FR128:$src2), + (COPY_TO_REGCLASS + (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), + (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; diff --git a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td index caecf70..c1df978 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td +++ b/contrib/llvm/lib/Target/X86/X86InstrShiftRotate.td @@ -31,21 +31,21 @@ def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>; } // Uses = [CL] -def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def SHL8ri : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "shl{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; let isConvertibleToThreeAddress = 1 in { // Can transform into LEA. 
-def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def SHL16ri : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "shl{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def SHL32ri : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "shl{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "shl{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -85,19 +85,19 @@ def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), "shl{q}\t{%cl, $dst|$dst, cl}", [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src), +def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src), "shl{b}\t{$src, $dst|$dst, $src}", [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; -def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src), +def SHL16mi : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, u8imm:$src), "shl{w}\t{$src, $dst|$dst, $src}", [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src), +def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src), "shl{l}\t{$src, $dst|$dst, $src}", [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src), +def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src), "shl{q}\t{$src, $dst|$dst, $src}", [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -137,18 +137,18 @@ def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>; } -def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), +def SHR8ri : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$src2), "shr{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def SHR16ri : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "shr{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def SHR32ri : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "shr{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; -def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2), +def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$src2), "shr{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -185,19 +185,19 @@ def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), "shr{q}\t{%cl, $dst|$dst, cl}", [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src), +def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src), "shr{b}\t{$src, $dst|$dst, $src}", [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), 
addr:$dst)], IIC_SR>; -def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src), +def SHR16mi : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, u8imm:$src), "shr{w}\t{$src, $dst|$dst, $src}", [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src), +def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src), "shr{l}\t{$src, $dst|$dst, $src}", [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src), +def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src), "shr{q}\t{$src, $dst|$dst, $src}", [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -241,20 +241,20 @@ def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1), IIC_SR>; } -def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def SAR8ri : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "sar{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def SAR16ri : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "sar{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def SAR32ri : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "sar{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def SAR64ri : RIi8<0xC1, MRM7r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "sar{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -298,19 +298,19 @@ def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src), +def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src), "sar{b}\t{$src, $dst|$dst, $src}", [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; -def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src), +def SAR16mi : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, u8imm:$src), "sar{w}\t{$src, $dst|$dst, $src}", [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src), +def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src), "sar{l}\t{$src, $dst|$dst, $src}", [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src), +def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src), "sar{q}\t{$src, $dst|$dst, $src}", [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -342,7 +342,7 @@ let hasSideEffects = 0 in { let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1), "rcl{b}\t$dst", [], IIC_SR>; -def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), +def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), @@ 
-350,7 +350,7 @@ def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1), def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1), "rcl{w}\t$dst", [], IIC_SR>, OpSize16; -def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), +def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; let Uses = [CL] in def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), @@ -358,7 +358,7 @@ def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1), def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1), "rcl{l}\t$dst", [], IIC_SR>, OpSize32; -def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), +def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; let Uses = [CL] in def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), @@ -367,7 +367,7 @@ def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1), def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1), "rcl{q}\t$dst", [], IIC_SR>; -def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), +def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), @@ -376,7 +376,7 @@ def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1), def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1), "rcr{b}\t$dst", [], IIC_SR>; -def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt), +def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, u8imm:$cnt), "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), @@ -384,7 +384,7 @@ def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1), def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1), "rcr{w}\t$dst", [], IIC_SR>, OpSize16; -def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt), +def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$cnt), "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; let Uses = [CL] in def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), @@ -392,7 +392,7 @@ def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1), def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1), "rcr{l}\t$dst", [], IIC_SR>, OpSize32; -def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt), +def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$cnt), "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; let Uses = [CL] in def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), @@ -400,7 +400,7 @@ def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1), def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1), "rcr{q}\t$dst", [], IIC_SR>; -def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt), +def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, u8imm:$cnt), "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), @@ -411,36 +411,36 @@ def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1), let SchedRW = [WriteShiftLd, WriteRMW] in { def RCL8m1 : I<0xD0, MRM2m, (outs), (ins 
i8mem:$dst), "rcl{b}\t$dst", [], IIC_SR>; -def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, i8imm:$cnt), +def RCL8mi : Ii8<0xC0, MRM2m, (outs), (ins i8mem:$dst, u8imm:$cnt), "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCL16m1 : I<0xD1, MRM2m, (outs), (ins i16mem:$dst), "rcl{w}\t$dst", [], IIC_SR>, OpSize16; -def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, i8imm:$cnt), +def RCL16mi : Ii8<0xC1, MRM2m, (outs), (ins i16mem:$dst, u8imm:$cnt), "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst), "rcl{l}\t$dst", [], IIC_SR>, OpSize32; -def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, i8imm:$cnt), +def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt), "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst), "rcl{q}\t$dst", [], IIC_SR>; -def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt), +def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt), "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst), "rcr{b}\t$dst", [], IIC_SR>; -def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, i8imm:$cnt), +def RCR8mi : Ii8<0xC0, MRM3m, (outs), (ins i8mem:$dst, u8imm:$cnt), "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; def RCR16m1 : I<0xD1, MRM3m, (outs), (ins i16mem:$dst), "rcr{w}\t$dst", [], IIC_SR>, OpSize16; -def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, i8imm:$cnt), +def RCR16mi : Ii8<0xC1, MRM3m, (outs), (ins i16mem:$dst, u8imm:$cnt), "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize16; def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst), "rcr{l}\t$dst", [], IIC_SR>, OpSize32; -def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, i8imm:$cnt), +def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt), "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst), "rcr{q}\t$dst", [], IIC_SR>; -def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt), +def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt), "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; let Uses = [CL] in { @@ -482,19 +482,19 @@ def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>; } -def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "rol{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "rol{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "rol{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "rol{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -537,19 +537,19 @@ def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)], 
IIC_SR>; } -def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src1), +def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1), "rol{b}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi8 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>; -def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src1), +def ROL16mi : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, u8imm:$src1), "rol{w}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi16 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>, OpSize16; -def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src1), +def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1), "rol{l}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi32 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>, OpSize32; -def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src1), +def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1), "rol{q}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)], IIC_SR>; @@ -589,19 +589,19 @@ def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1), [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>; } -def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2), +def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2), "ror{b}\t{$src2, $dst|$dst, $src2}", [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))], IIC_SR>; -def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2), +def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2), "ror{w}\t{$src2, $dst|$dst, $src2}", [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize16; -def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2), +def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2), "ror{l}\t{$src2, $dst|$dst, $src2}", [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))], IIC_SR>, OpSize32; def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst), - (ins GR64:$src1, i8imm:$src2), + (ins GR64:$src1, u8imm:$src2), "ror{q}\t{$src2, $dst|$dst, $src2}", [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))], IIC_SR>; @@ -644,19 +644,19 @@ def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; } -def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src), +def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src), "ror{b}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; -def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src), +def ROR16mi : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, u8imm:$src), "ror{w}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize16; -def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src), +def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src), "ror{l}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>, OpSize32; -def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src), +def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src), "ror{q}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], IIC_SR>; @@ -727,42 +727,42 @@ def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), let isCommutable = 1 in { // These instructions commute to each other. 
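// (SHLD dst, src, cnt produces the same value as SHRD with the operands swapped
// and the count replaced by size-cnt, which is the identity commuting relies on)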
def SHLD16rri8 : Ii8<0xA4, MRMDestReg, (outs GR16:$dst), - (ins GR16:$src1, GR16:$src2, i8imm:$src3), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize16; def SHRD16rri8 : Ii8<0xAC, MRMDestReg, (outs GR16:$dst), - (ins GR16:$src1, GR16:$src2, i8imm:$src3), + (ins GR16:$src1, GR16:$src2, u8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, (i8 imm:$src3)))], IIC_SHD16_REG_IM>, TB, OpSize16; def SHLD32rri8 : Ii8<0xA4, MRMDestReg, (outs GR32:$dst), - (ins GR32:$src1, GR32:$src2, i8imm:$src3), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, (i8 imm:$src3)))], IIC_SHD32_REG_IM>, TB, OpSize32; def SHRD32rri8 : Ii8<0xAC, MRMDestReg, (outs GR32:$dst), - (ins GR32:$src1, GR32:$src2, i8imm:$src3), + (ins GR32:$src1, GR32:$src2, u8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, (i8 imm:$src3)))], IIC_SHD32_REG_IM>, TB, OpSize32; def SHLD64rri8 : RIi8<0xA4, MRMDestReg, (outs GR64:$dst), - (ins GR64:$src1, GR64:$src2, i8imm:$src3), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, (i8 imm:$src3)))], IIC_SHD64_REG_IM>, TB; def SHRD64rri8 : RIi8<0xAC, MRMDestReg, (outs GR64:$dst), - (ins GR64:$src1, GR64:$src2, i8imm:$src3), + (ins GR64:$src1, GR64:$src2, u8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, (i8 imm:$src3)))], IIC_SHD64_REG_IM>, @@ -801,14 +801,14 @@ def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), } def SHLD16mri8 : Ii8<0xA4, MRMDestMem, - (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi16 addr:$dst), GR16:$src2, (i8 imm:$src3)), addr:$dst)], IIC_SHD16_MEM_IM>, TB, OpSize16; def SHRD16mri8 : Ii8<0xAC, MRMDestMem, - (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3), + (outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3), "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, (i8 imm:$src3)), addr:$dst)], @@ -816,14 +816,14 @@ def SHRD16mri8 : Ii8<0xAC, MRMDestMem, TB, OpSize16; def SHLD32mri8 : Ii8<0xA4, MRMDestMem, - (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi32 addr:$dst), GR32:$src2, (i8 imm:$src3)), addr:$dst)], IIC_SHD32_MEM_IM>, TB, OpSize32; def SHRD32mri8 : Ii8<0xAC, MRMDestMem, - (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3), + (outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3), "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, (i8 imm:$src3)), addr:$dst)], @@ -831,14 +831,14 @@ def SHRD32mri8 : Ii8<0xAC, MRMDestMem, TB, OpSize32; def SHLD64mri8 : RIi8<0xA4, MRMDestMem, - (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shld (loadi64 addr:$dst), GR64:$src2, (i8 imm:$src3)), addr:$dst)], IIC_SHD64_MEM_IM>, TB; def SHRD64mri8 : RIi8<0xAC, 
MRMDestMem, - (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3), + (outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3), "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, (i8 imm:$src3)), addr:$dst)], @@ -860,12 +860,12 @@ def ROT64L2R_imm8 : SDNodeXForm<imm, [{ multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> { let hasSideEffects = 0 in { - def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2), + def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, u8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, TAXD, VEX, Sched<[WriteShift]>; let mayLoad = 1 in def mi : Ii8<0xF0, MRMSrcMem, (outs RC:$dst), - (ins x86memop:$src1, i8imm:$src2), + (ins x86memop:$src1, u8imm:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, TAXD, VEX, Sched<[WriteShiftLd]>; } diff --git a/contrib/llvm/lib/Target/X86/X86InstrSystem.td b/contrib/llvm/lib/Target/X86/X86InstrSystem.td index 0350566..85e17f5 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrSystem.td +++ b/contrib/llvm/lib/Target/X86/X86InstrSystem.td @@ -44,7 +44,7 @@ def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", let SchedRW = [WriteSystem] in { -def INT : Ii8<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", +def INT : Ii8<0xcd, RawFrm, (outs), (ins u8imm:$trap), "int\t$trap", [(int_x86_int imm:$trap)], IIC_INT>; @@ -60,12 +60,6 @@ def SYSEXIT : I<0x35, RawFrm, (outs), (ins), "sysexit{l}", [], IIC_SYS_ENTER_EXIT>, TB; def SYSEXIT64 :RI<0x35, RawFrm, (outs), (ins), "sysexit{q}", [], IIC_SYS_ENTER_EXIT>, TB, Requires<[In64BitMode]>; - -def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>, OpSize16; -def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l|d}", [], IIC_IRET>, - OpSize32; -def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>, - Requires<[In64BitMode]>; } // SchedRW def : Pat<(debugtrap), @@ -88,13 +82,13 @@ def IN32rr : I<0xED, RawFrm, (outs), (ins), "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>, OpSize32; let Defs = [AL] in -def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port), +def IN8ri : Ii8<0xE4, RawFrm, (outs), (ins u8imm:$port), "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>; let Defs = [AX] in -def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), +def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize16; let Defs = [EAX] in -def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port), +def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins u8imm:$port), "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>, OpSize32; let Uses = [DX, AL] in @@ -108,13 +102,13 @@ def OUT32rr : I<0xEF, RawFrm, (outs), (ins), "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>, OpSize32; let Uses = [AL] in -def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port), +def OUT8ir : Ii8<0xE6, RawFrm, (outs), (ins u8imm:$port), "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>; let Uses = [AX] in -def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), +def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize16; let Uses = [EAX] in -def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port), +def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins u8imm:$port), "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>, OpSize32; } // SchedRW @@ -478,39 +472,60 @@ def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB; 
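For reference, the X86shld/X86shrd patterns in the hunks above implement the double-precision shift sketched below in C++ (32-bit operands only; the function names are illustrative, not part of this patch):

#include <cstdint>

// shld: shift dst left by cnt, filling the vacated low bits from the high bits of src.
// shrd: shift dst right by cnt, filling the vacated high bits from the low bits of src.
uint32_t shld32(uint32_t dst, uint32_t src, unsigned cnt) {
  cnt &= 31;  // the hardware masks the count mod 32
  return cnt ? (dst << cnt) | (src >> (32 - cnt)) : dst;
}
uint32_t shrd32(uint32_t dst, uint32_t src, unsigned cnt) {
  cnt &= 31;
  return cnt ? (dst >> cnt) | (src << (32 - cnt)) : dst;
}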
//===----------------------------------------------------------------------===// // XSAVE instructions let SchedRW = [WriteSystem] in { +let Predicates = [HasXSAVE] in { let Defs = [EDX, EAX], Uses = [ECX] in def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; let Uses = [EDX, EAX, ECX] in def XSETBV : I<0x01, MRM_D1, (outs), (ins), "xsetbv", []>, TB; +} -let Uses = [RDX, RAX] in { - def XSAVE : I<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), - "xsave\t$dst", []>, TB; - def XSAVE64 : RI<0xAE, MRM4m, (outs opaque512mem:$dst), (ins), - "xsave64\t$dst", []>, TB, Requires<[In64BitMode]>; +let Uses = [EDX, EAX] in { +let Predicates = [HasXSAVE] in { + def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave\t$dst", + [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB; + def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave64\t$dst", + [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), - "xrstor\t$dst", []>, TB; + "xrstor\t$dst", + [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB; def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), - "xrstor64\t$dst", []>, TB, Requires<[In64BitMode]>; - def XSAVEOPT : I<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), - "xsaveopt\t$dst", []>, PS; - def XSAVEOPT64 : RI<0xAE, MRM6m, (outs opaque512mem:$dst), (ins), - "xsaveopt64\t$dst", []>, PS, Requires<[In64BitMode]>; - + "xrstor64\t$dst", + [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVEOPT] in { + def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt\t$dst", + [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, TB; + def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt64\t$dst", + [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVEC] in { + def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec\t$dst", + [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB; + def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec64\t$dst", + [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; +} +let Predicates = [HasXSAVES] in { + def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves\t$dst", + [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB; + def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves64\t$dst", + [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), - "xrstors\t$dst", []>, TB; + "xrstors\t$dst", + [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB; def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), - "xrstors64\t$dst", []>, TB, Requires<[In64BitMode]>; - def XSAVEC : I<0xC7, MRM4m, (outs opaque512mem:$dst), (ins), - "xsavec\t$dst", []>, TB; - def XSAVEC64 : RI<0xC7, MRM4m, (outs opaque512mem:$dst), (ins), - "xsavec64\t$dst", []>, TB, Requires<[In64BitMode]>; - def XSAVES : I<0xC7, MRM5m, (outs opaque512mem:$dst), (ins), - "xsaves\t$dst", []>, TB; - def XSAVES64 : RI<0xC7, MRM5m, (outs opaque512mem:$dst), (ins), - "xsaves64\t$dst", []>, TB, Requires<[In64BitMode]>; + "xrstors64\t$dst", + [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; } +} // Uses } // SchedRW //===----------------------------------------------------------------------===// @@ -534,6 +549,12 @@ let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in { } let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in def 
MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB; +//==-----------------------------------------------------------------------===// +// PKU - enable protection key +let Defs = [EAX, EDX], Uses = [ECX] in + def RDPKRU : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB; +let Uses = [EAX, ECX, EDX] in + def WRPKRU : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB; //===----------------------------------------------------------------------===// // FS/GS Base Instructions diff --git a/contrib/llvm/lib/Target/X86/X86InstrXOP.td b/contrib/llvm/lib/Target/X86/X86InstrXOP.td index 8455b8d..4cb2304 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrXOP.td +++ b/contrib/llvm/lib/Target/X86/X86InstrXOP.td @@ -83,57 +83,64 @@ let ExeDomain = SSEPackedDouble in { defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>; } -multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> { +multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2))]>, XOP_4VOp3; + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2))))]>, + XOP_4VOp3, Sched<[WriteVarVecShift]>; def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2))))]>, - XOP_4V, VEX_W; + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, + XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>; def mr : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int (bitconvert (loadv2i64 addr:$src1)), VR128:$src2))]>, - XOP_4VOp3; + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), + (vt128 VR128:$src2))))]>, + XOP_4VOp3, Sched<[WriteVarVecShift, ReadAfterLd]>; } let ExeDomain = SSEPackedInt in { - defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; - defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; - defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; - defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; - defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; - defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; - defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; - defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; - defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; - defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; - defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; - defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; + defm VPROTB : xop3op<0x90, "vprotb", X86vprot, v16i8>; + defm VPROTD : xop3op<0x92, "vprotd", X86vprot, v4i32>; + defm VPROTQ : xop3op<0x93, "vprotq", X86vprot, v2i64>; + defm VPROTW : xop3op<0x91, "vprotw", X86vprot, v8i16>; + defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>; + defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>; + defm VPSHAW : xop3op<0x99, "vpshaw", X86vpsha, v8i16>; + defm VPSHLB : xop3op<0x94, "vpshlb", X86vpshl, v16i8>; + defm VPSHLD : xop3op<0x96, "vpshld", X86vpshl, v4i32>; + defm VPSHLQ : xop3op<0x97, "vpshlq", X86vpshl, v2i64>; + defm VPSHLW : xop3op<0x95, "vpshlw", 
X86vpshl, v8i16>; } -multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> { +multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128> { def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, i8imm:$src2), + (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, imm:$src2))]>, XOP; + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, XOP; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins i128mem:$src1, i8imm:$src2), + (ins i128mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int (bitconvert (loadv2i64 addr:$src1)), imm:$src2))]>, XOP; + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, XOP; } let ExeDomain = SSEPackedInt in { - defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; - defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; - defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; - defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; + defm VPROTB : xop3opimm<0xC0, "vprotb", X86vproti, v16i8>; + defm VPROTD : xop3opimm<0xC2, "vprotd", X86vproti, v4i32>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vproti, v2i64>; + defm VPROTW : xop3opimm<0xC1, "vprotw", X86vproti, v8i16>; } // Instruction where second source can be memory, but third must be register @@ -170,30 +177,34 @@ let ExeDomain = SSEPackedInt in { } // Instruction where second source can be memory, third must be imm8 -multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> { +multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> { let isCommutable = 1 in def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, XOPCC:$cc), !strconcat("vpcom${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, i8immZExt3:$cc))]>, + [(set VR128:$dst, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), + i8immZExt3:$cc)))]>, XOP_4V; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, XOPCC:$cc), !strconcat("vpcom${cc}", Suffix, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), - i8immZExt3:$cc))]>, XOP_4V; + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (loadv2i64 addr:$src2))), + i8immZExt3:$cc)))]>, + XOP_4V; let isAsmParserOnly = 1, hasSideEffects = 0 in { def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, i8imm:$src3), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), !strconcat("vpcom", Suffix, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, XOP_4V; let mayLoad = 1 in def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, i8imm:$src3), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), !strconcat("vpcom", Suffix, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, XOP_4V; @@ -201,14 +212,14 @@ multiclass xopvpcom<bits<8> opc, string Suffix, Intrinsic Int> { } let ExeDomain = SSEPackedInt in { // SSE integer instructions - defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>; - defm VPCOMW : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>; - defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>; - defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>; - defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>; - defm VPCOMUW : 
xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>; - defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>; - defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>; + defm VPCOMB : xopvpcom<0xCC, "b", X86vpcom, v16i8>; + defm VPCOMW : xopvpcom<0xCD, "w", X86vpcom, v8i16>; + defm VPCOMD : xopvpcom<0xCE, "d", X86vpcom, v4i32>; + defm VPCOMQ : xopvpcom<0xCF, "q", X86vpcom, v2i64>; + defm VPCOMUB : xopvpcom<0xEC, "ub", X86vpcomu, v16i8>; + defm VPCOMUW : xopvpcom<0xED, "uw", X86vpcomu, v8i16>; + defm VPCOMUD : xopvpcom<0xEE, "ud", X86vpcomu, v4i32>; + defm VPCOMUQ : xopvpcom<0xEF, "uq", X86vpcomu, v2i64>; } // Instruction where either second or third source can be memory @@ -270,42 +281,52 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> { let ExeDomain = SSEPackedInt in defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; +let Predicates = [HasXOP] in { + def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1), + (X86andnp VR128:$src3, VR128:$src2))), + (VPCMOVrr VR128:$src1, VR128:$src2, VR128:$src3)>; + + def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1), + (X86andnp VR256:$src3, VR256:$src2))), + (VPCMOVrrY VR256:$src1, VR256:$src2, VR256:$src3)>; +} + multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128, Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3, i8imm:$src4), + (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, VR128:$src3, imm:$src4))]>; def rm : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, f128mem:$src3, i8imm:$src4), + (ins VR128:$src1, VR128:$src2, f128mem:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, (ld_128 addr:$src3), imm:$src4))]>, VEX_W, MemOp4; def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, f128mem:$src2, VR128:$src3, i8imm:$src4), + (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR128:$dst, (Int128 VR128:$src1, (ld_128 addr:$src2), VR128:$src3, imm:$src4))]>; def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, VR256:$src3, i8imm:$src4), + (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L; def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4), + (ins VR256:$src1, VR256:$src2, f256mem:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>, VEX_W, MemOp4, VEX_L; def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4), + (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set VR256:$dst, diff --git a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 2c8b95b..dc6d85d 
100644 --- a/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -18,14 +18,19 @@ namespace llvm { enum IntrinsicType { INTR_NO_TYPE, - GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, - INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, - CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI, - INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, - INTR_TYPE_3OP_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, - VPERM_3OP_MASKZ, - INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, - EXPAND_FROM_MEM, BLEND + GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX, FPCLASS, FPCLASSS, + INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP, INTR_TYPE_4OP, + CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, VSHIFT_MASK, COMI, COMI_RM, + INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, + INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK, + INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK, + FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, VPERM_3OP_MASK, + VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK, + INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM, + COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, + TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, + EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC, + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK }; struct IntrinsicData { @@ -138,6 +143,42 @@ static const IntrinsicData IntrinsicsWithChain[] = { EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8, + X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, 
X86::VPSCATTERDDZmr, 0), X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0), @@ -209,6 +250,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), @@ -241,6 +284,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(avx2_psign_b, INTR_TYPE_2OP, X86ISD::PSIGN, 0), X86_INTRINSIC_DATA(avx2_psign_d, INTR_TYPE_2OP, X86ISD::PSIGN, 0), @@ -274,16 +318,56 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0), + 
X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtsi2sd32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::FP_TO_UINT_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::UINT_TO_FP_RND, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), + X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), - + X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), + X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD, @@ -371,6 +455,50 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_blend_w_128, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_w_256, BLEND, X86ISD::SELECT, 0), X86_INTRINSIC_DATA(avx512_mask_blend_w_512, BLEND, X86ISD::SELECT, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_sd_pd_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcast_ss_ps_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + 
X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, INTR_TYPE_1OP_MASK, + X86ISD::SUBV_BROADCAST, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC, + X86ISD::SHUF128, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0), @@ -388,6 +516,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0), + X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, + X86ISD::FSETCC), + X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCC, + X86ISD::FSETCC), X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0), @@ -415,7 +547,184 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), - + X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK, + X86ISD::CONFLICT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTDQ2PD, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2pd_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), // no rm + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), //er + 
X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK, + X86ISD::VFPROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, INTR_TYPE_1OP_MASK_RM, + ISD::FP_ROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, INTR_TYPE_1OP_MASK_RM, + ISD::FP_ROUND, X86ISD::VFPROUND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK, + ISD::FP_EXTEND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK, + ISD::FP_EXTEND, X86ISD::VFPEXT), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_SINT_RND, X86ISD::FP_TO_SINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK, + X86ISD::FP_TO_UINT_RND, X86ISD::FP_TO_UINT_RND), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, 0), + 
X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK, + ISD::SINT_TO_FP, ISD::SINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VFPROUND, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::VFPEXT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_SINT, ISD::FP_TO_SINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, 0), + X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK, + ISD::FP_TO_UINT, ISD::FP_TO_UINT), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_128, INTR_TYPE_1OP_MASK, + X86ISD::CVTUDQ2PD, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2pd_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), // no rm + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), 
+ X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK, + ISD::UINT_TO_FP, ISD::UINT_TO_FP), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), + X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::DBPSADBW, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_128, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_256, INTR_TYPE_2OP_MASK, ISD::FDIV, 0), X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV, @@ -452,6 +761,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0), + X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM, @@ -464,6 +781,62 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FGETEXP_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM, X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FGETEXP_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM, + X86ISD::VGETMANT, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x4_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf32x8_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x2_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x2_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_insertf64x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x4_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + 
X86_INTRINSIC_DATA(avx512_mask_inserti32x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti32x8_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x2_256, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x2_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_inserti64x4_512, INSERT_SUBVEC, + ISD::INSERT_SUBVECTOR, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), + X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK, + ISD::CTLZ, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, @@ -472,10 +845,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, - X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, - X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, @@ -484,10 +857,32 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, - X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, - X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_movddup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movddup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movddup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVDDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSD, 0), + X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK, + X86ISD::MOVSS, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + 
X86_INTRINSIC_DATA(avx512_mask_movsldup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL, @@ -554,6 +949,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), + X86_INTRINSIC_DATA(avx512_mask_palignr_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::PALIGNR, 0), X86_INTRINSIC_DATA(avx512_mask_pand_d_128, INTR_TYPE_2OP_MASK, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_mask_pand_d_256, INTR_TYPE_2OP_MASK, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_mask_pand_d_512, INTR_TYPE_2OP_MASK, ISD::AND, 0), @@ -596,6 +997,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0), X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDUBSW, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), + X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_128, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_256, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), X86_INTRINSIC_DATA(avx512_mask_pmaxs_b_512, INTR_TYPE_2OP_MASK, ISD::SMAX, 0), @@ -644,6 +1057,114 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_pminu_w_128, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pminu_w_256, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), X86_INTRINSIC_DATA(avx512_mask_pminu_w_512, INTR_TYPE_2OP_MASK, ISD::UMIN, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNC, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + 
X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK, + X86ISD::VTRUNCUS, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_128, INTR_TYPE_2OP_MASK, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(avx512_mask_pmul_dq_256, INTR_TYPE_2OP_MASK, @@ -700,6 +1221,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_psrav_q, INTR_TYPE_2OP_MASK, ISD::SRA, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_d, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), X86_INTRINSIC_DATA(avx512_mask_psrl_q, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_128, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_256, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_w_512, INTR_TYPE_2OP_MASK, X86ISD::VSRL, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_128, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_256, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), + X86_INTRINSIC_DATA(avx512_mask_psrl_wi_512, INTR_TYPE_2OP_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrli_d, VSHIFT_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrli_q, VSHIFT_MASK, X86ISD::VSRLI, 0), X86_INTRINSIC_DATA(avx512_mask_psrlv_d, INTR_TYPE_2OP_MASK, ISD::SRL, 0), @@ -728,16 +1255,98 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhb_w_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhqd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + 
X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckhw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklb_w_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpckld_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklqd_q_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_punpcklw_d_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_d_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_d_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_d_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_q_128, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_q_256, INTR_TYPE_2OP_MASK, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_pxor_q_512, INTR_TYPE_2OP_MASK, ISD::XOR, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_RM, 
X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::RNDSCALE, 0), + X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::RNDSCALE, 0), + X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::SCALEF, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM, @@ -750,6 +1359,38 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::SCALEF, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::SCALEF, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUF128, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_pd_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), + X86_INTRINSIC_DATA(avx512_mask_shuf_ps_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::SHUFP, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, @@ -758,6 +1399,10 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK_RM, ISD::FSQRT, X86ISD::FSQRT_RND), + X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FSQRT_RND, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_128, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_256, INTR_TYPE_2OP_MASK, ISD::FSUB, 0), X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB, @@ -782,9 +1427,54 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0), X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0), - 
X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0), - X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0), - + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckh_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKH, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_unpckl_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::UNPCKL, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_128, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_256, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_IMM8_MASK, + X86ISD::VALIGN, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM, + ISD::FP16_TO_FP, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK_RM, + ISD::FP_TO_FP16, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD, @@ -821,7 +1511,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB, X86ISD::FNMSUB_RND), - X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK, X86ISD::VPERMIV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK, @@ -852,54 +1541,56 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPERMIV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK, X86ISD::VPERMIV3, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_128, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_256, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_pd_512, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_128, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + 
X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_256, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermil_ps_512, INTR_TYPE_2OP_IMM8_MASK, + X86ISD::VPERMILPI, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_128, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_256, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_128, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_256, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpermilvar_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK, - X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK, @@ -910,7 +1601,18 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_xor_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask_xor_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), X86_INTRINSIC_DATA(avx512_mask_xor_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0), - + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ, + 
X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), + X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ, + X86ISD::VPTERNLOG, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, X86ISD::FMADD, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, X86ISD::FMADD, @@ -959,14 +1661,59 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastb_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastd_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastq_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_128, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_256, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_pbroadcastw_512, INTR_TYPE_1OP_MASK, + X86ISD::VBROADCAST, 0), + X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0), + X86_INTRINSIC_DATA(avx512_psll_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSHLDQ, 0), + X86_INTRINSIC_DATA(avx512_psrl_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSRLDQ, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + 
X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0), + X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), + X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI), X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0), @@ -1017,6 +1764,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), @@ -1024,6 +1773,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0), + X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(sse2_pshuf_d, INTR_TYPE_2OP, X86ISD::PSHUFD, 0), X86_INTRINSIC_DATA(sse2_pshufh_w, INTR_TYPE_2OP, X86ISD::PSHUFHW, 0), X86_INTRINSIC_DATA(sse2_pshufl_w, INTR_TYPE_2OP, X86ISD::PSHUFLW, 0), @@ -1066,12 +1816,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0), X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxbw, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxdq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxwd, INTR_TYPE_1OP, X86ISD::VSEXT, 0), - X86_INTRINSIC_DATA(sse41_pmovsxwq, INTR_TYPE_1OP, X86ISD::VSEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbd, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbq, INTR_TYPE_1OP, X86ISD::VZEXT, 0), X86_INTRINSIC_DATA(sse41_pmovzxbw, INTR_TYPE_1OP, X86ISD::VZEXT, 0), @@ -1105,7 +1849,31 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), X86_INTRINSIC_DATA(ssse3_psign_b_128, 
INTR_TYPE_2OP, X86ISD::PSIGN, 0), X86_INTRINSIC_DATA(ssse3_psign_d_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), - X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0) + X86_INTRINSIC_DATA(ssse3_psign_w_128, INTR_TYPE_2OP, X86ISD::PSIGN, 0), + X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), + X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, X86ISD::VPROT, 0), + X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0), + X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0), + X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0) }; /* @@ -1128,6 +1896,102 @@ static void verifyIntrinsicTables() { std::is_sorted(std::begin(IntrinsicsWithChain), std::end(IntrinsicsWithChain)) && "Intrinsic data tables should be sorted by Intrinsic ID"); + assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain), + std::end(IntrinsicsWithoutChain)) == + std::end(IntrinsicsWithoutChain)) && + (std::adjacent_find(std::begin(IntrinsicsWithChain), + std::end(IntrinsicsWithChain)) == + std::end(IntrinsicsWithChain)) && + "Intrinsic data tables should have unique entries"); +} + +// X86 specific compare constants. 
+// They must be kept in synch with avxintrin.h +#define _X86_CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ +#define _X86_CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ +#define _X86_CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ +#define _X86_CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ +#define _X86_CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ +#define _X86_CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ +#define _X86_CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ +#define _X86_CMP_ORD_Q 0x07 /* Ordered (nonsignaling) */ +#define _X86_CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ +#define _X86_CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unord, signaling) */ +#define _X86_CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ +#define _X86_CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ +#define _X86_CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ +#define _X86_CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ +#define _X86_CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ +#define _X86_CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ +#define _X86_CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ +#define _X86_CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ +#define _X86_CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_UNORD_S 0x13 /* Unordered (signaling) */ +#define _X86_CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ +#define _X86_CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ +#define _X86_CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unord, non-signaling) */ +#define _X86_CMP_ORD_S 0x17 /* Ordered (signaling) */ +#define _X86_CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ +#define _X86_CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unord, non-sign) */ +#define _X86_CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ +#define _X86_CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ +#define _X86_CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ +#define _X86_CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ +#define _X86_CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ +#define _X86_CMP_TRUE_US 0x1f /* True (unordered, signaling) */ + +/* +* Get comparison modifier from _mm_comi_round_sd/ss intrinsic +* Return tuple <isOrdered, X86 condcode> +*/ +static std::tuple<bool,unsigned> TranslateX86ConstCondToX86CC(SDValue &imm) { + ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(imm); + unsigned IntImm = CImm->getZExtValue(); + // On a floating point condition, the flags are set as follows: + // ZF PF CF op + // 0 | 0 | 0 | X > Y + // 0 | 0 | 1 | X < Y + // 1 | 0 | 0 | X == Y + // 1 | 1 | 1 | unordered + switch (IntImm) { + default: llvm_unreachable("Invalid floating point compare value for Comi!"); + case _X86_CMP_EQ_OQ: // 0x00 - Equal (ordered, nonsignaling) + case _X86_CMP_EQ_OS: // 0x10 - Equal (ordered, signaling) + return std::make_tuple(true, X86::COND_E); + case _X86_CMP_EQ_UQ: // 0x08 - Equal (unordered, non-signaling) + case _X86_CMP_EQ_US: // 0x18 - Equal (unordered, signaling) + return std::make_tuple(false , X86::COND_E); + case _X86_CMP_LT_OS: // 0x01 - Less-than (ordered, signaling) + case _X86_CMP_LT_OQ: // 0x11 - Less-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_B); + case _X86_CMP_NGE_US: // 0x09 - Not-greater-than-or-equal (unordered, signaling) + case _X86_CMP_NGE_UQ: // 0x19 - Not-greater-than-or-equal 
(unordered, nonsignaling) + return std::make_tuple(false , X86::COND_B); + case _X86_CMP_LE_OS: // 0x02 - Less-than-or-equal (ordered, signaling) + case _X86_CMP_LE_OQ: // 0x12 - Less-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_BE); + case _X86_CMP_NGT_US: // 0x0A - Not-greater-than (unordered, signaling) + case _X86_CMP_NGT_UQ: // 0x1A - Not-greater-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_BE); + case _X86_CMP_GT_OS: // 0x0E - Greater-than (ordered, signaling) + case _X86_CMP_GT_OQ: // 0x1E - Greater-than (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_A); + case _X86_CMP_NLE_US: // 0x06 - Not-less-than-or-equal (unordered,signaling) + case _X86_CMP_NLE_UQ: // 0x16 - Not-less-than-or-equal (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_A); + case _X86_CMP_GE_OS: // 0x0D - Greater-than-or-equal (ordered, signaling) + case _X86_CMP_GE_OQ: // 0x1D - Greater-than-or-equal (ordered, nonsignaling) + return std::make_tuple(true, X86::COND_AE); + case _X86_CMP_NLT_US: // 0x05 - Not-less-than (unordered, signaling) + case _X86_CMP_NLT_UQ: // 0x15 - Not-less-than (unordered, nonsignaling) + return std::make_tuple(false, X86::COND_AE); + case _X86_CMP_NEQ_OQ: // 0x0C - Not-equal (ordered, non-signaling) + case _X86_CMP_NEQ_OS: // 0x1C - Not-equal (ordered, signaling) + return std::make_tuple(true, X86::COND_NE); + case _X86_CMP_NEQ_UQ: // 0x04 - Not-equal (unordered, nonsignaling) + case _X86_CMP_NEQ_US: // 0x14 - Not-equal (unordered, signaling) + return std::make_tuple(false, X86::COND_NE); + } } } // End llvm namespace diff --git a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp index 3415ced..e186f70 100644 --- a/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -92,7 +92,6 @@ namespace llvm { SmallVector<MCFixup, 4> Fixups; raw_svector_ostream VecOS(Code); CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI); - VecOS.flush(); CurrentShadowSize += Code.size(); if (CurrentShadowSize >= RequiredShadowSize) InShadow = false; // The shadow is big enough. Stop counting. @@ -128,7 +127,7 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const { /// operand to an MCSymbol. MCSymbol *X86MCInstLower:: GetSymbolFromOperand(const MachineOperand &MO) const { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = MF.getDataLayout(); assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference"); MCSymbol *Sym = nullptr; @@ -151,7 +150,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { } if (!Suffix.empty()) - Name += DL->getPrivateGlobalPrefix(); + Name += DL.getPrivateGlobalPrefix(); unsigned PrefixLen = Name.size(); @@ -159,7 +158,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const { const GlobalValue *GV = MO.getGlobal(); AsmPrinter.getNameWithPrefix(Name, GV); } else if (MO.isSymbol()) { - Mangler::getNameWithPrefix(Name, MO.getSymbolName(), *DL); + Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL); } else if (MO.isMBB()) { assert(Suffix.empty()); Sym = MO.getMBB()->getSymbol(); @@ -461,6 +460,7 @@ ReSimplify: // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B // if one of the registers is extended, but other isn't. 
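// For instance (an illustrative case, not drawn from the change itself):
// "vmovaps %xmm8, %xmm0" places the extended source register in ModRM.rm,
// which requires VEX.B and therefore the 3-byte VEX prefix; commuting to the
// _REV (store-form) opcode moves %xmm8 into ModRM.reg, where VEX.R suffices
// and the 2-byte VEX prefix can be used. The newly added
// VMOVZPQILo2PQIrr -> VMOVPQI2QIrr entry below follows the same pattern.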
+ case X86::VMOVZPQILo2PQIrr: case X86::VMOVAPDrr: case X86::VMOVAPDYrr: case X86::VMOVAPSrr: @@ -478,18 +478,19 @@ ReSimplify: unsigned NewOpc; switch (OutMI.getOpcode()) { default: llvm_unreachable("Invalid opcode"); - case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; - case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; - case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; - case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; - case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; - case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; - case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; - case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; - case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; - case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; - case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; - case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; + case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break; + case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break; + case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break; + case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break; + case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break; + case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break; + case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break; + case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break; + case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break; + case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break; + case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break; + case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break; + case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break; } OutMI.setOpcode(NewOpc); } @@ -532,6 +533,23 @@ ReSimplify: break; } + case X86::CLEANUPRET: { + // Replace CATCHRET with the appropriate RET. + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget())); + break; + } + + case X86::CATCHRET: { + // Replace CATCHRET with the appropriate RET. + const X86Subtarget &Subtarget = AsmPrinter.getSubtarget(); + unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX; + OutMI = MCInst(); + OutMI.setOpcode(getRetOpcode(Subtarget)); + OutMI.addOperand(MCOperand::createReg(ReturnReg)); + break; + } + // TAILJMPd, TAILJMPd64 - Lower to the correct jump instructions. 
case X86::TAILJMPr: case X86::TAILJMPd: @@ -598,17 +616,29 @@ ReSimplify: case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify; case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify; case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify; + case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify; case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify; + case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify; case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify; + case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify; case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify; + case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify; case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify; + case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify; case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify; + case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify; case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify; + case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify; case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify; + case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify; case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify; + case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify; case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify; + case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify; case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify; + case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify; case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify; + case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify; case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify; case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify; case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify; @@ -875,7 +905,10 @@ void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI, MCInst LoadMI; LoadMI.setOpcode(LoadOpcode); - LoadMI.addOperand(MCOperand::createReg(LoadDefRegister)); + + if (LoadDefRegister != X86::NoRegister) + LoadMI.addOperand(MCOperand::createReg(LoadDefRegister)); + for (auto I = MI.operands_begin() + LoadOperandsBeginIdx, E = MI.operands_end(); I != E; ++I) @@ -1062,6 +1095,18 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86ATTInstPrinter::getRegisterName(Reg)); break; } + case X86::CLEANUPRET: { + // Lower these as normal, but add some comments. + OutStreamer->AddComment("CLEANUPRET"); + break; + } + + case X86::CATCHRET: { + // Lower these as normal, but add some comments. + OutStreamer->AddComment("CATCHRET"); + break; + } + case X86::TAILJMPr: case X86::TAILJMPm: case X86::TAILJMPd: @@ -1095,12 +1140,30 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32) .addExpr(MCSymbolRefExpr::create(PICBase, OutContext))); + const X86FrameLowering* FrameLowering = + MF->getSubtarget<X86Subtarget>().getFrameLowering(); + bool hasFP = FrameLowering->hasFP(*MF); + + // TODO: This is needed only if we require precise CFA. 
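// A sketch of the reasoning behind the CFI adjustment added below: the
// "call <pic_label>; pic_label: pop %reg" sequence emitted above temporarily
// pushes a return address. When no frame pointer is in use the CFA is tracked
// relative to SP, so while a DWARF frame is active the CFA offset is bumped
// by one slot before the label and restored right after the pop.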
+ bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() && + !OutStreamer->getDwarfFrameInfos().back().End; + + int stackGrowth = -RI->getSlotSize(); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth); + } + // Emit the label. OutStreamer->EmitLabel(PICBase); // popl $reg EmitAndCountInstruction(MCInstBuilder(X86::POP32r) .addReg(MI->getOperand(0).getReg())); + + if (HasActiveDwarfFrame && !hasFP) { + OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth); + } return; } @@ -1206,19 +1269,48 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } - // Lower PSHUFB and VPERMILP normally but add a comment if we can find - // a constant shuffle mask. We won't be able to do this at the MC layer - // because the mask isn't an immediate. + // Lower PSHUFB and VPERMILP normally but add a comment if we can find + // a constant shuffle mask. We won't be able to do this at the MC layer + // because the mask isn't an immediate. case X86::PSHUFBrm: case X86::VPSHUFBrm: - case X86::VPSHUFBYrm: { + case X86::VPSHUFBYrm: + case X86::VPSHUFBZ128rm: + case X86::VPSHUFBZ128rmk: + case X86::VPSHUFBZ128rmkz: + case X86::VPSHUFBZ256rm: + case X86::VPSHUFBZ256rmk: + case X86::VPSHUFBZ256rmkz: + case X86::VPSHUFBZrm: + case X86::VPSHUFBZrmk: + case X86::VPSHUFBZrmkz: { if (!OutStreamer->isVerboseAsm()) break; - assert(MI->getNumOperands() > 5 && - "We should always have at least 5 operands!"); + unsigned SrcIdx, MaskIdx; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::PSHUFBrm: + case X86::VPSHUFBrm: + case X86::VPSHUFBYrm: + case X86::VPSHUFBZ128rm: + case X86::VPSHUFBZ256rm: + case X86::VPSHUFBZrm: + SrcIdx = 1; MaskIdx = 5; break; + case X86::VPSHUFBZ128rmkz: + case X86::VPSHUFBZ256rmkz: + case X86::VPSHUFBZrmkz: + SrcIdx = 2; MaskIdx = 6; break; + case X86::VPSHUFBZ128rmk: + case X86::VPSHUFBZ256rmk: + case X86::VPSHUFBZrmk: + SrcIdx = 3; MaskIdx = 7; break; + } + + assert(MI->getNumOperands() >= 6 && + "We should always have at least 6 operands!"); const MachineOperand &DstOp = MI->getOperand(0); - const MachineOperand &SrcOp = MI->getOperand(1); - const MachineOperand &MaskOp = MI->getOperand(5); + const MachineOperand &SrcOp = MI->getOperand(SrcIdx); + const MachineOperand &MaskOp = MI->getOperand(MaskIdx); if (auto *C = getConstantFromPool(*MI, MaskOp)) { SmallVector<int, 16> Mask; @@ -1240,35 +1332,53 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { const MachineOperand &SrcOp = MI->getOperand(1); const MachineOperand &MaskOp = MI->getOperand(5); + unsigned ElSize; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VPERMILPSrm: case X86::VPERMILPSYrm: ElSize = 32; break; + case X86::VPERMILPDrm: case X86::VPERMILPDYrm: ElSize = 64; break; + } + if (auto *C = getConstantFromPool(*MI, MaskOp)) { SmallVector<int, 16> Mask; - DecodeVPERMILPMask(C, Mask); + DecodeVPERMILPMask(C, ElSize, Mask); if (!Mask.empty()) OutStreamer->AddComment(getShuffleComment(DstOp, SrcOp, Mask)); } break; } - // For loads from a constant pool to a vector register, print the constant - // loaded. 
- case X86::MOVAPDrm: - case X86::VMOVAPDrm: - case X86::VMOVAPDYrm: - case X86::MOVUPDrm: - case X86::VMOVUPDrm: - case X86::VMOVUPDYrm: - case X86::MOVAPSrm: - case X86::VMOVAPSrm: - case X86::VMOVAPSYrm: - case X86::MOVUPSrm: - case X86::VMOVUPSrm: - case X86::VMOVUPSYrm: - case X86::MOVDQArm: - case X86::VMOVDQArm: - case X86::VMOVDQAYrm: - case X86::MOVDQUrm: - case X86::VMOVDQUrm: - case X86::VMOVDQUYrm: +#define MOV_CASE(Prefix, Suffix) \ + case X86::Prefix##MOVAPD##Suffix##rm: \ + case X86::Prefix##MOVAPS##Suffix##rm: \ + case X86::Prefix##MOVUPD##Suffix##rm: \ + case X86::Prefix##MOVUPS##Suffix##rm: \ + case X86::Prefix##MOVDQA##Suffix##rm: \ + case X86::Prefix##MOVDQU##Suffix##rm: + +#define MOV_AVX512_CASE(Suffix) \ + case X86::VMOVDQA64##Suffix##rm: \ + case X86::VMOVDQA32##Suffix##rm: \ + case X86::VMOVDQU64##Suffix##rm: \ + case X86::VMOVDQU32##Suffix##rm: \ + case X86::VMOVDQU16##Suffix##rm: \ + case X86::VMOVDQU8##Suffix##rm: \ + case X86::VMOVAPS##Suffix##rm: \ + case X86::VMOVAPD##Suffix##rm: \ + case X86::VMOVUPS##Suffix##rm: \ + case X86::VMOVUPD##Suffix##rm: + +#define CASE_ALL_MOV_RM() \ + MOV_CASE(, ) /* SSE */ \ + MOV_CASE(V, ) /* AVX-128 */ \ + MOV_CASE(V, Y) /* AVX-256 */ \ + MOV_AVX512_CASE(Z) \ + MOV_AVX512_CASE(Z256) \ + MOV_AVX512_CASE(Z128) + + // For loads from a constant pool to a vector register, print the constant + // loaded. + CASE_ALL_MOV_RM() if (!OutStreamer->isVerboseAsm()) break; if (MI->getNumOperands() > 4) @@ -1302,7 +1412,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { if (isa<UndefValue>(COp)) { CS << "u"; } else if (auto *CI = dyn_cast<ConstantInt>(COp)) { - CS << CI->getZExtValue(); + if (CI->getBitWidth() <= 64) { + CS << CI->getZExtValue(); + } else { + // print multi-word constant as (w0,w1) + auto Val = CI->getValue(); + CS << "("; + for (int i = 0, N = Val.getNumWords(); i < N; ++i) { + if (i > 0) + CS << ","; + CS << Val.getRawData()[i]; + } + CS << ")"; + } } else if (auto *CF = dyn_cast<ConstantFP>(COp)) { SmallString<32> Str; CF->getValueAPF().toString(Str); diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp index ac2cdc8c..c9e636f 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- X86MachineFuctionInfo.cpp - X86 machine function info -------------===// +//===-- X86MachineFunctionInfo.cpp - X86 machine function info ------------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h index e6db970..3a7a98d 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===-- X86MachineFuctionInfo.h - X86 machine function info -----*- C++ -*-===// +//===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -84,8 +84,8 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// of pushes to pass function parameters. bool HasPushSequences = false; - /// True if the function uses llvm.x86.seh.restoreframe, and it needed a spill - /// slot for the frame pointer. + /// True if the function recovers from an SEH exception, and therefore needs + /// to spill and restore the frame pointer. 
bool HasSEHFramePtrSave = false; /// The frame index of a stack object containing the original frame pointer @@ -100,7 +100,7 @@ private: public: X86MachineFunctionInfo() = default; - explicit X86MachineFunctionInfo(MachineFunction &MF) {}; + explicit X86MachineFunctionInfo(MachineFunction &MF) {} bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } diff --git a/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp new file mode 100644 index 0000000..58020d9 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86OptimizeLEAs.cpp @@ -0,0 +1,326 @@ +//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the pass that performs some optimizations with LEA +// instructions in order to improve code size. +// Currently, it does one thing: +// 1) Address calculations in load and store instructions are replaced by +// existing LEA def registers where possible. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-optimize-LEAs" + +static cl::opt<bool> EnableX86LEAOpt("enable-x86-lea-opt", cl::Hidden, + cl::desc("X86: Enable LEA optimizations."), + cl::init(false)); + +STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions"); + +namespace { +class OptimizeLEAPass : public MachineFunctionPass { +public: + OptimizeLEAPass() : MachineFunctionPass(ID) {} + + const char *getPassName() const override { return "X86 LEA Optimize"; } + + /// \brief Loop over all of the basic blocks, replacing address + /// calculations in load and store instructions, if it's already + /// been calculated by LEA. Also, remove redundant LEAs. + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + /// \brief Returns a distance between two instructions inside one basic block. + /// Negative result means, that instructions occur in reverse order. + int calcInstrDist(const MachineInstr &First, const MachineInstr &Last); + + /// \brief Choose the best \p LEA instruction from the \p List to replace + /// address calculation in \p MI instruction. Return the address displacement + /// and the distance between \p MI and the choosen \p LEA in \p AddrDispShift + /// and \p Dist. + bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist); + + /// \brief Returns true if two machine operand are identical and they are not + /// physical registers. + bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2); + + /// \brief Returns true if the instruction is LEA. 
+ bool isLEA(const MachineInstr &MI); + + /// \brief Returns true if two instructions have memory operands that only + /// differ by displacement. The numbers of the first memory operands for both + /// instructions are specified through \p N1 and \p N2. The address + /// displacement is returned through AddrDispShift. + bool isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift); + + /// \brief Find all LEA instructions in the basic block. + void findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineInstr *> &List); + + /// \brief Removes redundant address calculations. + bool removeRedundantAddrCalc(const SmallVectorImpl<MachineInstr *> &List); + + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + const X86RegisterInfo *TRI; + + static char ID; +}; +char OptimizeLEAPass::ID = 0; +} + +FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } + +int OptimizeLEAPass::calcInstrDist(const MachineInstr &First, + const MachineInstr &Last) { + const MachineBasicBlock *MBB = First.getParent(); + + // Both instructions must be in the same basic block. + assert(Last.getParent() == MBB && + "Instructions are in different basic blocks"); + + return std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&Last)) - + std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&First)); +} + +// Find the best LEA instruction in the List to replace address recalculation in +// MI. Such LEA must meet these requirements: +// 1) The address calculated by the LEA differs only by the displacement from +// the address used in MI. +// 2) The register class of the definition of the LEA is compatible with the +// register class of the address base register of MI. +// 3) Displacement of the new memory operand should fit in 1 byte if possible. +// 4) The LEA should be as close to MI as possible, and prior to it if +// possible. +bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List, + const MachineInstr &MI, MachineInstr *&LEA, + int64_t &AddrDispShift, int &Dist) { + const MachineFunction *MF = MI.getParent()->getParent(); + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) + + X86II::getOperandBias(Desc); + + LEA = nullptr; + + // Loop over all LEA instructions. + for (auto DefMI : List) { + int64_t AddrDispShiftTemp = 0; + + // Compare instructions memory operands. + if (!isSimilarMemOp(MI, MemOpNo, *DefMI, 1, AddrDispShiftTemp)) + continue; + + // Make sure address displacement fits 4 bytes. + if (!isInt<32>(AddrDispShiftTemp)) + continue; + + // Check that LEA def register can be used as MI address base. Some + // instructions can use a limited set of registers as address base, for + // example MOV8mr_NOREX. We could constrain the register class of the LEA + // def to suit MI, however since this case is very rare and hard to + // reproduce in a test it's just more reliable to skip the LEA. + if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) != + MRI->getRegClass(DefMI->getOperand(0).getReg())) + continue; + + // Choose the closest LEA instruction from the list, prior to MI if + // possible. Note that we took into account resulting address displacement + // as well. Also note that the list is sorted by the order in which the LEAs + // occur, so the break condition is pretty simple. 
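// A candidate that survives the checks above might look like this
// (illustrative, with hypothetical registers):
//   lea 4(%rbx,%rsi,4), %rcx      ; rcx = rbx + 4*rsi + 4
//   mov 8(%rbx,%rsi,4), %eax      ; same base/scale/index, only disp differs
// Here AddrDispShiftTemp is 8 - 4 = 4, and removeRedundantAddrCalc() can
// later rewrite the load as "mov 4(%rcx), %eax".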
+ int DistTemp = calcInstrDist(*DefMI, MI); + assert(DistTemp != 0 && + "The distance between two different instructions cannot be zero"); + if (DistTemp > 0 || LEA == nullptr) { + // Do not update return LEA, if the current one provides a displacement + // which fits in 1 byte, while the new candidate does not. + if (LEA != nullptr && !isInt<8>(AddrDispShiftTemp) && + isInt<8>(AddrDispShift)) + continue; + + LEA = DefMI; + AddrDispShift = AddrDispShiftTemp; + Dist = DistTemp; + } + + // FIXME: Maybe we should not always stop at the first LEA after MI. + if (DistTemp < 0) + break; + } + + return LEA != nullptr; +} + +bool OptimizeLEAPass::isIdenticalOp(const MachineOperand &MO1, + const MachineOperand &MO2) { + return MO1.isIdenticalTo(MO2) && + (!MO1.isReg() || + !TargetRegisterInfo::isPhysicalRegister(MO1.getReg())); +} + +bool OptimizeLEAPass::isLEA(const MachineInstr &MI) { + unsigned Opcode = MI.getOpcode(); + return Opcode == X86::LEA16r || Opcode == X86::LEA32r || + Opcode == X86::LEA64r || Opcode == X86::LEA64_32r; +} + +// Check if MI1 and MI2 have memory operands which represent addresses that +// differ only by displacement. +bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1, + const MachineInstr &MI2, unsigned N2, + int64_t &AddrDispShift) { + // Address base, scale, index and segment operands must be identical. + static const int IdenticalOpNums[] = {X86::AddrBaseReg, X86::AddrScaleAmt, + X86::AddrIndexReg, X86::AddrSegmentReg}; + for (auto &N : IdenticalOpNums) + if (!isIdenticalOp(MI1.getOperand(N1 + N), MI2.getOperand(N2 + N))) + return false; + + // Address displacement operands may differ by a constant. + const MachineOperand *Op1 = &MI1.getOperand(N1 + X86::AddrDisp); + const MachineOperand *Op2 = &MI2.getOperand(N2 + X86::AddrDisp); + if (!isIdenticalOp(*Op1, *Op2)) { + if (Op1->isImm() && Op2->isImm()) + AddrDispShift = Op1->getImm() - Op2->getImm(); + else if (Op1->isGlobal() && Op2->isGlobal() && + Op1->getGlobal() == Op2->getGlobal()) + AddrDispShift = Op1->getOffset() - Op2->getOffset(); + else + return false; + } + + return true; +} + +void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB, + SmallVectorImpl<MachineInstr *> &List) { + for (auto &MI : MBB) { + if (isLEA(MI)) + List.push_back(const_cast<MachineInstr *>(&MI)); + } +} + +// Try to find load and store instructions which recalculate addresses already +// calculated by some LEA and replace their memory operands with its def +// register. +bool OptimizeLEAPass::removeRedundantAddrCalc( + const SmallVectorImpl<MachineInstr *> &List) { + bool Changed = false; + + assert(List.size() > 0); + MachineBasicBlock *MBB = List[0]->getParent(); + + // Process all instructions in basic block. + for (auto I = MBB->begin(), E = MBB->end(); I != E;) { + MachineInstr &MI = *I++; + unsigned Opcode = MI.getOpcode(); + + // Instruction must be load or store. + if (!MI.mayLoadOrStore()) + continue; + + // Get the number of the first memory operand. + const MCInstrDesc &Desc = MI.getDesc(); + int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, Opcode); + + // If instruction has no memory operand - skip it. + if (MemOpNo < 0) + continue; + + MemOpNo += X86II::getOperandBias(Desc); + + // Get the best LEA instruction to replace address calculation. + MachineInstr *DefMI; + int64_t AddrDispShift; + int Dist; + if (!chooseBestLEA(List, MI, DefMI, AddrDispShift, Dist)) + continue; + + // If LEA occurs before current instruction, we can freely replace + // the instruction. 
If LEA occurs after, we can lift LEA above the + // instruction and this way to be able to replace it. Since LEA and the + // instruction have similar memory operands (thus, the same def + // instructions for these operands), we can always do that, without + // worries of using registers before their defs. + if (Dist < 0) { + DefMI->removeFromParent(); + MBB->insert(MachineBasicBlock::iterator(&MI), DefMI); + } + + // Since we can possibly extend register lifetime, clear kill flags. + MRI->clearKillFlags(DefMI->getOperand(0).getReg()); + + ++NumSubstLEAs; + DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump();); + + // Change instruction operands. + MI.getOperand(MemOpNo + X86::AddrBaseReg) + .ChangeToRegister(DefMI->getOperand(0).getReg(), false); + MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1); + MI.getOperand(MemOpNo + X86::AddrIndexReg) + .ChangeToRegister(X86::NoRegister, false); + MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift); + MI.getOperand(MemOpNo + X86::AddrSegmentReg) + .ChangeToRegister(X86::NoRegister, false); + + DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump();); + + Changed = true; + } + + return Changed; +} + +bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { + bool Changed = false; + + // Perform this optimization only if we care about code size. + if (!EnableX86LEAOpt || !MF.getFunction()->optForSize()) + return false; + + MRI = &MF.getRegInfo(); + TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); + TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo(); + + // Process all basic blocks. + for (auto &MBB : MF) { + SmallVector<MachineInstr *, 16> LEAs; + + // Find all LEA instructions in basic block. + findLEAs(MBB, LEAs); + + // If current basic block has no LEAs, move on to the next one. + if (LEAs.empty()) + continue; + + // Remove redundant address calculations. + Changed |= removeRedundantAddrCalc(LEAs); + } + + return Changed; +} diff --git a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp index 143e70b..0f425e2 100644 --- a/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/contrib/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -93,8 +93,7 @@ FunctionPass *llvm::createX86PadShortFunctions() { /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// NOOP instructions before early exits. 
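// (Function::optForSize() is true when the function carries either the
// optsize or the minsize attribute, so the single check below covers the same
// two cases as the old pair of hasFnAttribute() tests.)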
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { - if (MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize) || - MF.getFunction()->hasFnAttribute(Attribute::MinSize)) { + if (MF.getFunction()->optForSize()) { return false; } @@ -107,7 +106,7 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { // Search through basic blocks and mark the ones that have early returns ReturnBBs.clear(); VisitedBBs.clear(); - findReturns(MF.begin()); + findReturns(&MF.front()); bool MadeChange = false; diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp index d8495e5..5840443 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -27,7 +27,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" @@ -44,12 +43,6 @@ using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "X86GenRegisterInfo.inc" -cl::opt<bool> -ForceStackAlign("force-align-stack", - cl::desc("Force align the stack to the minimum alignment" - " needed for the function."), - cl::init(false), cl::Hidden); - static cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); @@ -174,21 +167,34 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, if (Subtarget.isTarget64BitLP64()) return &X86::GR64_NOSPRegClass; return &X86::GR32_NOSPRegClass; - case 2: // Available for tailcall (not callee-saved GPRs). - const Function *F = MF.getFunction(); - if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) - return &X86::GR64_TCW64RegClass; - else if (Is64Bit) - return &X86::GR64_TCRegClass; - - bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false); - if (hasHipeCC) - return &X86::GR32RegClass; - return &X86::GR32_TCRegClass; + case 2: // NOREX GPRs. + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOREXRegClass; + return &X86::GR32_NOREXRegClass; + case 3: // NOREX GPRs except the stack pointer (for encoding reasons). + if (Subtarget.isTarget64BitLP64()) + return &X86::GR64_NOREX_NOSPRegClass; + return &X86::GR32_NOREX_NOSPRegClass; + case 4: // Available for tailcall (not callee-saved GPRs). + return getGPRsForTailCall(MF); } } const TargetRegisterClass * +X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { + const Function *F = MF.getFunction(); + if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) + return &X86::GR64_TCW64RegClass; + else if (Is64Bit) + return &X86::GR64_TCRegClass; + + bool hasHipeCC = (F ? 
F->getCallingConv() == CallingConv::HiPE : false); + if (hasHipeCC) + return &X86::GR32RegClass; + return &X86::GR32_TCRegClass; +} + +const TargetRegisterClass * X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { if (RC == &X86::CCRRegClass) { if (Is64Bit) @@ -222,6 +228,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, const MCPhysReg * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>(); + bool HasSSE = Subtarget.hasSSE1(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); bool CallsEHReturn = MF->getMMI().callsEHReturn(); @@ -241,6 +248,10 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (HasAVX) return CSR_64_RT_AllRegs_AVX_SaveList; return CSR_64_RT_AllRegs_SaveList; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_SaveList; + break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_SaveList; @@ -254,6 +265,8 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_64_Intel_OCL_BI_SaveList; break; } + case CallingConv::HHVM: + return CSR_64_HHVM_SaveList; case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_SaveList; @@ -264,6 +277,18 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (CallsEHReturn) return CSR_64EHRet_SaveList; return CSR_64_SaveList; + case CallingConv::X86_INTR: + if (Is64Bit) { + if (HasAVX) + return CSR_64_AllRegs_AVX_SaveList; + else + return CSR_64_AllRegs_SaveList; + } else { + if (HasSSE) + return CSR_32_AllRegs_SSE_SaveList; + else + return CSR_32_AllRegs_SaveList; + } default: break; } @@ -284,6 +309,7 @@ const uint32_t * X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + bool HasSSE = Subtarget.hasSSE1(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); @@ -301,6 +327,10 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (HasAVX) return CSR_64_RT_AllRegs_AVX_RegMask; return CSR_64_RT_AllRegs_RegMask; + case CallingConv::CXX_FAST_TLS: + if (Is64Bit) + return CSR_64_TLS_Darwin_RegMask; + break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_RegMask; @@ -314,16 +344,30 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_64_Intel_OCL_BI_RegMask; break; } + case CallingConv::HHVM: + return CSR_64_HHVM_RegMask; case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_RegMask; break; - default: - break; case CallingConv::X86_64_Win64: return CSR_Win64_RegMask; case CallingConv::X86_64_SysV: return CSR_64_RegMask; + case CallingConv::X86_INTR: + if (Is64Bit) { + if (HasAVX) + return CSR_64_AllRegs_AVX_RegMask; + else + return CSR_64_AllRegs_RegMask; + } else { + if (HasSSE) + return CSR_32_AllRegs_SSE_RegMask; + else + return CSR_32_AllRegs_RegMask; + } + default: + break; } // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check @@ -341,6 +385,10 @@ X86RegisterInfo::getNoPreservedMask() const { return CSR_NoRegs_RegMask; } +const uint32_t *X86RegisterInfo::getDarwinTLSCallPreservedMask() const { + return CSR_64_TLS_Darwin_RegMask; +} + BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const X86FrameLowering *TFI = getFrameLowering(MF); @@ -371,8 
+419,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { "Stack realignment in presence of dynamic allocas is not supported with" "this calling convention."); - unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), MVT::i64, - false); + unsigned BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true); I.isValid(); ++I) Reserved.set(*I); @@ -439,6 +486,10 @@ void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { // Stack Frame Processing methods //===----------------------------------------------------------------------===// +static bool CantUseSP(const MachineFrameInfo *MFI) { + return MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment(); +} + bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -451,13 +502,11 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { // reference locals while also adjusting the stack pointer. When we can't // use both the SP and the FP, we need a separate base pointer register. bool CantUseFP = needsStackRealignment(MF); - bool CantUseSP = - MFI->hasVarSizedObjects() || MFI->hasOpaqueSPAdjustment(); - return CantUseFP && CantUseSP; + return CantUseFP && CantUseSP(MFI); } bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { - if (MF.getFunction()->hasFnAttribute("no-realign-stack")) + if (!TargetRegisterInfo::canRealignStack(MF)) return false; const MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -470,26 +519,11 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { // If a base pointer is necessary. Check that it isn't too late to reserve // it. - if (MFI->hasVarSizedObjects()) + if (CantUseSP(MFI)) return MRI->canReserveReg(BasePtr); return true; } -bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { - const MachineFrameInfo *MFI = MF.getFrameInfo(); - const X86FrameLowering *TFI = getFrameLowering(MF); - const Function *F = MF.getFunction(); - unsigned StackAlign = TFI->getStackAlignment(); - bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || - F->hasFnAttribute(Attribute::StackAlignment)); - - // If we've requested that we force align the stack do so now. - if (ForceStackAlign) - return canRealignStack(MF); - - return requiresRealignment && canRealignStack(MF); -} - bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const { // Since X86 defines assignCalleeSavedSpillSlots which always return true @@ -510,6 +544,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned Opc = MI.getOpcode(); bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm || Opc == X86::TCRETURNmi || Opc == X86::TCRETURNmi64; + if (hasBasePointer(MF)) BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister()); else if (needsStackRealignment(MF)) @@ -524,14 +559,11 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // offset is from the traditional base pointer location. On 64-bit, the // offset is from the SP at the end of the prologue, not the FP location. This // matches the behavior of llvm.frameaddress. 
+ unsigned IgnoredFrameReg; if (Opc == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); - bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); int Offset; - if (IsWinEH) - Offset = TFI->getFrameIndexOffsetFromSP(MF, FrameIndex); - else - Offset = TFI->getFrameIndexOffset(MF, FrameIndex); + Offset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); FI.ChangeToImmediate(Offset); return; } @@ -540,7 +572,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // register as source operand, semantic is the same and destination is // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) - BasePtr = getX86SubSuperRegister(BasePtr, MVT::i64, false); + BasePtr = getX86SubSuperRegister(BasePtr, 64); // This must be part of a four operand memory reference. Replace the // FrameIndex with base register with EBP. Add an offset to the offset. @@ -553,7 +585,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, const MachineFrameInfo *MFI = MF.getFrameInfo(); FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); } else - FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); + FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg); if (BasePtr == StackPtr) FIOffset += SPAdj; @@ -592,193 +624,11 @@ X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const { const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); unsigned FrameReg = getFrameRegister(MF); if (Subtarget.isTarget64BitILP32()) - FrameReg = getX86SubSuperRegister(FrameReg, MVT::i32, false); + FrameReg = getX86SubSuperRegister(FrameReg, 32); return FrameReg; } -namespace llvm { -unsigned getX86SubSuperRegisterOrZero(unsigned Reg, MVT::SimpleValueType VT, - bool High) { - switch (VT) { - default: return 0; - case MVT::i8: - if (High) { - switch (Reg) { - default: return getX86SubSuperRegister(Reg, MVT::i64); - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::SI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::DI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::BP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::SP; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::AH; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::DH; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::CH; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::BH; - } - } else { - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::AL; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::DL; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::CL; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::BL; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::SIL; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::DIL; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::BPL; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::SPL; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8B; - case X86::R9B: case 
X86::R9W: case X86::R9D: case X86::R9: - return X86::R9B; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10B; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11B; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12B; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13B; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14B; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15B; - } - } - case MVT::i16: - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::AX; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::DX; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::CX; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::BX; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::SI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::DI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::BP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::SP; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8W; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9W; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10W; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11W; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12W; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13W; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14W; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15W; - } - case MVT::i32: - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return X86::EAX; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::EDX; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::ECX; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::EBX; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::ESI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::EDI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::EBP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::ESP; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8D; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9D; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10D; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11D; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12D; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13D; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14D; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15D; - } - case MVT::i64: - switch (Reg) { - default: return 0; - case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: - return 
X86::RAX; - case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: - return X86::RDX; - case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: - return X86::RCX; - case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: - return X86::RBX; - case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: - return X86::RSI; - case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: - return X86::RDI; - case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: - return X86::RBP; - case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: - return X86::RSP; - case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: - return X86::R8; - case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: - return X86::R9; - case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: - return X86::R10; - case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: - return X86::R11; - case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: - return X86::R12; - case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: - return X86::R13; - case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: - return X86::R14; - case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: - return X86::R15; - } - } -} - -unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, - bool High) { - unsigned Res = getX86SubSuperRegisterOrZero(Reg, VT, High); - if (Res == 0) - llvm_unreachable("Unexpected register or VT"); - return Res; -} - -unsigned get512BitSuperRegister(unsigned Reg) { +unsigned llvm::get512BitSuperRegister(unsigned Reg) { if (Reg >= X86::XMM0 && Reg <= X86::XMM31) return X86::ZMM0 + (Reg - X86::XMM0); if (Reg >= X86::YMM0 && Reg <= X86::YMM31) @@ -787,5 +637,3 @@ unsigned get512BitSuperRegister(unsigned Reg) { return Reg; llvm_unreachable("Unexpected SIMD register"); } - -} diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h index 8de1d0b..f014c8f 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.h +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.h @@ -87,6 +87,11 @@ public: const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override; + /// getGPRsForTailCall - Returns a register class with registers that can be + /// used in forming tail calls. + const TargetRegisterClass * + getGPRsForTailCall(const MachineFunction &MF) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; @@ -96,7 +101,11 @@ public: getCalleeSavedRegs(const MachineFunction* MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; - const uint32_t *getNoPreservedMask() const; + const uint32_t *getNoPreservedMask() const override; + + // Calls involved in thread-local variable lookup save more registers than + // normal calls, so they need a different mask to represent this. 
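// On x86-64 Darwin this is CSR_64_TLS_Darwin_RegMask (see
// getDarwinTLSCallPreservedMask() in X86RegisterInfo.cpp above).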
+ const uint32_t *getDarwinTLSCallPreservedMask() const; /// getReservedRegs - Returns a bitset indexed by physical register number /// indicating if a register is a special register that has particular uses and @@ -108,9 +117,7 @@ public: bool hasBasePointer(const MachineFunction &MF) const; - bool canRealignStack(const MachineFunction &MF) const; - - bool needsStackRealignment(const MachineFunction &MF) const override; + bool canRealignStack(const MachineFunction &MF) const override; bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const override; @@ -128,16 +135,6 @@ public: unsigned getSlotSize() const { return SlotSize; } }; -/// Returns the sub or super register of a specific X86 register. -/// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX. -/// Aborts on error. -unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false); - -/// Returns the sub or super register of a specific X86 register. -/// Like getX86SubSuperRegister() but returns 0 on error. -unsigned getX86SubSuperRegisterOrZero(unsigned, MVT::SimpleValueType, - bool High = false); - //get512BitRegister - X86 utility - returns 512-bit super register unsigned get512BitSuperRegister(unsigned Reg); diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td index cdb151c..56f0d93 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.td @@ -225,15 +225,15 @@ let SubRegIndices = [sub_ymm] in { } } - // Mask Registers, used by AVX-512 instructions. - def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>; - def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>; - def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>; - def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>; - def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>; - def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>; - def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; - def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; +// Mask Registers, used by AVX-512 instructions. +def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>; +def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>; +def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>; +def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>; +def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>; +def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>; +def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>; +def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>; // Floating point stack registers. These don't map one-to-one to the FP // pseudo registers, but we still mark them as aliasing FP registers. That @@ -375,7 +375,7 @@ def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R11, RIP)>; def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, - R8, R9, R11)>; + R8, R9, R10, R11, RIP)>; // GR8_NOREX - GR8 registers which do not require a REX prefix. def GR8_NOREX : RegisterClass<"X86", [i8], 8, @@ -423,6 +423,8 @@ def FR32 : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 15)>; def FR64 : RegisterClass<"X86", [f64], 64, (add FR32)>; +def FR128 : RegisterClass<"X86", [i128, f128], 128, (add FR32)>; + // FIXME: This sets up the floating point register files as though they are f64 // values, though they really are f80 values. 
This will cause us to spill @@ -442,10 +444,11 @@ def RST : RegisterClass<"X86", [f80, f64, f32], 32, (sequence "ST%u", 0, 7)> { } // Generic vector registers: VR64 and VR128. +// Ensure that float types are declared first - only float is legal on SSE1. def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; -def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], +def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], 128, (add FR32)>; -def VR256 : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], +def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 15)>; // Status flags registers. @@ -459,8 +462,8 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> { } // AVX-512 vector/mask registers. -def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], 512, - (sequence "ZMM%u", 0, 31)>; +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], + 512, (sequence "ZMM%u", 0, 31)>; // Scalar AVX-512 floating point registers. def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; @@ -468,10 +471,10 @@ def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; // Extended VR128 and VR256 for AVX-512 instructions -def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], - 128, (add FR32X)>; -def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], - 256, (sequence "YMM%u", 0, 31)>; +def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64], + 128, (add FR32X)>; +def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], + 256, (sequence "YMM%u", 0, 31)>; // Mask registers def VK1 : RegisterClass<"X86", [i1], 8, (sequence "K%u", 0, 7)> {let Size = 8;} @@ -491,4 +494,4 @@ def VK32WM : RegisterClass<"X86", [v32i1], 32, (add VK16WM)> {let Size = 32;} def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;} // Bound registers -def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
\ No newline at end of file +def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>; diff --git a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index ce79fcf..b1a0161 100644 --- a/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -44,13 +44,10 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible( return false; } -SDValue -X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, - SDValue Chain, - SDValue Dst, SDValue Src, - SDValue Size, unsigned Align, - bool isVolatile, - MachinePointerInfo DstPtrInfo) const { +SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( + SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, + MachinePointerInfo DstPtrInfo) const { ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); const X86Subtarget &Subtarget = DAG.getMachineFunction().getSubtarget<X86Subtarget>(); @@ -74,10 +71,10 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, // Check to see if there is a specialized entry-point for memory zeroing. ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); - if (const char *bzeroEntry = V && + if (const char *bzeroEntry = V && V->isNullValue() ? Subtarget.getBZeroEntry() : nullptr) { - EVT IntPtr = - DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout()); Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -94,7 +91,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, 0) .setDiscardResult(); - std::pair<SDValue,SDValue> CallResult = DAG.getTargetLoweringInfo().LowerCallTo(CLI); + std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } @@ -144,8 +141,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, BytesLeft = SizeVal % UBytes; } - Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), - InFlag); + Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT), + InFlag); InFlag = Chain.getValue(1); } else { AVT = MVT::i8; @@ -172,9 +169,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, DAG.getConstant((AVT == MVT::i64) ? 7 : 3, dl, CVT)); - Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : - X86::ECX, - Left, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : X86::ECX, + Left, InFlag); InFlag = Chain.getValue(1); Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; @@ -249,17 +245,14 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( unsigned BytesLeft = SizeVal % UBytes; SDValue InFlag; - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : - X86::ECX, - Count, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, + Count, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : - X86::EDI, - Dst, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); InFlag = Chain.getValue(1); - Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? 
X86::RSI : - X86::ESI, - Src, InFlag); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI, + Src, InFlag); InFlag = Chain.getValue(1); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp index dff3624..8ef08c9 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.cpp +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.cpp @@ -44,9 +44,8 @@ X86EarlyIfConv("x86-early-ifcvt", cl::Hidden, cl::desc("Enable early if-conversion on X86")); -/// ClassifyBlockAddressReference - Classify a blockaddress reference for the -/// current subtarget according to how we should reference it in a non-pcrel -/// context. +/// Classify a blockaddress reference for the current subtarget according to how +/// we should reference it in a non-pcrel context. unsigned char X86Subtarget::ClassifyBlockAddressReference() const { if (isPICStyleGOT()) // 32-bit ELF targets. return X86II::MO_GOTOFF; @@ -58,9 +57,8 @@ unsigned char X86Subtarget::ClassifyBlockAddressReference() const { return X86II::MO_NO_FLAG; } -/// ClassifyGlobalReference - Classify a global variable reference for the -/// current subtarget according to how we should reference it in a non-pcrel -/// context. +/// Classify a global variable reference for the current subtarget according to +/// how we should reference it in a non-pcrel context. unsigned char X86Subtarget:: ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { // DLLImport only exists on windows, it is implemented as a load from a @@ -147,9 +145,9 @@ ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const { } -/// getBZeroEntry - This function returns the name of a function which has an -/// interface like the non-standard bzero function, if such a function exists on -/// the current subtarget and it is considered prefereable over memset with zero +/// This function returns the name of a function which has an interface like +/// the non-standard bzero function, if such a function exists on the +/// current subtarget and it is considered preferable over memset with zero /// passed as the second argument. Otherwise it returns null. const char *X86Subtarget::getBZeroEntry() const { // Darwin 10 has a __bzero entry point for this purpose. @@ -166,8 +164,7 @@ bool X86Subtarget::hasSinCos() const { is64Bit(); } -/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls -/// to immediate address. +/// Return true if the subtarget allows calls to immediate address. bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { // FIXME: I386 PE/COFF supports PC relative calls using IMAGE_REL_I386_REL32 // but WinCOFFObjectWriter::RecordRelocation cannot emit them. Once it does, @@ -192,9 +189,25 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { FullFS = "+64bit,+sse2"; } + // LAHF/SAHF are always supported in non-64-bit mode. + if (!In64BitMode) { + if (!FullFS.empty()) + FullFS = "+sahf," + FullFS; + else + FullFS = "+sahf"; + } + + // Parse features string and set the CPU. ParseSubtargetFeatures(CPUName, FullFS); + // All CPUs that implement SSE4.2 or SSE4A support unaligned accesses of + // 16-bytes and under that are reasonably fast. These features were + // introduced with Intel's Nehalem/Silvermont and AMD's Family10h + // micro-architectures respectively. 
+ if (hasSSE42() || hasSSE4A()) + IsUAMem16Slow = false; + InstrItins = getInstrItineraryForCPU(CPUName); // It's important to keep the MCSubtargetInfo feature bits in sync with @@ -224,13 +237,18 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { } void X86Subtarget::initializeEnvironment() { - X86SSELevel = NoMMXSSE; + X86SSELevel = NoSSE; X863DNowLevel = NoThreeDNow; HasCMov = false; HasX86_64 = false; HasPOPCNT = false; HasSSE4A = false; HasAES = false; + HasFXSR = false; + HasXSAVE = false; + HasXSAVEOPT = false; + HasXSAVEC = false; + HasXSAVES = false; HasPCLMUL = false; HasFMA = false; HasFMA4 = false; @@ -252,13 +270,15 @@ void X86Subtarget::initializeEnvironment() { HasBWI = false; HasVLX = false; HasADX = false; + HasPKU = false; HasSHA = false; HasPRFCHW = false; HasRDSEED = false; + HasLAHFSAHF = false; HasMPX = false; IsBTMemSlow = false; IsSHLDSlow = false; - IsUAMemFast = false; + IsUAMem16Slow = false; IsUAMem32Slow = false; HasSSEUnalignedMem = false; HasCmpxchg16b = false; diff --git a/contrib/llvm/lib/Target/X86/X86Subtarget.h b/contrib/llvm/lib/Target/X86/X86Subtarget.h index f026d42..13d1026 100644 --- a/contrib/llvm/lib/Target/X86/X86Subtarget.h +++ b/contrib/llvm/lib/Target/X86/X86Subtarget.h @@ -47,11 +47,11 @@ class X86Subtarget final : public X86GenSubtargetInfo { protected: enum X86SSEEnum { - NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F + NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F }; enum X863DNowEnum { - NoThreeDNow, ThreeDNow, ThreeDNowA + NoThreeDNow, MMX, ThreeDNow, ThreeDNowA }; enum X86ProcFamilyEnum { @@ -64,10 +64,10 @@ protected: /// Which PIC style to use PICStyles::Style PICStyle; - /// MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. + /// SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or none supported. X86SSEEnum X86SSELevel; - /// 3DNow, 3DNow Athlon, or none supported. + /// MMX, 3DNow, 3DNow Athlon, or none supported. X863DNowEnum X863DNowLevel; /// True if this processor has conditional move instructions @@ -86,6 +86,18 @@ protected: /// Target has AES instructions bool HasAES; + /// Target has FXSAVE/FXRESTOR instructions + bool HasFXSR; + + /// Target has XSAVE instructions + bool HasXSAVE; + /// Target has XSAVEOPT instructions + bool HasXSAVEOPT; + /// Target has XSAVEC instructions + bool HasXSAVEC; + /// Target has XSAVES instructions + bool HasXSAVES; + /// Target has carry-less multiplication bool HasPCLMUL; @@ -140,16 +152,19 @@ protected: /// Processor has RDSEED instructions. bool HasRDSEED; + /// Processor has LAHF/SAHF instructions. + bool HasLAHFSAHF; + /// True if BT (bit test) of memory instructions are slow. bool IsBTMemSlow; /// True if SHLD instructions are slow. bool IsSHLDSlow; - /// True if unaligned memory access is fast. - bool IsUAMemFast; + /// True if unaligned memory accesses of 16-bytes are slow. + bool IsUAMem16Slow; - /// True if unaligned 32-byte memory accesses are slow. + /// True if unaligned memory accesses of 32-bytes are slow. bool IsUAMem32Slow; /// True if SSE operations can have unaligned memory operands. 
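The SAHF change in the hunk above amounts to force-enabling the feature outside 64-bit mode by prepending "+sahf" to whatever feature string the user supplied, before ParseSubtargetFeatures runs. A minimal stand-alone sketch of that composition, assuming only the C++ standard library (composeFeatures is a hypothetical helper, not an LLVM API):

#include <string>

// Mirrors the "+sahf" handling: LAHF/SAHF are always available outside
// 64-bit mode, so the feature is prepended to the user-supplied string.
static std::string composeFeatures(bool In64BitMode, std::string FS) {
  if (!In64BitMode)
    FS = FS.empty() ? std::string("+sahf") : "+sahf," + FS;
  return FS;
}

// e.g. composeFeatures(false, "+avx2") yields "+sahf,+avx2",
//      composeFeatures(false, "")      yields "+sahf".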
@@ -208,6 +223,9 @@ protected: /// Processor has AVX-512 Vector Length eXtenstions bool HasVLX; + /// Processor has PKU extenstions + bool HasPKU; + /// Processot supports MPX - Memory Protection Extensions bool HasMPX; @@ -319,7 +337,6 @@ public: void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } bool hasCMov() const { return HasCMov; } - bool hasMMX() const { return X86SSELevel >= MMX; } bool hasSSE1() const { return X86SSELevel >= SSE1; } bool hasSSE2() const { return X86SSELevel >= SSE2; } bool hasSSE3() const { return X86SSELevel >= SSE3; } @@ -332,14 +349,22 @@ public: bool hasFp256() const { return hasAVX(); } bool hasInt256() const { return hasAVX2(); } bool hasSSE4A() const { return HasSSE4A; } + bool hasMMX() const { return X863DNowLevel >= MMX; } bool has3DNow() const { return X863DNowLevel >= ThreeDNow; } bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } bool hasPOPCNT() const { return HasPOPCNT; } bool hasAES() const { return HasAES; } + bool hasFXSR() const { return HasFXSR; } + bool hasXSAVE() const { return HasXSAVE; } + bool hasXSAVEOPT() const { return HasXSAVEOPT; } + bool hasXSAVEC() const { return HasXSAVEC; } + bool hasXSAVES() const { return HasXSAVES; } bool hasPCLMUL() const { return HasPCLMUL; } - bool hasFMA() const { return HasFMA; } - // FIXME: Favor FMA when both are enabled. Is this the right thing to do? - bool hasFMA4() const { return HasFMA4 && !HasFMA; } + // Prefer FMA4 to FMA - its better for commutation/memory folding and + // has equal or better performance on all supported targets. + bool hasFMA() const { return HasFMA && !HasFMA4; } + bool hasFMA4() const { return HasFMA4; } + bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); } bool hasXOP() const { return HasXOP; } bool hasTBM() const { return HasTBM; } bool hasMOVBE() const { return HasMOVBE; } @@ -355,9 +380,10 @@ public: bool hasSHA() const { return HasSHA; } bool hasPRFCHW() const { return HasPRFCHW; } bool hasRDSEED() const { return HasRDSEED; } + bool hasLAHFSAHF() const { return HasLAHFSAHF; } bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } - bool isUnalignedMemAccessFast() const { return IsUAMemFast; } + bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } @@ -375,6 +401,7 @@ public: bool hasDQI() const { return HasDQI; } bool hasBWI() const { return HasBWI; } bool hasVLX() const { return HasVLX; } + bool hasPKU() const { return HasPKU; } bool hasMPX() const { return HasMPX; } bool isAtom() const { return X86ProcFamily == IntelAtom; } @@ -394,9 +421,11 @@ public: bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + bool isTargetAndroid() const { return TargetTriple.isAndroid(); } bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); } bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); } bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); } + bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); } bool isTargetWindowsMSVC() const { return TargetTriple.isWindowsMSVCEnvironment(); @@ -406,6 +435,10 @@ public: return TargetTriple.isKnownWindowsMSVCEnvironment(); } + bool isTargetWindowsCoreCLR() const { + return TargetTriple.isWindowsCoreCLREnvironment(); + } + bool isTargetWindowsCygwin() const { 
return TargetTriple.isWindowsCygwinEnvironment(); } diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp index fb9cb4b..0e7e4c0 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -28,10 +28,17 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner", cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); +namespace llvm { +void initializeWinEHStatePassPass(PassRegistry &); +} + extern "C" void LLVMInitializeX86Target() { // Register the target. RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target); RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target); + + PassRegistry &PR = *PassRegistry::getPassRegistry(); + initializeWinEHStatePassPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -45,7 +52,7 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { return make_unique<X86LinuxNaClTargetObjectFile>(); if (TT.isOSBinFormatELF()) return make_unique<X86ELFTargetObjectFile>(); - if (TT.isKnownWindowsMSVCEnvironment()) + if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment()) return make_unique<X86WindowsTargetObjectFile>(); if (TT.isOSBinFormatCOFF()) return make_unique<TargetLoweringObjectFileCOFF>(); @@ -175,8 +182,9 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, //===----------------------------------------------------------------------===// TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis( - [this](Function &F) { return TargetTransformInfo(X86TTIImpl(this, F)); }); + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(X86TTIImpl(this, F)); + }); } @@ -246,6 +254,9 @@ bool X86PassConfig::addPreISel() { } void X86PassConfig::addPreRegAlloc() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createX86OptimizeLEAs()); + addPass(createX86CallFrameOptimization()); } diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp index 6f900ea..782768d 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCValue.h" +#include "llvm/Support/COFF.h" #include "llvm/Support/Dwarf.h" #include "llvm/Target/TargetLowering.h" @@ -152,9 +153,8 @@ static std::string scalarConstantToHexString(const Constant *C) { } } -MCSection * -X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *X86WindowsTargetObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { if (Kind.isMergeableConst() && C) { const unsigned Characteristics = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ | @@ -171,5 +171,5 @@ X86WindowsTargetObjectFile::getSectionForConstant(SectionKind Kind, COFF::IMAGE_COMDAT_SELECT_ANY); } - return TargetLoweringObjectFile::getSectionForConstant(Kind, C); + return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C); } diff --git a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h index 66366b2..6b2448c 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h +++ b/contrib/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -58,7 +58,7 @@ namespace llvm { /// \brief Given a 
mergeable constant with the specified size and relocation /// information, return a section that it should be placed in. - MCSection *getSectionForConstant(SectionKind Kind, + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, const Constant *C) const override; }; diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 7df7260..2e7bbb2 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Target/CostTable.h" #include "llvm/Target/TargetLowering.h" + using namespace llvm; #define DEBUG_TYPE "x86tti" @@ -62,8 +63,8 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) { if (ST->is64Bit()) return 64; - return 32; + return 32; } unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { @@ -84,12 +85,12 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { return 2; } -unsigned X86TTIImpl::getArithmeticInstrCost( +int X86TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); @@ -101,10 +102,9 @@ unsigned X86TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence SRA + SRL + ADD + SRA. // The OperandValue properties many not be same as that of previous // operation;conservatively assume OP_None. - unsigned Cost = - 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + int Cost = 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, + Op2Info, TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -115,8 +115,7 @@ unsigned X86TTIImpl::getArithmeticInstrCost( return Cost; } - static const CostTblEntry<MVT::SimpleValueType> - AVX2UniformConstCostTable[] = { + static const CostTblEntry AVX2UniformConstCostTable[] = { { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. 
{ ISD::SDIV, MVT::v16i16, 6 }, // vpmulhw sequence @@ -127,12 +126,12 @@ unsigned X86TTIImpl::getArithmeticInstrCost( if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && ST->hasAVX2()) { - int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * AVX2UniformConstCostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(AVX2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; } - static const CostTblEntry<MVT::SimpleValueType> AVX512CostTable[] = { + static const CostTblEntry AVX512CostTable[] = { { ISD::SHL, MVT::v16i32, 1 }, { ISD::SRL, MVT::v16i32, 1 }, { ISD::SRA, MVT::v16i32, 1 }, @@ -141,7 +140,12 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v8i64, 1 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = { + if (ST->hasAVX512()) { + if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX2CostTable[] = { // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to // customize them to detect the cases where shift amount is a scalar one. { ISD::SHL, MVT::v4i32, 1 }, @@ -154,7 +158,57 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRL, MVT::v2i64, 1 }, { ISD::SHL, MVT::v4i64, 1 }, { ISD::SRL, MVT::v4i64, 1 }, + }; + + // Look for AVX2 lowering tricks. + if (ST->hasAVX2()) { + if (ISD == ISD::SHL && LT.second == MVT::v16i16 && + (Op2Info == TargetTransformInfo::OK_UniformConstantValue || + Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) + // On AVX2, a packed v16i16 shift left by a constant build_vector + // is lowered into a vector multiply (vpmullw). + return LT.first; + + if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry XOPCostTable[] = { + // 128bit shifts take 1cy, but right shifts require negation beforehand. + { ISD::SHL, MVT::v16i8, 1 }, + { ISD::SRL, MVT::v16i8, 2 }, + { ISD::SRA, MVT::v16i8, 2 }, + { ISD::SHL, MVT::v8i16, 1 }, + { ISD::SRL, MVT::v8i16, 2 }, + { ISD::SRA, MVT::v8i16, 2 }, + { ISD::SHL, MVT::v4i32, 1 }, + { ISD::SRL, MVT::v4i32, 2 }, + { ISD::SRA, MVT::v4i32, 2 }, + { ISD::SHL, MVT::v2i64, 1 }, + { ISD::SRL, MVT::v2i64, 2 }, + { ISD::SRA, MVT::v2i64, 2 }, + // 256bit shifts require splitting if AVX2 didn't catch them above. + { ISD::SHL, MVT::v32i8, 2 }, + { ISD::SRL, MVT::v32i8, 4 }, + { ISD::SRA, MVT::v32i8, 4 }, + { ISD::SHL, MVT::v16i16, 2 }, + { ISD::SRL, MVT::v16i16, 4 }, + { ISD::SRA, MVT::v16i16, 4 }, + { ISD::SHL, MVT::v8i32, 2 }, + { ISD::SRL, MVT::v8i32, 4 }, + { ISD::SRA, MVT::v8i32, 4 }, + { ISD::SHL, MVT::v4i64, 2 }, + { ISD::SRL, MVT::v4i64, 4 }, + { ISD::SRA, MVT::v4i64, 4 }, + }; + // Look for XOP lowering tricks. + if (ST->hasXOP()) { + if (const auto *Entry = CostTableLookup(XOPCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX2CustomCostTable[] = { { ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence. { ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence. @@ -163,7 +217,8 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence. { ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence. - { ISD::SRA, MVT::v4i64, 4*10 }, // Scalarized. + { ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence. // Vectorizing division is a bad idea. 
See the SSE2 table for more comments. { ISD::SDIV, MVT::v32i8, 32*20 }, @@ -176,44 +231,44 @@ unsigned X86TTIImpl::getArithmeticInstrCost( { ISD::UDIV, MVT::v4i64, 4*20 }, }; - if (ST->hasAVX512()) { - int Idx = CostTableLookup(AVX512CostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * AVX512CostTable[Idx].Cost; - } - // Look for AVX2 lowering tricks. + // Look for AVX2 lowering tricks for custom cases. if (ST->hasAVX2()) { - if (ISD == ISD::SHL && LT.second == MVT::v16i16 && - (Op2Info == TargetTransformInfo::OK_UniformConstantValue || - Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)) - // On AVX2, a packed v16i16 shift left by a constant build_vector - // is lowered into a vector multiply (vpmullw). - return LT.first; - - int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * AVX2CostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(AVX2CustomCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; } - static const CostTblEntry<MVT::SimpleValueType> + static const CostTblEntry SSE2UniformConstCostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. // Constant splats are cheaper for the following instructions. { ISD::SHL, MVT::v16i8, 1 }, // psllw. + { ISD::SHL, MVT::v32i8, 2 }, // psllw. { ISD::SHL, MVT::v8i16, 1 }, // psllw. + { ISD::SHL, MVT::v16i16, 2 }, // psllw. { ISD::SHL, MVT::v4i32, 1 }, // pslld + { ISD::SHL, MVT::v8i32, 2 }, // pslld { ISD::SHL, MVT::v2i64, 1 }, // psllq. + { ISD::SHL, MVT::v4i64, 2 }, // psllq. { ISD::SRL, MVT::v16i8, 1 }, // psrlw. + { ISD::SRL, MVT::v32i8, 2 }, // psrlw. { ISD::SRL, MVT::v8i16, 1 }, // psrlw. + { ISD::SRL, MVT::v16i16, 2 }, // psrlw. { ISD::SRL, MVT::v4i32, 1 }, // psrld. + { ISD::SRL, MVT::v8i32, 2 }, // psrld. { ISD::SRL, MVT::v2i64, 1 }, // psrlq. + { ISD::SRL, MVT::v4i64, 2 }, // psrlq. { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + { ISD::SRA, MVT::v32i8, 8 }, // psrlw, pand, pxor, psubb. { ISD::SRA, MVT::v8i16, 1 }, // psraw. + { ISD::SRA, MVT::v16i16, 2 }, // psraw. { ISD::SRA, MVT::v4i32, 1 }, // psrad. + { ISD::SRA, MVT::v8i32, 2 }, // psrad. { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. + { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence @@ -227,27 +282,34 @@ unsigned X86TTIImpl::getArithmeticInstrCost( if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) return LT.first * 15; - int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * SSE2UniformConstCostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; } if (ISD == ISD::SHL && Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) { - EVT VT = LT.second; + MVT VT = LT.second; + // Vector shift left by non uniform constant can be lowered + // into vector multiply (pmullw/pmulld). if ((VT == MVT::v8i16 && ST->hasSSE2()) || (VT == MVT::v4i32 && ST->hasSSE41())) - // Vector shift left by non uniform constant can be lowered - // into vector multiply (pmullw/pmulld). return LT.first; + + // v16i16 and v8i32 shifts by non-uniform constants are lowered into a + // sequence of extract + two vector multiply + insert. 
+ if ((VT == MVT::v8i32 || VT == MVT::v16i16) && + (ST->hasAVX() && !ST->hasAVX2())) + ISD = ISD::MUL; + + // A vector shift left by non uniform constant is converted + // into a vector multiply; the new multiply is eventually + // lowered into a sequence of shuffles and 2 x pmuludq. if (VT == MVT::v4i32 && ST->hasSSE2()) - // A vector shift left by non uniform constant is converted - // into a vector multiply; the new multiply is eventually - // lowered into a sequence of shuffles and 2 x pmuludq. ISD = ISD::MUL; } - static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = { + static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. // For some cases, where the shift amount is a scalar we would be able @@ -257,20 +319,31 @@ unsigned X86TTIImpl::getArithmeticInstrCost( // used for vectorization and we don't want to make vectorized code worse // than scalar code. { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SHL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SHL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. - { ISD::SHL, MVT::v2i64, 2*10 }, // Scalarized. - { ISD::SHL, MVT::v4i64, 4*10 }, // Scalarized. + { ISD::SHL, MVT::v8i32, 2*2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SRL, MVT::v32i8, 2*26 }, // cmpgtb sequence. { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRL, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRL, MVT::v2i64, 2*10 }, // Scalarized. + { ISD::SRL, MVT::v8i32, 2*16 }, // Shift each lane + blend. + { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. + { ISD::SRA, MVT::v32i8, 2*54 }, // unpacked cmpgtb sequence. { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRA, MVT::v16i16, 2*32 }, // cmpgtb sequence. { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRA, MVT::v2i64, 2*10 }, // Scalarized. + { ISD::SRA, MVT::v8i32, 2*16 }, // Shift each lane + blend. + { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. // It is not a good idea to vectorize division. We have to scalarize it and // in the process we will often end up having to spilling regular @@ -289,12 +362,11 @@ unsigned X86TTIImpl::getArithmeticInstrCost( }; if (ST->hasSSE2()) { - int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second); - if (Idx != -1) - return LT.first * SSE2CostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) + return LT.first * Entry->Cost; } - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = { + static const CostTblEntry AVX1CostTable[] = { // We don't have to scalarize unsupported ops. We can issue two half-sized // operations and we only need to extract the upper YMM half. // Two ops + 1 extract + 1 insert = 4. @@ -314,29 +386,21 @@ unsigned X86TTIImpl::getArithmeticInstrCost( // Look for AVX1 lowering tricks. 
if (ST->hasAVX() && !ST->hasAVX2()) { - EVT VT = LT.second; + MVT VT = LT.second; - // v16i16 and v8i32 shifts by non-uniform constants are lowered into a - // sequence of extract + two vector multiply + insert. - if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) && - Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) - ISD = ISD::MUL; - - int Idx = CostTableLookup(AVX1CostTable, ISD, VT); - if (Idx != -1) - return LT.first * AVX1CostTable[Idx].Cost; + if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, VT)) + return LT.first * Entry->Cost; } // Custom lowering of vectors. - static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = { + static const CostTblEntry CustomLowered[] = { // A v2i64/v4i64 and multiply is custom lowered as a series of long // multiplies(3), shifts(4) and adds(2). { ISD::MUL, MVT::v2i64, 9 }, { ISD::MUL, MVT::v4i64, 9 }, }; - int Idx = CostTableLookup(CustomLowered, ISD, LT.second); - if (Idx != -1) - return LT.first * CustomLowered[Idx].Cost; + if (const auto *Entry = CostTableLookup(CustomLowered, ISD, LT.second)) + return LT.first * Entry->Cost; // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle, // 2x pmuludq, 2x shuffle. @@ -348,15 +412,15 @@ unsigned X86TTIImpl::getArithmeticInstrCost( return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info); } -unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp) { +int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { // We only estimate the cost of reverse and alternate shuffles. if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); if (Kind == TTI::SK_Reverse) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); - unsigned Cost = 1; + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + int Cost = 1; if (LT.second.getSizeInBits() > 128) Cost = 3; // Extract + insert + copy. @@ -367,14 +431,14 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, if (Kind == TTI::SK_Alternate) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp); // The backend knows how to generate a single VEX.256 version of // instruction VPBLENDW if the target supports AVX2. if (ST->hasAVX2() && LT.second == MVT::v16i16) return LT.first; - static const CostTblEntry<MVT::SimpleValueType> AVXAltShuffleTbl[] = { + static const CostTblEntry AVXAltShuffleTbl[] = { {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vblendpd {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vblendpd @@ -390,13 +454,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v32i8, 9} }; - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVXAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * AVXAltShuffleTbl[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVXAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry<MVT::SimpleValueType> SSE41AltShuffleTbl[] = { + static const CostTblEntry SSE41AltShuffleTbl[] = { // These are lowered into movsd. 
{ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, @@ -414,13 +477,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} }; - if (ST->hasSSE41()) { - int Idx = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * SSE41AltShuffleTbl[Idx].Cost; - } + if (ST->hasSSE41()) + if (const auto *Entry = CostTableLookup(SSE41AltShuffleTbl, ISD::VECTOR_SHUFFLE, + LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry<MVT::SimpleValueType> SSSE3AltShuffleTbl[] = { + static const CostTblEntry SSSE3AltShuffleTbl[] = { {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd @@ -433,13 +495,12 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // pshufb + pshufb + or }; - if (ST->hasSSSE3()) { - int Idx = CostTableLookup(SSSE3AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * SSSE3AltShuffleTbl[Idx].Cost; - } + if (ST->hasSSSE3()) + if (const auto *Entry = CostTableLookup(SSSE3AltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; - static const CostTblEntry<MVT::SimpleValueType> SSEAltShuffleTbl[] = { + static const CostTblEntry SSEAltShuffleTbl[] = { {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // movsd {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // movsd @@ -454,65 +515,47 @@ unsigned X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, }; // Fall-back (SSE3 and SSE2). - int Idx = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second); - if (Idx != -1) - return LT.first * SSEAltShuffleTbl[Idx].Cost; + if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return LT.first * Entry->Cost; return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); } -unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { +int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); - std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); - - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - SSE2ConvTbl[] = { - // These are somewhat magic numbers justified by looking at the output of - // Intel's IACA, running some kernels and making sure when we take - // legalization into account the throughput will be overestimated. - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, - { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, - { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, - // There are faster sequences for float conversions. 
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, - { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, - { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + // FIXME: Need a better design of the cost table to handle non-simple types of + // potential massive combinations (elem_num x src_type x dst_type). + + static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = { + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 }, + + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 }, + { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 }, }; - if (ST->hasSSE2() && !ST->hasAVX()) { - int Idx = - ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second); - if (Idx != -1) - return LTSrc.first * SSE2ConvTbl[Idx].Cost; - } - - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - AVX512ConversionTbl[] = { + static const TypeConversionCostTblEntry AVX512FConversionTbl[] = { { ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 }, { ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 }, { ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 }, - { ISD::FP_ROUND, MVT::v16f32, MVT::v8f64, 3 }, { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 }, { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 }, { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 }, { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, - { ISD::TRUNCATE, MVT::v16i32, MVT::v8i64, 4 }, // v16i1 -> v16i32 - load + broadcast { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, @@ -522,33 +565,49 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 }, - { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, - { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i32, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, { ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, - }; - if (ST->hasAVX512()) { - int Idx = ConvertCostTableLookup(AVX512ConversionTbl, ISD, LTDest.second, - LTSrc.second); - if (Idx != -1) - return AVX512ConversionTbl[Idx].Cost; - } - EVT SrcTy = TLI->getValueType(DL, Src); - EVT DstTy = TLI->getValueType(DL, Dst); - - // The function getSimpleVT only handles simple value types. 
- if (!SrcTy.isSimple() || !DstTy.isSimple()) - return BaseT::getCastInstrCost(Opcode, Dst, Src); + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 }, + { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 }, + { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 }, + { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 12 }, + { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 26 }, + + { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, + { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, + { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, + }; - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - AVX2ConversionTbl[] = { + static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 }, @@ -579,8 +638,7 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 }, }; - static const TypeConversionCostTblEntry<MVT::SimpleValueType> - AVXConversionTbl[] = { + static const TypeConversionCostTblEntry AVXConversionTbl[] = { { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 }, @@ -650,34 +708,158 @@ unsigned X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4*4 }, }; + static const TypeConversionCostTblEntry SSE41ConversionTbl[] = { + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 2 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 30 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 }, + { 
ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, + }; + + static const TypeConversionCostTblEntry SSE2ConversionTbl[] = { + // These are somewhat magic numbers justified by looking at the output of + // Intel's IACA, running some kernels and making sure when we take + // legalization into account the throughput will be overestimated. + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 2*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 8*10 }, + { ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 16*10 }, + // There are faster sequences for float conversions. + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 8 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 15 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 8 }, + + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 6 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 8 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 2 }, + { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 9 }, + { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 12 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 6 }, + { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 }, + { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 3 }, + { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 }, + { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 }, + { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 2 }, + { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 }, + + { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 10 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 }, + { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 }, + { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, + { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, + { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 }, + }; + + std::pair<int, MVT> LTSrc = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LTDest = TLI->getTypeLegalizationCost(DL, Dst); + + if (ST->hasSSE2() && !ST->hasAVX()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + LTDest.second, LTSrc.second)) + return LTSrc.first * Entry->Cost; + } + + EVT SrcTy = TLI->getValueType(DL, Src); + EVT DstTy = TLI->getValueType(DL, Dst); + + // The function getSimpleVT only handles simple value types. 
+ if (!SrcTy.isSimple() || !DstTy.isSimple()) + return BaseT::getCastInstrCost(Opcode, Dst, Src); + + if (ST->hasDQI()) + if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + + if (ST->hasAVX512()) + if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + if (ST->hasAVX2()) { - int Idx = ConvertCostTableLookup(AVX2ConversionTbl, ISD, - DstTy.getSimpleVT(), SrcTy.getSimpleVT()); - if (Idx != -1) - return AVX2ConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } if (ST->hasAVX()) { - int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(), - SrcTy.getSimpleVT()); - if (Idx != -1) - return AVXConversionTbl[Idx].Cost; + if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE41()) { + if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; + } + + if (ST->hasSSE2()) { + if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, + DstTy.getSimpleVT(), + SrcTy.getSimpleVT())) + return Entry->Cost; } return BaseT::getCastInstrCost(Opcode, Dst, Src); } -unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, - Type *CondTy) { +int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); - static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = { + static const CostTblEntry SSE42CostTbl[] = { { ISD::SETCC, MVT::v2f64, 1 }, { ISD::SETCC, MVT::v4f32, 1 }, { ISD::SETCC, MVT::v2i64, 1 }, @@ -686,7 +868,7 @@ unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v16i8, 1 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = { + static const CostTblEntry AVX1CostTbl[] = { { ISD::SETCC, MVT::v4f64, 1 }, { ISD::SETCC, MVT::v8f32, 1 }, // AVX1 does not support 8-wide integer compare. 
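The conversion-cost changes above replace index-based table lookups with calls that return a pointer to the matching entry, tried from the most specific feature tier (AVX-512DQ) down to SSE2 before falling back to the base implementation. A minimal sketch of that first-match lookup pattern, assuming a hypothetical Entry struct in place of the real CostTblEntry and plain unsigned keys in place of the ISD/MVT enums:

#include <vector>

// Hypothetical stand-in for CostTblEntry: opcode, value type, cost.
struct Entry { unsigned ISD; unsigned Ty; int Cost; };

// First-match lookup: returns a pointer to the entry, or nullptr so the
// caller can fall through to the next, less specific cost table.
static const Entry *lookupCost(const std::vector<Entry> &Tbl, unsigned ISD,
                               unsigned Ty) {
  for (const Entry &E : Tbl)
    if (E.ISD == ISD && E.Ty == Ty)
      return &E;
  return nullptr;
}

// Usage mirrors the cascade above: try the DQI table, then AVX-512F, AVX2,
// AVX, SSE4.1, SSE2, and only then defer to the generic cost model.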
@@ -696,54 +878,45 @@ unsigned X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, { ISD::SETCC, MVT::v32i8, 4 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = { + static const CostTblEntry AVX2CostTbl[] = { { ISD::SETCC, MVT::v4i64, 1 }, { ISD::SETCC, MVT::v8i32, 1 }, { ISD::SETCC, MVT::v16i16, 1 }, { ISD::SETCC, MVT::v32i8, 1 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX512CostTbl[] = { + static const CostTblEntry AVX512CostTbl[] = { { ISD::SETCC, MVT::v8i64, 1 }, { ISD::SETCC, MVT::v16i32, 1 }, { ISD::SETCC, MVT::v8f64, 1 }, { ISD::SETCC, MVT::v16f32, 1 }, }; - if (ST->hasAVX512()) { - int Idx = CostTableLookup(AVX512CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * AVX512CostTbl[Idx].Cost; - } + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasAVX2()) { - int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * AVX2CostTbl[Idx].Cost; - } + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * AVX1CostTbl[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE42()) { - int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy); - if (Idx != -1) - return LT.first * SSE42CostTbl[Idx].Cost; - } + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) + return LT.first * Entry->Cost; return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy); } -unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, - unsigned Index) { +int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); if (Index != -1U) { // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val); // This type is legalized to a scalar type. if (!LT.second.isVector()) @@ -761,10 +934,9 @@ unsigned X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, return BaseT::getVectorInstrCost(Opcode, Val, Index); } -unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, - bool Extract) { +int X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { assert (Ty->isVectorTy() && "Can only scalarize vectors"); - unsigned Cost = 0; + int Cost = 0; for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { if (Insert) @@ -776,9 +948,8 @@ unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, bool Insert, return Cost; } -unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, - unsigned Alignment, - unsigned AddressSpace) { +int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { // Handle non-power-of-two vectors such as <3 x float> if (VectorType *VTy = dyn_cast<VectorType>(Src)) { unsigned NumElem = VTy->getVectorNumElements(); @@ -796,22 +967,21 @@ unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // Assume that all other non-power-of-two numbers are scalarized. 
if (!isPowerOf2_32(NumElem)) { - unsigned Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), - Alignment, AddressSpace); - unsigned SplitCost = getScalarizationOverhead(Src, - Opcode == Instruction::Load, - Opcode==Instruction::Store); + int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, + AddressSpace); + int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load, + Opcode == Instruction::Store); return NumElem * Cost + SplitCost; } } // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src); assert((Opcode == Instruction::Load || Opcode == Instruction::Store) && "Invalid Opcode"); // Each load/store unit costs 1. - unsigned Cost = LT.first * 1; + int Cost = LT.first * 1; // On Sandybridge 256bit load/stores are double pumped // (but not on Haswell). @@ -821,9 +991,9 @@ unsigned X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, return Cost; } -unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, - unsigned Alignment, - unsigned AddressSpace) { +int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, + unsigned Alignment, + unsigned AddressSpace) { VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy); if (!SrcVTy) // To calculate scalar take the regular cost, without mask @@ -832,34 +1002,33 @@ unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, unsigned NumElem = SrcVTy->getVectorNumElements(); VectorType *MaskTy = VectorType::get(Type::getInt8Ty(getGlobalContext()), NumElem); - if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy, 1)) || - (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy, 1)) || + if ((Opcode == Instruction::Load && !isLegalMaskedLoad(SrcVTy)) || + (Opcode == Instruction::Store && !isLegalMaskedStore(SrcVTy)) || !isPowerOf2_32(NumElem)) { // Scalarization - unsigned MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); - unsigned ScalarCompareCost = - getCmpSelInstrCost(Instruction::ICmp, - Type::getInt8Ty(getGlobalContext()), NULL); - unsigned BranchCost = getCFInstrCost(Instruction::Br); - unsigned MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); - - unsigned ValueSplitCost = - getScalarizationOverhead(SrcVTy, Opcode == Instruction::Load, - Opcode == Instruction::Store); - unsigned MemopCost = + int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true); + int ScalarCompareCost = getCmpSelInstrCost( + Instruction::ICmp, Type::getInt8Ty(getGlobalContext()), nullptr); + int BranchCost = getCFInstrCost(Instruction::Br); + int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost); + + int ValueSplitCost = getScalarizationOverhead( + SrcVTy, Opcode == Instruction::Load, Opcode == Instruction::Store); + int MemopCost = NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(), Alignment, AddressSpace); return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost; } // Legalize the type. - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); - unsigned Cost = 0; - if (LT.second != TLI->getValueType(DL, SrcVTy).getSimpleVT() && + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, SrcVTy); + auto VT = TLI->getValueType(DL, SrcVTy); + int Cost = 0; + if (VT.isSimple() && LT.second != VT.getSimpleVT() && LT.second.getVectorNumElements() == NumElem) // Promotion requires expand/truncate for data and a shuffle for mask. 
- Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, 0) + - getShuffleCost(TTI::SK_Alternate, MaskTy, 0, 0); + Cost += getShuffleCost(TTI::SK_Alternate, SrcVTy, 0, nullptr) + + getShuffleCost(TTI::SK_Alternate, MaskTy, 0, nullptr); else if (LT.second.getVectorNumElements() > NumElem) { VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(), @@ -874,7 +1043,7 @@ unsigned X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, return Cost+LT.first; } -unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { +int X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { // Address computations in vectorized code with non-consecutive addresses will // likely result in more instructions compared to scalar code where the // computation can more often be merged into the index mode. The resulting @@ -887,10 +1056,10 @@ unsigned X86TTIImpl::getAddressComputationCost(Type *Ty, bool IsComplex) { return BaseT::getAddressComputationCost(Ty, IsComplex); } -unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, - bool IsPairwise) { +int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, + bool IsPairwise) { - std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; @@ -900,7 +1069,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. - static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = { + static const CostTblEntry SSE42CostTblPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". @@ -908,7 +1077,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v8i16, 5 }, }; - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = { + static const CostTblEntry AVX1CostTblPairWise[] = { { ISD::FADD, MVT::v4f32, 4 }, { ISD::FADD, MVT::v4f64, 5 }, { ISD::FADD, MVT::v8f32, 7 }, @@ -919,7 +1088,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v8i32, 5 }, }; - static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = { + static const CostTblEntry SSE42CostTblNoPairWise[] = { { ISD::FADD, MVT::v2f64, 2 }, { ISD::FADD, MVT::v4f32, 4 }, { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". @@ -927,7 +1096,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3". 
}; - static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = { + static const CostTblEntry AVX1CostTblNoPairWise[] = { { ISD::FADD, MVT::v4f32, 3 }, { ISD::FADD, MVT::v4f64, 3 }, { ISD::FADD, MVT::v8f32, 4 }, @@ -939,29 +1108,21 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, }; if (IsPairwise) { - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * AVX1CostTblPairWise[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE42()) { - int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * SSE42CostTblPairWise[Idx].Cost; - } + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; } else { - if (ST->hasAVX()) { - int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * AVX1CostTblNoPairWise[Idx].Cost; - } + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; - if (ST->hasSSE42()) { - int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy); - if (Idx != -1) - return LT.first * SSE42CostTblNoPairWise[Idx].Cost; - } + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; } return BaseT::getReductionCost(Opcode, ValTy, IsPairwise); @@ -970,7 +1131,7 @@ unsigned X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, /// \brief Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. -unsigned X86TTIImpl::getIntImmCost(int64_t Val) { +int X86TTIImpl::getIntImmCost(int64_t Val) { if (Val == 0) return TTI::TCC_Free; @@ -980,7 +1141,7 @@ unsigned X86TTIImpl::getIntImmCost(int64_t Val) { return 2 * TTI::TCC_Basic; } -unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { +int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -1004,18 +1165,18 @@ unsigned X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { // Split the constant into 64-bit chunks and calculate the cost for each // chunk. - unsigned Cost = 0; + int Cost = 0; for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64); int64_t Val = Tmp.getSExtValue(); Cost += getIntImmCost(Val); } // We need at least one instruction to materialze the constant. - return std::max(1U, Cost); + return std::max(1, Cost); } -unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, - const APInt &Imm, Type *Ty) { +int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, + Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -1038,6 +1199,26 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, case Instruction::Store: ImmIdx = 0; break; + case Instruction::ICmp: + // This is an imperfect hack to prevent constant hoisting of + // compares that might be trying to check if a 64-bit value fits in + // 32-bits. The backend can optimize these cases using a right shift by 32. + // Ideally we would check the compare predicate here. 
There also other + // similar immediates the backend can use shifts for. + if (Idx == 1 && Imm.getBitWidth() == 64) { + uint64_t ImmVal = Imm.getZExtValue(); + if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff) + return TTI::TCC_Free; + } + ImmIdx = 1; + break; + case Instruction::And: + // We support 64-bit ANDs with immediates with 32-bits of leading zeroes + // by using a 32-bit operation with implicit zero extension. Detect such + // immediates here as the normal path expects bit 31 to be sign extended. + if (Idx == 1 && Imm.getBitWidth() == 64 && isUInt<32>(Imm.getZExtValue())) + return TTI::TCC_Free; + // Fallthrough case Instruction::Add: case Instruction::Sub: case Instruction::Mul: @@ -1045,10 +1226,8 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: - case Instruction::And: case Instruction::Or: case Instruction::Xor: - case Instruction::ICmp: ImmIdx = 1; break; // Always return TCC_Free for the shift value of a shift instruction. @@ -1073,18 +1252,18 @@ unsigned X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, } if (Idx == ImmIdx) { - unsigned NumConstants = (BitSize + 63) / 64; - unsigned Cost = X86TTIImpl::getIntImmCost(Imm, Ty); + int NumConstants = (BitSize + 63) / 64; + int Cost = X86TTIImpl::getIntImmCost(Imm, Ty); return (Cost <= NumConstants * TTI::TCC_Basic) - ? static_cast<unsigned>(TTI::TCC_Free) + ? static_cast<int>(TTI::TCC_Free) : Cost; } return X86TTIImpl::getIntImmCost(Imm, Ty); } -unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, - const APInt &Imm, Type *Ty) { +int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty) { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); @@ -1118,23 +1297,181 @@ unsigned X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, return X86TTIImpl::getIntImmCost(Imm, Ty); } -bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, int Consecutive) { - int DataWidth = DataTy->getPrimitiveSizeInBits(); +// Return an average cost of Gather / Scatter instruction, maybe improved later +int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace) { + + assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost"); + unsigned VF = SrcVTy->getVectorNumElements(); + + // Try to reduce index size from 64 bit (default for GEP) + // to 32. It is essential for VF 16. If the index can't be reduced to 32, the + // operation will use 16 x 64 indices which do not fit in a zmm and needs + // to split. Also check that the base pointer is the same for all lanes, + // and that there's at most one variable index. 
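The ICmp and And cases added to getIntImmCost above steer constant hoisting away from immediates the backend folds anyway: a 64-bit compare against 0x100000000 or 0xffffffff can be checked with a right shift by 32, and a 64-bit AND whose immediate has 32 leading zero bits lowers to a 32-bit AND with implicit zero extension. A free-standing sketch of those two predicates follows; these are plain helper functions written for illustration, not the X86TTIImpl members, and TCC_Free/TCC_Basic are modelled as bare constants.

#include <cstdint>
#include <cstdio>

constexpr int TCC_Free = 0;   // stand-in for TTI::TCC_Free
constexpr int TCC_Basic = 1;  // stand-in for TTI::TCC_Basic

// cf. llvm::isUInt<32>: true when the value has 32 leading zero bits.
static bool fitsInUnsigned32(uint64_t V) { return V <= 0xffffffffULL; }

// 64-bit ICmp: these particular immediates can be tested with a shift by 32,
// so hoisting them into a register would only add pressure.
int icmpImmCost(uint64_t Imm, unsigned BitWidth) {
  if (BitWidth == 64 && (Imm == 0x100000000ULL || Imm == 0xffffffffULL))
    return TCC_Free;
  return TCC_Basic;
}

// 64-bit AND: an immediate with 32 leading zero bits becomes a 32-bit AND
// with implicit zero extension of the result.
int andImmCost(uint64_t Imm, unsigned BitWidth) {
  if (BitWidth == 64 && fitsInUnsigned32(Imm))
    return TCC_Free;
  return TCC_Basic;
}

int main() {
  std::printf("%d %d %d\n", icmpImmCost(0x100000000ULL, 64),
              andImmCost(0x00000000ffff0000ULL, 64),
              andImmCost(0xffff000000000000ULL, 64)); // prints: 0 0 1
  return 0;
}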
+ auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) { + unsigned IndexSize = DL.getPointerSizeInBits(); + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr); + if (IndexSize < 64 || !GEP) + return IndexSize; + + unsigned NumOfVarIndices = 0; + Value *Ptrs = GEP->getPointerOperand(); + if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs)) + return IndexSize; + for (unsigned i = 1; i < GEP->getNumOperands(); ++i) { + if (isa<Constant>(GEP->getOperand(i))) + continue; + Type *IndxTy = GEP->getOperand(i)->getType(); + if (IndxTy->isVectorTy()) + IndxTy = IndxTy->getVectorElementType(); + if ((IndxTy->getPrimitiveSizeInBits() == 64 && + !isa<SExtInst>(GEP->getOperand(i))) || + ++NumOfVarIndices > 1) + return IndexSize; // 64 + } + return (unsigned)32; + }; + + + // Trying to reduce IndexSize to 32 bits for vector 16. + // By default the IndexSize is equal to pointer size. + unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) : + DL.getPointerSizeInBits(); + + Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(), + IndexSize), VF); + std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy); + std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy); + int SplitFactor = std::max(IdxsLT.first, SrcLT.first); + if (SplitFactor > 1) { + // Handle splitting of vector of pointers + Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor); + return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment, + AddressSpace); + } + + // The gather / scatter cost is given by Intel architects. It is a rough + // number since we are looking at one instruction in a time. + const int GSOverhead = 2; + return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); +} + +/// Return the cost of full scalarization of gather / scatter operation. +/// +/// Opcode - Load or Store instruction. +/// SrcVTy - The type of the data vector that should be gathered or scattered. +/// VariableMask - The mask is non-constant at compile time. +/// Alignment - Alignment for one element. +/// AddressSpace - pointer[s] address space. +/// +int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy, + bool VariableMask, unsigned Alignment, + unsigned AddressSpace) { + unsigned VF = SrcVTy->getVectorNumElements(); + + int MaskUnpackCost = 0; + if (VariableMask) { + VectorType *MaskTy = + VectorType::get(Type::getInt1Ty(getGlobalContext()), VF); + MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true); + int ScalarCompareCost = + getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()), + nullptr); + int BranchCost = getCFInstrCost(Instruction::Br); + MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); + } - // Todo: AVX512 allows gather/scatter, works with strided and random as well - if ((DataWidth < 32) || (Consecutive == 0)) + // The cost of the scalar loads/stores. 
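getGSVectorCost above models a hardware gather or scatter as a small fixed overhead plus one scalar memory cost per lane, and multiplies the whole thing out when type legalization forces the index or data vector to be split. The toy below reproduces only that recursion; the split factor is passed in directly rather than derived from type legalization, and the overhead of 2 follows the rough figure quoted in the hunk. It is an illustration of the cost shape, not the LLVM implementation.

#include <cstdio>

// Toy model: cost of a hardware gather/scatter over VF lanes, where
// SplitFactor says how many legal registers the index/data vectors need.
int gsVectorCost(unsigned VF, int SplitFactor, int ScalarMemCost) {
  if (SplitFactor > 1)
    // Vector must be split: price each part and add them up.
    return SplitFactor * gsVectorCost(VF / SplitFactor, 1, ScalarMemCost);
  const int GSOverhead = 2; // rough per-instruction figure used above
  return GSOverhead + static_cast<int>(VF) * ScalarMemCost;
}

int main() {
  // A 16-lane gather that legalizes into two halves vs. one that does not.
  std::printf("%d vs %d\n", gsVectorCost(16, 2, 1), gsVectorCost(16, 1, 1));
  return 0;
}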
+ int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), + Alignment, AddressSpace); + + int InsertExtractCost = 0; + if (Opcode == Instruction::Load) + for (unsigned i = 0; i < VF; ++i) + // Add the cost of inserting each scalar load into the vector + InsertExtractCost += + getVectorInstrCost(Instruction::InsertElement, SrcVTy, i); + else + for (unsigned i = 0; i < VF; ++i) + // Add the cost of extracting each element out of the data vector + InsertExtractCost += + getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i); + + return MemoryOpCost + MaskUnpackCost + InsertExtractCost; +} + +/// Calculate the cost of Gather / Scatter operation +int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, + Value *Ptr, bool VariableMask, + unsigned Alignment) { + assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter"); + unsigned VF = SrcVTy->getVectorNumElements(); + PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); + if (!PtrTy && Ptr->getType()->isVectorTy()) + PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType()); + assert(PtrTy && "Unexpected type for Ptr argument"); + unsigned AddressSpace = PtrTy->getAddressSpace(); + + bool Scalarize = false; + if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) || + (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy))) + Scalarize = true; + // Gather / Scatter for vector 2 is not profitable on KNL / SKX + // Vector-4 of gather/scatter instruction does not exist on KNL. + // We can extend it to 8 elements, but zeroing upper bits of + // the mask vector will add more instructions. Right now we give the scalar + // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is + // better in the VariableMask case. + if (VF == 2 || (VF == 4 && !ST->hasVLX())) + Scalarize = true; + + if (Scalarize) + return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace); + + return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); +} + +bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { + Type *ScalarTy = DataTy->getScalarType(); + int DataWidth = isa<PointerType>(ScalarTy) ? + DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); + + return (DataWidth >= 32 && ST->hasAVX2()); +} + +bool X86TTIImpl::isLegalMaskedStore(Type *DataType) { + return isLegalMaskedLoad(DataType); +} + +bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { + // This function is called now in two cases: from the Loop Vectorizer + // and from the Scalarizer. + // When the Loop Vectorizer asks about legality of the feature, + // the vectorization factor is not calculated yet. The Loop Vectorizer + // sends a scalar type and the decision is based on the width of the + // scalar element. + // Later on, the cost model will estimate usage this intrinsic based on + // the vector type. + // The Scalarizer asks again about legality. It sends a vector type. + // In this case we can reject non-power-of-2 vectors. + if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements())) return false; - if (ST->hasAVX512() || ST->hasAVX2()) - return true; - return false; + Type *ScalarTy = DataTy->getScalarType(); + int DataWidth = isa<PointerType>(ScalarTy) ? 
+ DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); + + // AVX-512 allows gather and scatter + return DataWidth >= 32 && ST->hasAVX512(); } -bool X86TTIImpl::isLegalMaskedStore(Type *DataType, int Consecutive) { - return isLegalMaskedLoad(DataType, Consecutive); +bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { + return isLegalMaskedGather(DataType); } -bool X86TTIImpl::hasCompatibleFunctionAttributes(const Function *Caller, - const Function *Callee) const { +bool X86TTIImpl::areInlineCompatible(const Function *Caller, + const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); // Work this as a subsetting of subtarget features. diff --git a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h index da3f36c..adb745e 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/contrib/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -33,13 +33,13 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> { const X86Subtarget *ST; const X86TargetLowering *TLI; - unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); + int getScalarizationOverhead(Type *Ty, bool Insert, bool Extract); const X86Subtarget *getST() const { return ST; } const X86TargetLowering *getTLI() const { return TLI; } public: - explicit X86TTIImpl(const X86TargetMachine *TM, Function &F) + explicit X86TTIImpl(const X86TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} @@ -62,38 +62,44 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); unsigned getMaxInterleaveFactor(unsigned VF); - unsigned getArithmeticInstrCost( + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None); - unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, - Type *SubTp); - unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); - unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); - unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); - unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, - unsigned AddressSpace); - - unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex); - - unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); - - unsigned getIntImmCost(int64_t); - - unsigned getIntImmCost(const APInt &Imm, Type *Ty); - - unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, - Type *Ty); - unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, - Type *Ty); - bool isLegalMaskedLoad(Type *DataType, int Consecutive); - bool isLegalMaskedStore(Type *DataType, int Consecutive); - bool hasCompatibleFunctionAttributes(const Function *Caller, - const Function *Callee) const; + int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + 
unsigned AddressSpace); + int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, + bool VariableMask, unsigned Alignment); + int getAddressComputationCost(Type *PtrTy, bool IsComplex); + + int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + + int getIntImmCost(int64_t); + + int getIntImmCost(const APInt &Imm, Type *Ty); + + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); + int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, + Type *Ty); + bool isLegalMaskedLoad(Type *DataType); + bool isLegalMaskedStore(Type *DataType); + bool isLegalMaskedGather(Type *DataType); + bool isLegalMaskedScatter(Type *DataType); + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const; +private: + int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, + unsigned Alignment, unsigned AddressSpace); + int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr, + unsigned Alignment, unsigned AddressSpace); /// @} }; diff --git a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp index 9190d0b..dce94a9 100644 --- a/contrib/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/contrib/llvm/lib/Target/X86/X86WinEHState.cpp @@ -15,7 +15,8 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "llvm/Analysis/LibCallSemantics.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/WinEHFuncInfo.h" @@ -38,12 +39,16 @@ using namespace llvm::PatternMatch; #define DEBUG_TYPE "winehstate" +namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); } + namespace { class WinEHStatePass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. - WinEHStatePass() : FunctionPass(ID) {} + WinEHStatePass() : FunctionPass(ID) { + initializeWinEHStatePassPass(*PassRegistry::getPassRegistry()); + } bool runOnFunction(Function &Fn) override; @@ -62,18 +67,13 @@ private: void linkExceptionRegistration(IRBuilder<> &Builder, Function *Handler); void unlinkExceptionRegistration(IRBuilder<> &Builder); - void addCXXStateStores(Function &F, MachineModuleInfo &MMI); - void addSEHStateStores(Function &F, MachineModuleInfo &MMI); - void addCXXStateStoresToFunclet(Value *ParentRegNode, WinEHFuncInfo &FuncInfo, - Function &F, int BaseState); + void addStateStores(Function &F, WinEHFuncInfo &FuncInfo); void insertStateNumberStore(Value *ParentRegNode, Instruction *IP, int State); Value *emitEHLSDA(IRBuilder<> &Builder, Function *F); Function *generateLSDAInEAXThunk(Function *ParentFunc); - int escapeRegNode(Function &F); - // Module-level type getters. 
Type *getEHLinkRegistrationType(); Type *getSEHRegistrationType(); @@ -111,6 +111,9 @@ FunctionPass *llvm::createX86WinEHStatePass() { return new WinEHStatePass(); } char WinEHStatePass::ID = 0; +INITIALIZE_PASS(WinEHStatePass, "x86-winehstate", + "Insert stores for EH state numbers", false, false) + bool WinEHStatePass::doInitialization(Module &M) { TheModule = &M; FrameEscape = Intrinsic::getDeclaration(TheModule, Intrinsic::localescape); @@ -138,14 +141,7 @@ void WinEHStatePass::getAnalysisUsage(AnalysisUsage &AU) const { } bool WinEHStatePass::runOnFunction(Function &F) { - // If this is an outlined handler, don't do anything. We'll do state insertion - // for it in the parent. - StringRef WinEHParentName = - F.getFnAttribute("wineh-parent").getValueAsString(); - if (WinEHParentName != F.getName() && !WinEHParentName.empty()) - return false; - - // Check the personality. Do nothing if this is not an MSVC personality. + // Check the personality. Do nothing if this personality doesn't use funclets. if (!F.hasPersonalityFn()) return false; PersonalityFn = @@ -153,7 +149,19 @@ bool WinEHStatePass::runOnFunction(Function &F) { if (!PersonalityFn) return false; Personality = classifyEHPersonality(PersonalityFn); - if (!isMSVCEHPersonality(Personality)) + if (!isFuncletEHPersonality(Personality)) + return false; + + // Skip this function if there are no EH pads and we aren't using IR-level + // outlining. + bool HasPads = false; + for (BasicBlock &BB : F) { + if (BB.isEHPad()) { + HasPads = true; + break; + } + } + if (!HasPads) return false; // Disable frame pointer elimination in this function. @@ -163,14 +171,13 @@ bool WinEHStatePass::runOnFunction(Function &F) { emitExceptionRegistrationRecord(&F); - auto *MMIPtr = getAnalysisIfAvailable<MachineModuleInfo>(); - assert(MMIPtr && "MachineModuleInfo should always be available"); - MachineModuleInfo &MMI = *MMIPtr; - switch (Personality) { - default: llvm_unreachable("unexpected personality function"); - case EHPersonality::MSVC_CXX: addCXXStateStores(F, MMI); break; - case EHPersonality::MSVC_X86SEH: addSEHStateStores(F, MMI); break; - } + // The state numbers calculated here in IR must agree with what we calculate + // later on for the MachineFunction. In particular, if an IR pass deletes an + // unreachable EH pad after this point before machine CFG construction, we + // will be in trouble. If this assumption is ever broken, we should turn the + // numbers into an immutable analysis pass. + WinEHFuncInfo FuncInfo; + addStateStores(F, FuncInfo); // Reset per-function state. PersonalityFn = nullptr; @@ -261,7 +268,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); // TryLevel = -1 StateFieldIndex = 2; - insertStateNumberStore(RegNode, Builder.GetInsertPoint(), -1); + insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(), -1); // Handler = __ehhandler$F Function *Trampoline = generateLSDAInEAXThunk(F); Link = Builder.CreateStructGEP(RegNodeTy, RegNode, 1); @@ -278,7 +285,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { Builder.CreateStore(SP, Builder.CreateStructGEP(RegNodeTy, RegNode, 0)); // TryLevel = -2 / -1 StateFieldIndex = 4; - insertStateNumberStore(RegNode, Builder.GetInsertPoint(), + insertStateNumberStore(RegNode, &*Builder.GetInsertPoint(), UseStackGuard ? 
-2 : -1); // ScopeTable = llvm.x86.seh.lsda(F) Value *FI8 = Builder.CreateBitCast(F, Int8PtrType); @@ -347,7 +354,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { Value *CastPersonality = Builder.CreateBitCast(PersonalityFn, TargetFuncTy->getPointerTo()); auto AI = Trampoline->arg_begin(); - Value *Args[5] = {LSDA, AI++, AI++, AI++, AI++}; + Value *Args[5] = {LSDA, &*AI++, &*AI++, &*AI++, &*AI++}; CallInst *Call = Builder.CreateCall(CastPersonality, Args); // Can't use musttail due to prototype mismatch, but we can use tail. Call->setTailCall(true); @@ -391,160 +398,53 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) { Builder.CreateStore(Next, FSZero); } -void WinEHStatePass::addCXXStateStores(Function &F, MachineModuleInfo &MMI) { - WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(&F); - calculateWinCXXEHStateNumbers(&F, FuncInfo); - - // The base state for the parent is -1. - addCXXStateStoresToFunclet(RegNode, FuncInfo, F, -1); - - // Set up RegNodeEscapeIndex - int RegNodeEscapeIndex = escapeRegNode(F); - FuncInfo.EHRegNodeEscapeIndex = RegNodeEscapeIndex; - - // Only insert stores in catch handlers. - Constant *FI8 = - ConstantExpr::getBitCast(&F, Type::getInt8PtrTy(TheModule->getContext())); - for (auto P : FuncInfo.HandlerBaseState) { - Function *Handler = const_cast<Function *>(P.first); - int BaseState = P.second; - IRBuilder<> Builder(&Handler->getEntryBlock(), - Handler->getEntryBlock().begin()); - // FIXME: Find and reuse such a call if present. - Value *ParentFP = Builder.CreateCall(FrameAddress, {Builder.getInt32(1)}); - Value *RecoveredRegNode = Builder.CreateCall( - FrameRecover, {FI8, ParentFP, Builder.getInt32(RegNodeEscapeIndex)}); - RecoveredRegNode = - Builder.CreateBitCast(RecoveredRegNode, RegNodeTy->getPointerTo(0)); - addCXXStateStoresToFunclet(RecoveredRegNode, FuncInfo, *Handler, BaseState); - } -} - -/// Escape RegNode so that we can access it from child handlers. Find the call -/// to localescape, if any, in the entry block and append RegNode to the list -/// of arguments. -int WinEHStatePass::escapeRegNode(Function &F) { - // Find the call to localescape and extract its arguments. - IntrinsicInst *EscapeCall = nullptr; - for (Instruction &I : F.getEntryBlock()) { - IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); - if (II && II->getIntrinsicID() == Intrinsic::localescape) { - EscapeCall = II; - break; - } - } - SmallVector<Value *, 8> Args; - if (EscapeCall) { - auto Ops = EscapeCall->arg_operands(); - Args.append(Ops.begin(), Ops.end()); - } - Args.push_back(RegNode); - - // Replace the call (if it exists) with new one. Otherwise, insert at the end - // of the entry block. - Instruction *InsertPt = EscapeCall; - if (!EscapeCall) - InsertPt = F.getEntryBlock().getTerminator(); - IRBuilder<> Builder(&F.getEntryBlock(), InsertPt); - Builder.CreateCall(FrameEscape, Args); - if (EscapeCall) - EscapeCall->eraseFromParent(); - return Args.size() - 1; -} +void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { + // Mark the registration node. The backend needs to know which alloca it is so + // that it can recover the original frame pointer. + IRBuilder<> Builder(RegNode->getParent(), std::next(RegNode->getIterator())); + Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getInt8PtrTy()); + Builder.CreateCall( + Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode), + {RegNodeI8}); + + // Calculate state numbers. 
+ if (isAsynchronousEHPersonality(Personality)) + calculateSEHStateNumbers(&F, FuncInfo); + else + calculateWinCXXEHStateNumbers(&F, FuncInfo); -void WinEHStatePass::addCXXStateStoresToFunclet(Value *ParentRegNode, - WinEHFuncInfo &FuncInfo, - Function &F, int BaseState) { // Iterate all the instructions and emit state number stores. + DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(F); for (BasicBlock &BB : F) { + // Figure out what state we should assign calls in this block. + int BaseState = -1; + auto &BBColors = BlockColors[&BB]; + + assert(BBColors.size() == 1 && + "multi-color BB not removed by preparation"); + BasicBlock *FuncletEntryBB = BBColors.front(); + if (auto *FuncletPad = + dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI())) { + auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad); + if (BaseStateI != FuncInfo.FuncletBaseStateMap.end()) + BaseState = BaseStateI->second; + } + for (Instruction &I : BB) { if (auto *CI = dyn_cast<CallInst>(&I)) { // Possibly throwing call instructions have no actions to take after // an unwind. Ensure they are in the -1 state. if (CI->doesNotThrow()) continue; - insertStateNumberStore(ParentRegNode, CI, BaseState); + insertStateNumberStore(RegNode, CI, BaseState); } else if (auto *II = dyn_cast<InvokeInst>(&I)) { // Look up the state number of the landingpad this unwinds to. - LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst(); - // FIXME: Why does this assertion fail? - //assert(FuncInfo.LandingPadStateMap.count(LPI) && "LP has no state!"); - int State = FuncInfo.LandingPadStateMap[LPI]; - insertStateNumberStore(ParentRegNode, II, State); - } - } - } -} - -/// Assign every distinct landingpad a unique state number for SEH. Unlike C++ -/// EH, we can use this very simple algorithm while C++ EH cannot because catch -/// handlers aren't outlined and the runtime doesn't have to figure out which -/// catch handler frame to unwind to. -/// FIXME: __finally blocks are outlined, so this approach may break down there. -void WinEHStatePass::addSEHStateStores(Function &F, MachineModuleInfo &MMI) { - WinEHFuncInfo &FuncInfo = MMI.getWinEHFuncInfo(&F); - - // Remember and return the index that we used. We save it in WinEHFuncInfo so - // that we can lower llvm.x86.seh.recoverfp later in filter functions without - // too much trouble. - int RegNodeEscapeIndex = escapeRegNode(F); - FuncInfo.EHRegNodeEscapeIndex = RegNodeEscapeIndex; - - // Iterate all the instructions and emit state number stores. - int CurState = 0; - SmallPtrSet<BasicBlock *, 4> ExceptBlocks; - for (BasicBlock &BB : F) { - for (auto I = BB.begin(), E = BB.end(); I != E; ++I) { - if (auto *CI = dyn_cast<CallInst>(I)) { - auto *Intrin = dyn_cast<IntrinsicInst>(CI); - if (Intrin) { - // Calls that "don't throw" are considered to be able to throw asynch - // exceptions, but intrinsics cannot. - continue; - } - insertStateNumberStore(RegNode, CI, -1); - } else if (auto *II = dyn_cast<InvokeInst>(I)) { - // Look up the state number of the landingpad this unwinds to. - LandingPadInst *LPI = II->getUnwindDest()->getLandingPadInst(); - auto InsertionPair = - FuncInfo.LandingPadStateMap.insert(std::make_pair(LPI, CurState)); - auto Iter = InsertionPair.first; - int &State = Iter->second; - bool Inserted = InsertionPair.second; - if (Inserted) { - // Each action consumes a state number. 
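The loop in addStateStores above gives every potentially-throwing call the base state of the funclet that contains its block, skips nounwind calls, and gives every invoke the state recorded for its unwind edge in InvokeStateMap. The toy below models just that assignment rule; Call, Block, and the base-state map are made-up structures standing in for the IR types and WinEHFuncInfo, not the pass itself.

#include <cstdio>
#include <map>
#include <vector>

struct Call {
  bool IsInvoke;
  bool DoesNotThrow;
  int InvokeState; // meaningful only when IsInvoke is true
};
struct Block {
  int FuncletEntry; // id of the funclet entry block this block belongs to
  std::vector<Call> Calls;
};

// Returns true and sets State when a state store must be emitted before C.
bool stateForCall(const Call &C, const Block &B,
                  const std::map<int, int> &FuncletBaseState, int &State) {
  if (!C.IsInvoke && C.DoesNotThrow)
    return false; // nounwind calls need no store
  if (C.IsInvoke) {
    State = C.InvokeState; // state of the unwind destination
    return true;
  }
  auto It = FuncletBaseState.find(B.FuncletEntry);
  State = It != FuncletBaseState.end() ? It->second : -1; // -1 = parent frame
  return true;
}

int main() {
  std::map<int, int> FuncletBaseState;
  FuncletBaseState[7] = 2;               // a funclet entered at block 7
  Block Parent{0, {{false, false, 0}}};  // plain call in the parent function
  Block Handler{7, {{false, false, 0}}}; // plain call inside the funclet
  int S1 = 0, S2 = 0;
  stateForCall(Parent.Calls[0], Parent, FuncletBaseState, S1);
  stateForCall(Handler.Calls[0], Handler, FuncletBaseState, S2);
  std::printf("%d %d\n", S1, S2); // prints: -1 2
  return 0;
}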
- auto *EHActions = cast<IntrinsicInst>(LPI->getNextNode()); - SmallVector<std::unique_ptr<ActionHandler>, 4> ActionList; - parseEHActions(EHActions, ActionList); - assert(!ActionList.empty()); - CurState += ActionList.size(); - State += ActionList.size() - 1; - - // Remember all the __except block targets. - for (auto &Handler : ActionList) { - if (auto *CH = dyn_cast<CatchHandler>(Handler.get())) { - auto *BA = cast<BlockAddress>(CH->getHandlerBlockOrFunc()); -#ifndef NDEBUG - for (BasicBlock *Pred : predecessors(BA->getBasicBlock())) - assert(Pred->isLandingPad() && - "WinEHPrepare failed to split block"); -#endif - ExceptBlocks.insert(BA->getBasicBlock()); - } - } - } + assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!"); + int State = FuncInfo.InvokeStateMap[II]; insertStateNumberStore(RegNode, II, State); } } } - - // Insert llvm.x86.seh.restoreframe() into each __except block. - Function *RestoreFrame = - Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_restoreframe); - for (BasicBlock *ExceptBB : ExceptBlocks) { - IRBuilder<> Builder(ExceptBB->begin()); - Builder.CreateCall(RestoreFrame, {}); - } } void WinEHStatePass::insertStateNumberStore(Value *ParentRegNode, diff --git a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp index 2e44ac9..aaf267a 100644 --- a/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp +++ b/contrib/llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp @@ -224,7 +224,7 @@ static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { if (Val > 11) return MCDisassembler::Fail; - static unsigned Values[] = { + static const unsigned Values[] = { 32 /*bpw*/, 1, 2, 3, 4, 5, 6, 7, 8, 16, 24, 32 }; Inst.addOperand(MCOperand::createImm(Values[Val])); diff --git a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h index 6fd2dec..dc513f7 100644 --- a/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h +++ b/contrib/llvm/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h @@ -19,8 +19,6 @@ namespace llvm { -class TargetMachine; - class XCoreInstPrinter : public MCInstPrinter { public: XCoreInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, diff --git a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp index 702056d..b00cdd5 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -115,14 +115,14 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { EmitSpecialLLVMGlobal(GV)) return; - const DataLayout *TD = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); OutStreamer->SwitchSection( getObjFileLowering().SectionForGlobal(GV, *Mang, TM)); MCSymbol *GVSym = getSymbol(GV); const Constant *C = GV->getInitializer(); - unsigned Align = (unsigned)TD->getPreferredTypeAlignmentShift(C->getType()); - + unsigned Align = (unsigned)DL.getPreferredTypeAlignmentShift(C->getType()); + // Mark the start of the global getTargetStreamer().emitCCTopData(GVSym->getName()); @@ -154,15 +154,15 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { if (GV->isThreadLocal()) { report_fatal_error("TLS is not supported by this target!"); } - unsigned Size = TD->getTypeAllocSize(C->getType()); + unsigned Size = DL.getTypeAllocSize(C->getType()); if (MAI->hasDotTypeDotSizeDirective()) { 
OutStreamer->EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject); OutStreamer->emitELFSize(cast<MCSymbolELF>(GVSym), MCConstantExpr::create(Size, OutContext)); } OutStreamer->EmitLabel(GVSym); - - EmitGlobalConstant(C); + + EmitGlobalConstant(DL, C); // The ABI requires that unsigned scalar types smaller than 32 bits // are padded to 32 bits. if (Size < 4) @@ -208,7 +208,7 @@ printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O, void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum, raw_ostream &O) { - const DataLayout *DL = TM.getDataLayout(); + const DataLayout &DL = getDataLayout(); const MachineOperand &MO = MI->getOperand(opNum); switch (MO.getType()) { case MachineOperand::MO_Register: @@ -224,8 +224,8 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum, getSymbol(MO.getGlobal())->print(O, MAI); break; case MachineOperand::MO_ConstantPoolIndex: - O << DL->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() - << '_' << MO.getIndex(); + O << DL.getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_' + << MO.getIndex(); break; case MachineOperand::MO_BlockAddress: GetBlockAddressSymbol(MO.getBlockAddress())->print(O, MAI); diff --git a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp index 76c3d81..ae493de 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreFrameLowering.cpp @@ -160,27 +160,26 @@ static void GetSpillList(SmallVectorImpl<StackSlotInfo> &SpillList, /// As offsets are negative, the largest offsets will be first. static void GetEHSpillList(SmallVectorImpl<StackSlotInfo> &SpillList, MachineFrameInfo *MFI, XCoreFunctionInfo *XFI, + const Constant *PersonalityFn, const TargetLowering *TL) { assert(XFI->hasEHSpillSlot() && "There are no EH register spill slots"); - const int* EHSlot = XFI->getEHSpillSlot(); - SpillList.push_back(StackSlotInfo(EHSlot[0], - MFI->getObjectOffset(EHSlot[0]), - TL->getExceptionPointerRegister())); - SpillList.push_back(StackSlotInfo(EHSlot[0], - MFI->getObjectOffset(EHSlot[1]), - TL->getExceptionSelectorRegister())); + const int *EHSlot = XFI->getEHSpillSlot(); + SpillList.push_back( + StackSlotInfo(EHSlot[0], MFI->getObjectOffset(EHSlot[0]), + TL->getExceptionPointerRegister(PersonalityFn))); + SpillList.push_back( + StackSlotInfo(EHSlot[0], MFI->getObjectOffset(EHSlot[1]), + TL->getExceptionSelectorRegister(PersonalityFn))); std::sort(SpillList.begin(), SpillList.end(), CompareSSIOffset); } - static MachineMemOperand * getFrameIndexMMO(MachineBasicBlock &MBB, int FrameIndex, unsigned flags) { MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex), - flags, MFI.getObjectSize(FrameIndex), - MFI.getObjectAlignment(FrameIndex)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FrameIndex), flags, + MFI.getObjectSize(FrameIndex), MFI.getObjectAlignment(FrameIndex)); return MMO; } @@ -323,8 +322,11 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF, if (XFI->hasEHSpillSlot()) { // The unwinder requires stack slot & CFI offsets for the exception info. // We do not save/spill these registers. - SmallVector<StackSlotInfo,2> SpillList; - GetEHSpillList(SpillList, MFI, XFI, + const Function *Fn = MF.getFunction(); + const Constant *PersonalityFn = + Fn->hasPersonalityFn() ? 
Fn->getPersonalityFn() : nullptr; + SmallVector<StackSlotInfo, 2> SpillList; + GetEHSpillList(SpillList, MFI, XFI, PersonalityFn, MF.getSubtarget().getTargetLowering()); assert(SpillList.size()==2 && "Unexpected SpillList size"); EmitCfiOffset(MBB, MBBI, dl, TII, MMI, @@ -355,8 +357,12 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF, if (RetOpcode == XCore::EH_RETURN) { // 'Restore' the exception info the unwinder has placed into the stack // slots. - SmallVector<StackSlotInfo,2> SpillList; - GetEHSpillList(SpillList, MFI, XFI, MF.getSubtarget().getTargetLowering()); + const Function *Fn = MF.getFunction(); + const Constant *PersonalityFn = + Fn->hasPersonalityFn() ? Fn->getPersonalityFn() : nullptr; + SmallVector<StackSlotInfo, 2> SpillList; + GetEHSpillList(SpillList, MFI, XFI, PersonalityFn, + MF.getSubtarget().getTargetLowering()); RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList); // Return to the landing pad. diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp index 9d4a966..9f61c84 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp @@ -151,8 +151,9 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) { MVT::Other, CPIdx, CurDAG->getEntryNode()); MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = MF->getMachineMemOperand( - MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, 4, 4); + MemOp[0] = + MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), + MachineMemOperand::MOLoad, 4, 4); cast<MachineSDNode>(node)->setMemRefs(MemOp, MemOp + 1); return node; } diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp index d62e742..105b2cf 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.cpp @@ -79,9 +79,6 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM, // Compute derived properties from the register classes computeRegisterProperties(Subtarget.getRegisterInfo()); - // Division is expensive - setIntDivIsCheap(false); - setStackPointerRegisterToSaveRestore(XCore::SP); setSchedulingPreference(Sched::Source); @@ -154,8 +151,6 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM, // Exception handling setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); - setExceptionPointerRegister(XCore::R0); - setExceptionSelectorRegister(XCore::R1); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); // Atomic operations @@ -839,7 +834,7 @@ LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); return DAG.getLoad( getPointerTy(DAG.getDataLayout()), SDLoc(Op), DAG.getEntryNode(), FIN, - MachinePointerInfo::getFixedStack(FI), false, false, false, 0); + MachinePointerInfo::getFixedStack(MF, FI), false, false, false, 0); } SDValue XCoreTargetLowering:: @@ -1367,8 +1362,8 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain, //from this parameter SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); ArgIn = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, - MachinePointerInfo::getFixedStack(FI), - false, false, false, 0); + MachinePointerInfo::getFixedStack(MF, FI), false, + false, false, 0); } const ArgDataPair ADP = { ArgIn, Ins[i].Flags }; ArgData.push_back(ADP); @@ -1517,9 +1512,10 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, // Create a SelectionDAG node corresponding to a 
store // to this memory location. SDValue FIN = DAG.getFrameIndex(FI, MVT::i32); - MemOpChains.push_back(DAG.getStore(Chain, dl, OutVals[i], FIN, - MachinePointerInfo::getFixedStack(FI), false, false, - 0)); + MemOpChains.push_back(DAG.getStore( + Chain, dl, OutVals[i], FIN, + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, + false, 0)); } // Transform all store nodes into one single node because @@ -1567,8 +1563,7 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, // to set, the condition code register to branch on, the true/false values to // select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = BB; - ++It; + MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... @@ -1828,9 +1823,8 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N, SDValue Chain = ST->getChain(); unsigned StoreBits = ST->getMemoryVT().getStoreSizeInBits(); - if (StoreBits % 8) { - break; - } + assert((StoreBits % 8) == 0 && + "Store size in bits must be a multiple of 8"); unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment( ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext())); unsigned Alignment = ST->getAlignment(); diff --git a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h index ddd675c..b6f09ff 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h +++ b/contrib/llvm/lib/Target/XCore/XCoreISelLowering.h @@ -125,6 +125,20 @@ namespace llvm { bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; + /// If a physical register, this returns the register that receives the + /// exception address on entry to an EH pad. + unsigned + getExceptionPointerRegister(const Constant *PersonalityFn) const override { + return XCore::R0; + } + + /// If a physical register, this returns the register that receives the + /// exception typeid on entry to a landing pad. 
+ unsigned + getExceptionSelectorRegister(const Constant *PersonalityFn) const override { + return XCore::R1; + } + private: const TargetMachine &TM; const XCoreSubtarget &Subtarget; diff --git a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp index ee30344..e4129ae 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreInstrInfo.cpp @@ -368,11 +368,10 @@ void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, DL = I->getDebugLoc(); MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex), - MachineMemOperand::MOStore, - MFI.getObjectSize(FrameIndex), - MFI.getObjectAlignment(FrameIndex)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FrameIndex), + MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex), + MFI.getObjectAlignment(FrameIndex)); BuildMI(MBB, I, DL, get(XCore::STWFI)) .addReg(SrcReg, getKillRegState(isKill)) .addFrameIndex(FrameIndex) @@ -391,11 +390,10 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, DL = I->getDebugLoc(); MachineFunction *MF = MBB.getParent(); const MachineFrameInfo &MFI = *MF->getFrameInfo(); - MachineMemOperand *MMO = - MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIndex), - MachineMemOperand::MOLoad, - MFI.getObjectSize(FrameIndex), - MFI.getObjectAlignment(FrameIndex)); + MachineMemOperand *MMO = MF->getMachineMemOperand( + MachinePointerInfo::getFixedStack(*MF, FrameIndex), + MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex), + MFI.getObjectAlignment(FrameIndex)); BuildMI(MBB, I, DL, get(XCore::LDWFI), DestReg) .addFrameIndex(FrameIndex) .addImm(0) diff --git a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp index 996c6f5..f0b7201 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp @@ -228,12 +228,9 @@ bool XCoreLowerThreadLocal::runOnModule(Module &M) { // Find thread local globals. 
bool MadeChange = false; SmallVector<GlobalVariable *, 16> ThreadLocalGlobals; - for (Module::global_iterator GVI = M.global_begin(), E = M.global_end(); - GVI != E; ++GVI) { - GlobalVariable *GV = GVI; - if (GV->isThreadLocal()) - ThreadLocalGlobals.push_back(GV); - } + for (GlobalVariable &GV : M.globals()) + if (GV.isThreadLocal()) + ThreadLocalGlobals.push_back(&GV); for (unsigned I = 0, E = ThreadLocalGlobals.size(); I != E; ++I) { MadeChange |= lowerGlobal(ThreadLocalGlobals[I]); } diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp index 9ef9752..6c77096 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp @@ -1,4 +1,4 @@ -//===-- XCoreMachineFuctionInfo.cpp - XCore machine function info ---------===// +//===-- XCoreMachineFunctionInfo.cpp - XCore machine function info --------===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h index 078ffde..cdcc52f 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.h @@ -1,4 +1,4 @@ -//===-- XCoreMachineFuctionInfo.h - XCore machine function info -*- C++ -*-===// +//===- XCoreMachineFunctionInfo.h - XCore machine function info -*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp index f420081..4a79dac 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreTargetMachine.cpp @@ -85,7 +85,7 @@ extern "C" void LLVMInitializeXCoreTarget() { } TargetIRAnalysis XCoreTargetMachine::getTargetIRAnalysis() { - return TargetIRAnalysis([this](Function &F) { + return TargetIRAnalysis([this](const Function &F) { return TargetTransformInfo(XCoreTTIImpl(this, F)); }); } diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp index b5a9905..aa16ecc 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp +++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp @@ -123,18 +123,21 @@ XCoreTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, if (Kind.isMergeableConst16()) return MergeableConst16Section; } Type *ObjType = GV->getType()->getPointerElementType(); + auto &DL = GV->getParent()->getDataLayout(); if (TM.getCodeModel() == CodeModel::Small || !ObjType->isSized() || - TM.getDataLayout()->getTypeAllocSize(ObjType) < CodeModelLargeSize) { + DL.getTypeAllocSize(ObjType) < CodeModelLargeSize) { if (Kind.isReadOnly()) return UseCPRel? ReadOnlySection : DataRelROSection; if (Kind.isBSS() || Kind.isCommon())return BSSSection; - if (Kind.isDataRel()) return DataSection; + if (Kind.isData()) + return DataSection; if (Kind.isReadOnlyWithRel()) return DataRelROSection; } else { if (Kind.isReadOnly()) return UseCPRel? 
ReadOnlySectionLarge : DataRelROSectionLarge; if (Kind.isBSS() || Kind.isCommon())return BSSSectionLarge; - if (Kind.isDataRel()) return DataSectionLarge; + if (Kind.isData()) + return DataSectionLarge; if (Kind.isReadOnlyWithRel()) return DataRelROSectionLarge; } @@ -142,9 +145,8 @@ XCoreTargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV, report_fatal_error("Target does not support TLS or Common sections"); } -MCSection * -XCoreTargetObjectFile::getSectionForConstant(SectionKind Kind, - const Constant *C) const { +MCSection *XCoreTargetObjectFile::getSectionForConstant( + const DataLayout &DL, SectionKind Kind, const Constant *C) const { if (Kind.isMergeableConst4()) return MergeableConst4Section; if (Kind.isMergeableConst8()) return MergeableConst8Section; if (Kind.isMergeableConst16()) return MergeableConst16Section; diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h index 2a5ac23..6701c66 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h +++ b/contrib/llvm/lib/Target/XCore/XCoreTargetObjectFile.h @@ -33,7 +33,7 @@ static const unsigned CodeModelLargeSize = 256; Mangler &Mang, const TargetMachine &TM) const override; - MCSection *getSectionForConstant(SectionKind Kind, + MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind, const Constant *C) const override; }; } // end namespace llvm diff --git a/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h b/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h index e23aef3..b2cb889 100644 --- a/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h +++ b/contrib/llvm/lib/Target/XCore/XCoreTargetTransformInfo.h @@ -37,7 +37,7 @@ class XCoreTTIImpl : public BasicTTIImplBase<XCoreTTIImpl> { const XCoreTargetLowering *getTLI() const { return TLI; } public: - explicit XCoreTTIImpl(const XCoreTargetMachine *TM, Function &F) + explicit XCoreTTIImpl(const XCoreTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl()), TLI(ST->getTargetLowering()) {} |